1 | //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Custom DAG lowering for SI |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "SIISelLowering.h" |
15 | #include "AMDGPU.h" |
16 | #include "AMDGPUInstrInfo.h" |
17 | #include "AMDGPUTargetMachine.h" |
18 | #include "GCNSubtarget.h" |
19 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
20 | #include "SIMachineFunctionInfo.h" |
21 | #include "SIRegisterInfo.h" |
22 | #include "llvm/ADT/APInt.h" |
23 | #include "llvm/ADT/FloatingPointMode.h" |
24 | #include "llvm/ADT/Statistic.h" |
25 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
26 | #include "llvm/Analysis/UniformityAnalysis.h" |
27 | #include "llvm/BinaryFormat/ELF.h" |
28 | #include "llvm/CodeGen/Analysis.h" |
29 | #include "llvm/CodeGen/ByteProvider.h" |
30 | #include "llvm/CodeGen/FunctionLoweringInfo.h" |
31 | #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" |
32 | #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" |
33 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" |
34 | #include "llvm/CodeGen/MachineFrameInfo.h" |
35 | #include "llvm/CodeGen/MachineFunction.h" |
36 | #include "llvm/CodeGen/MachineLoopInfo.h" |
37 | #include "llvm/IR/DiagnosticInfo.h" |
38 | #include "llvm/IR/IRBuilder.h" |
39 | #include "llvm/IR/IntrinsicInst.h" |
40 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
41 | #include "llvm/IR/IntrinsicsR600.h" |
42 | #include "llvm/Support/CommandLine.h" |
43 | #include "llvm/Support/KnownBits.h" |
44 | #include "llvm/Support/ModRef.h" |
45 | #include <optional> |
46 | |
47 | using namespace llvm; |
48 | |
49 | #define DEBUG_TYPE "si-lower" |
50 | |
51 | STATISTIC(NumTailCalls, "Number of tail calls" ); |
52 | |
53 | static cl::opt<bool> DisableLoopAlignment( |
54 | "amdgpu-disable-loop-alignment" , |
55 | cl::desc("Do not align and prefetch loops" ), |
56 | cl::init(Val: false)); |
57 | |
58 | static cl::opt<bool> UseDivergentRegisterIndexing( |
59 | "amdgpu-use-divergent-register-indexing" , |
60 | cl::Hidden, |
61 | cl::desc("Use indirect register addressing for divergent indexes" ), |
62 | cl::init(Val: false)); |
63 | |
64 | static bool denormalModeIsFlushAllF32(const MachineFunction &MF) { |
65 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
66 | return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign(); |
67 | } |
68 | |
69 | static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) { |
70 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
71 | return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign(); |
72 | } |
73 | |
74 | static unsigned findFirstFreeSGPR(CCState &CCInfo) { |
75 | unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); |
76 | for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { |
77 | if (!CCInfo.isAllocated(AMDGPU::Reg: SGPR0 + Reg)) { |
78 | return AMDGPU::SGPR0 + Reg; |
79 | } |
80 | } |
81 | llvm_unreachable("Cannot allocate sgpr" ); |
82 | } |
83 | |
84 | SITargetLowering::SITargetLowering(const TargetMachine &TM, |
85 | const GCNSubtarget &STI) |
86 | : AMDGPUTargetLowering(TM, STI), |
87 | Subtarget(&STI) { |
88 | addRegisterClass(MVT::VT: i1, RC: &AMDGPU::VReg_1RegClass); |
89 | addRegisterClass(MVT::VT: i64, RC: &AMDGPU::SReg_64RegClass); |
90 | |
91 | addRegisterClass(MVT::VT: i32, RC: &AMDGPU::SReg_32RegClass); |
92 | addRegisterClass(MVT::VT: f32, RC: &AMDGPU::VGPR_32RegClass); |
93 | |
94 | addRegisterClass(MVT::VT: v2i32, RC: &AMDGPU::SReg_64RegClass); |
95 | |
96 | const SIRegisterInfo *TRI = STI.getRegisterInfo(); |
97 | const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); |
98 | |
99 | addRegisterClass(MVT::VT: f64, RC: V64RegClass); |
100 | addRegisterClass(MVT::VT: v2f32, RC: V64RegClass); |
101 | addRegisterClass(MVT::VT: Untyped, RC: V64RegClass); |
102 | |
103 | addRegisterClass(MVT::VT: v3i32, RC: &AMDGPU::SGPR_96RegClass); |
104 | addRegisterClass(MVT::VT: v3f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 96)); |
105 | |
106 | addRegisterClass(MVT::VT: v2i64, RC: &AMDGPU::SGPR_128RegClass); |
107 | addRegisterClass(MVT::VT: v2f64, RC: &AMDGPU::SGPR_128RegClass); |
108 | |
109 | addRegisterClass(MVT::VT: v4i32, RC: &AMDGPU::SGPR_128RegClass); |
110 | addRegisterClass(MVT::VT: v4f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 128)); |
111 | |
112 | addRegisterClass(MVT::VT: v5i32, RC: &AMDGPU::SGPR_160RegClass); |
113 | addRegisterClass(MVT::VT: v5f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 160)); |
114 | |
115 | addRegisterClass(MVT::VT: v6i32, RC: &AMDGPU::SGPR_192RegClass); |
116 | addRegisterClass(MVT::VT: v6f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 192)); |
117 | |
118 | addRegisterClass(MVT::VT: v3i64, RC: &AMDGPU::SGPR_192RegClass); |
119 | addRegisterClass(MVT::VT: v3f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 192)); |
120 | |
121 | addRegisterClass(MVT::VT: v7i32, RC: &AMDGPU::SGPR_224RegClass); |
122 | addRegisterClass(MVT::VT: v7f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 224)); |
123 | |
124 | addRegisterClass(MVT::VT: v8i32, RC: &AMDGPU::SGPR_256RegClass); |
125 | addRegisterClass(MVT::VT: v8f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 256)); |
126 | |
127 | addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); |
128 | addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256)); |
129 | |
130 | addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass); |
131 | addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288)); |
132 | |
133 | addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass); |
134 | addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320)); |
135 | |
136 | addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass); |
137 | addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352)); |
138 | |
139 | addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass); |
140 | addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384)); |
141 | |
142 | addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); |
143 | addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512)); |
144 | |
145 | addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); |
146 | addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512)); |
147 | |
148 | addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); |
149 | addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024)); |
150 | |
151 | if (Subtarget->has16BitInsts()) { |
152 | if (Subtarget->useRealTrue16Insts()) { |
153 | addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass); |
154 | addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass); |
155 | addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass); |
156 | } else { |
157 | addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); |
158 | addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); |
159 | addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass); |
160 | } |
161 | |
162 | // Unless there are also VOP3P operations, not operations are really legal. |
163 | addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass); |
164 | addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass); |
165 | addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass); |
166 | addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); |
167 | addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); |
168 | addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass); |
169 | addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass); |
170 | addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass); |
171 | addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass); |
172 | addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass); |
173 | addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass); |
174 | addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass); |
175 | addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass); |
176 | addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass); |
177 | addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass); |
178 | } |
179 | |
180 | addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); |
181 | addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024)); |
182 | |
183 | computeRegisterProperties(Subtarget->getRegisterInfo()); |
184 | |
185 | // The boolean content concept here is too inflexible. Compares only ever |
186 | // really produce a 1-bit result. Any copy/extend from these will turn into a |
187 | // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as |
188 | // it's what most targets use. |
189 | setBooleanContents(ZeroOrOneBooleanContent); |
190 | setBooleanVectorContents(ZeroOrOneBooleanContent); |
191 | |
192 | // We need to custom lower vector stores from local memory |
193 | setOperationAction(ISD::LOAD, |
194 | {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, |
195 | MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32, |
196 | MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32, |
197 | MVT::i1, MVT::v32i32}, |
198 | Custom); |
199 | |
200 | setOperationAction(ISD::STORE, |
201 | {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, |
202 | MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32, |
203 | MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32, |
204 | MVT::i1, MVT::v32i32}, |
205 | Custom); |
206 | |
207 | if (isTypeLegal(MVT::bf16)) { |
208 | for (unsigned Opc : |
209 | {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV, |
210 | ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM, |
211 | ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT, |
212 | ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI, |
213 | ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2, |
214 | ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10, |
215 | ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT, |
216 | ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE, |
217 | ISD::SETCC}) { |
218 | // FIXME: The promoted to type shouldn't need to be explicit |
219 | setOperationAction(Opc, MVT::bf16, Promote); |
220 | AddPromotedToType(Opc, MVT::bf16, MVT::f32); |
221 | } |
222 | |
223 | setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand); |
224 | |
225 | setOperationAction(ISD::SELECT, MVT::bf16, Promote); |
226 | AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16); |
227 | |
228 | // TODO: Could make these legal |
229 | setOperationAction(ISD::FABS, MVT::bf16, Expand); |
230 | setOperationAction(ISD::FNEG, MVT::bf16, Expand); |
231 | setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand); |
232 | |
233 | // We only need to custom lower because we can't specify an action for bf16 |
234 | // sources. |
235 | setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); |
236 | setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); |
237 | |
238 | setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Promote); |
239 | AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16); |
240 | } |
241 | |
242 | setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); |
243 | setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand); |
244 | setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); |
245 | setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); |
246 | setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); |
247 | setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand); |
248 | setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand); |
249 | setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand); |
250 | setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand); |
251 | setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); |
252 | setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand); |
253 | setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); |
254 | setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); |
255 | setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); |
256 | setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand); |
257 | setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand); |
258 | |
259 | setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand); |
260 | setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand); |
261 | setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand); |
262 | setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand); |
263 | setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand); |
264 | setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand); |
265 | setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand); |
266 | |
267 | setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom); |
268 | |
269 | setOperationAction(ISD::SELECT, MVT::i1, Promote); |
270 | setOperationAction(ISD::SELECT, MVT::i64, Custom); |
271 | setOperationAction(ISD::SELECT, MVT::f64, Promote); |
272 | AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); |
273 | |
274 | setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom); |
275 | |
276 | setOperationAction(ISD::SELECT_CC, |
277 | {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand); |
278 | |
279 | setOperationAction(ISD::SETCC, MVT::i1, Promote); |
280 | setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand); |
281 | AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32); |
282 | |
283 | setOperationAction(ISD::TRUNCATE, |
284 | {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, |
285 | MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32, |
286 | MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32}, |
287 | Expand); |
288 | setOperationAction(ISD::FP_ROUND, |
289 | {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, |
290 | MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32, |
291 | MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32}, |
292 | Expand); |
293 | |
294 | setOperationAction(ISD::SIGN_EXTEND_INREG, |
295 | {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16, |
296 | MVT::v3i16, MVT::v4i16, MVT::Other}, |
297 | Custom); |
298 | |
299 | setOperationAction(ISD::BRCOND, MVT::Other, Custom); |
300 | setOperationAction(ISD::BR_CC, |
301 | {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand); |
302 | |
303 | setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal); |
304 | |
305 | setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal); |
306 | |
307 | setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64, |
308 | Expand); |
309 | |
310 | #if 0 |
311 | setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal); |
312 | #endif |
313 | |
314 | // We only support LOAD/STORE and vector manipulation ops for vectors |
315 | // with > 4 elements. |
316 | for (MVT VT : |
317 | {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32, |
318 | MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, |
319 | MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, |
320 | MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32, |
321 | MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, |
322 | MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16, |
323 | MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32, |
324 | MVT::v32i16, MVT::v32f16, MVT::v32bf16}) { |
325 | for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { |
326 | switch (Op) { |
327 | case ISD::LOAD: |
328 | case ISD::STORE: |
329 | case ISD::BUILD_VECTOR: |
330 | case ISD::BITCAST: |
331 | case ISD::UNDEF: |
332 | case ISD::EXTRACT_VECTOR_ELT: |
333 | case ISD::INSERT_VECTOR_ELT: |
334 | case ISD::SCALAR_TO_VECTOR: |
335 | case ISD::IS_FPCLASS: |
336 | break; |
337 | case ISD::EXTRACT_SUBVECTOR: |
338 | case ISD::INSERT_SUBVECTOR: |
339 | case ISD::CONCAT_VECTORS: |
340 | setOperationAction(Op, VT, Custom); |
341 | break; |
342 | default: |
343 | setOperationAction(Op, VT, Expand); |
344 | break; |
345 | } |
346 | } |
347 | } |
348 | |
349 | setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand); |
350 | |
351 | // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that |
352 | // is expanded to avoid having two separate loops in case the index is a VGPR. |
353 | |
354 | // Most operations are naturally 32-bit vector operations. We only support |
355 | // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. |
356 | for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { |
357 | setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); |
358 | AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); |
359 | |
360 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); |
361 | AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); |
362 | |
363 | setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); |
364 | AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); |
365 | |
366 | setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); |
367 | AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); |
368 | } |
369 | |
370 | for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) { |
371 | setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); |
372 | AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32); |
373 | |
374 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); |
375 | AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32); |
376 | |
377 | setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); |
378 | AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32); |
379 | |
380 | setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); |
381 | AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32); |
382 | } |
383 | |
384 | for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) { |
385 | setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); |
386 | AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32); |
387 | |
388 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); |
389 | AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32); |
390 | |
391 | setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); |
392 | AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32); |
393 | |
394 | setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); |
395 | AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32); |
396 | } |
397 | |
398 | for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) { |
399 | setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); |
400 | AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32); |
401 | |
402 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); |
403 | AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32); |
404 | |
405 | setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); |
406 | AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32); |
407 | |
408 | setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); |
409 | AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32); |
410 | } |
411 | |
412 | for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) { |
413 | setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); |
414 | AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32); |
415 | |
416 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); |
417 | AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32); |
418 | |
419 | setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); |
420 | AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32); |
421 | |
422 | setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); |
423 | AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32); |
424 | } |
425 | |
426 | setOperationAction(ISD::VECTOR_SHUFFLE, |
427 | {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}, |
428 | Expand); |
429 | |
430 | setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16}, |
431 | Custom); |
432 | |
433 | // Avoid stack access for these. |
434 | // TODO: Generalize to more vector types. |
435 | setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}, |
436 | {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8, |
437 | MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16}, |
438 | Custom); |
439 | |
440 | // Deal with vec3 vector operations when widened to vec4. |
441 | setOperationAction(ISD::INSERT_SUBVECTOR, |
442 | {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom); |
443 | |
444 | // Deal with vec5/6/7 vector operations when widened to vec8. |
445 | setOperationAction(ISD::INSERT_SUBVECTOR, |
446 | {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32, |
447 | MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32, |
448 | MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32, |
449 | MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32}, |
450 | Custom); |
451 | |
452 | // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, |
453 | // and output demarshalling |
454 | setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom); |
455 | |
456 | // We can't return success/failure, only the old value, |
457 | // let LLVM add the comparison |
458 | setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64}, |
459 | Expand); |
460 | |
461 | setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom); |
462 | |
463 | setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal); |
464 | |
465 | // FIXME: This should be narrowed to i32, but that only happens if i64 is |
466 | // illegal. |
467 | // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32. |
468 | setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal); |
469 | |
470 | // On SI this is s_memtime and s_memrealtime on VI. |
471 | setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); |
472 | |
473 | if (Subtarget->hasSMemRealTime() || |
474 | Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11) |
475 | setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal); |
476 | setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom); |
477 | |
478 | if (Subtarget->has16BitInsts()) { |
479 | setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote); |
480 | setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom); |
481 | } else { |
482 | setOperationAction(ISD::FSQRT, MVT::f16, Custom); |
483 | } |
484 | |
485 | if (Subtarget->hasMadMacF32Insts()) |
486 | setOperationAction(ISD::FMAD, MVT::f32, Legal); |
487 | |
488 | if (!Subtarget->hasBFI()) |
489 | // fcopysign can be done in a single instruction with BFI. |
490 | setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand); |
491 | |
492 | if (!Subtarget->hasBCNT(32)) |
493 | setOperationAction(ISD::CTPOP, MVT::i32, Expand); |
494 | |
495 | if (!Subtarget->hasBCNT(64)) |
496 | setOperationAction(ISD::CTPOP, MVT::i64, Expand); |
497 | |
498 | if (Subtarget->hasFFBH()) |
499 | setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom); |
500 | |
501 | if (Subtarget->hasFFBL()) |
502 | setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); |
503 | |
504 | // We only really have 32-bit BFE instructions (and 16-bit on VI). |
505 | // |
506 | // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any |
507 | // effort to match them now. We want this to be false for i64 cases when the |
508 | // extraction isn't restricted to the upper or lower half. Ideally we would |
509 | // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that |
510 | // span the midpoint are probably relatively rare, so don't worry about them |
511 | // for now. |
512 | if (Subtarget->hasBFE()) |
513 | setHasExtractBitsInsn(true); |
514 | |
515 | // Clamp modifier on add/sub |
516 | if (Subtarget->hasIntClamp()) |
517 | setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal); |
518 | |
519 | if (Subtarget->hasAddNoCarry()) |
520 | setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32}, |
521 | Legal); |
522 | |
523 | setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64}, |
524 | Custom); |
525 | |
526 | // These are really only legal for ieee_mode functions. We should be avoiding |
527 | // them for functions that don't have ieee_mode enabled, so just say they are |
528 | // legal. |
529 | setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, |
530 | {MVT::f32, MVT::f64}, Legal); |
531 | |
532 | if (Subtarget->haveRoundOpsF64()) |
533 | setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64, |
534 | Legal); |
535 | else |
536 | setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR}, |
537 | MVT::f64, Custom); |
538 | |
539 | setOperationAction(ISD::FFLOOR, MVT::f64, Legal); |
540 | setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64}, |
541 | Legal); |
542 | setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom); |
543 | |
544 | setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom); |
545 | setOperationAction(ISD::FDIV, MVT::f64, Custom); |
546 | |
547 | setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand); |
548 | setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand); |
549 | |
550 | // Custom lower these because we can't specify a rule based on an illegal |
551 | // source bf16. |
552 | setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom); |
553 | setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom); |
554 | |
555 | if (Subtarget->has16BitInsts()) { |
556 | setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN, |
557 | ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT}, |
558 | MVT::i16, Legal); |
559 | |
560 | AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); |
561 | |
562 | setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC}, |
563 | MVT::i16, Expand); |
564 | |
565 | setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM, |
566 | ISD::UREM, ISD::BITREVERSE, ISD::CTTZ, |
567 | ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF, |
568 | ISD::CTPOP}, |
569 | MVT::i16, Promote); |
570 | |
571 | setOperationAction(ISD::LOAD, MVT::i16, Custom); |
572 | |
573 | setTruncStoreAction(MVT::i64, MVT::i16, Expand); |
574 | |
575 | setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote); |
576 | AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32); |
577 | setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); |
578 | AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); |
579 | |
580 | setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom); |
581 | setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); |
582 | setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); |
583 | |
584 | setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom); |
585 | |
586 | // F16 - Constant Actions. |
587 | setOperationAction(ISD::ConstantFP, MVT::f16, Legal); |
588 | setOperationAction(ISD::ConstantFP, MVT::bf16, Legal); |
589 | |
590 | // F16 - Load/Store Actions. |
591 | setOperationAction(ISD::LOAD, MVT::f16, Promote); |
592 | AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16); |
593 | setOperationAction(ISD::STORE, MVT::f16, Promote); |
594 | AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); |
595 | |
596 | // BF16 - Load/Store Actions. |
597 | setOperationAction(ISD::LOAD, MVT::bf16, Promote); |
598 | AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16); |
599 | setOperationAction(ISD::STORE, MVT::bf16, Promote); |
600 | AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16); |
601 | |
602 | // F16 - VOP1 Actions. |
603 | setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS, |
604 | ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND}, |
605 | MVT::f16, Custom); |
606 | |
607 | setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote); |
608 | setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote); |
609 | |
610 | // F16 - VOP2 Actions. |
611 | setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16}, |
612 | Expand); |
613 | setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom); |
614 | setOperationAction(ISD::FFREXP, MVT::f16, Custom); |
615 | setOperationAction(ISD::FDIV, MVT::f16, Custom); |
616 | |
617 | // F16 - VOP3 Actions. |
618 | setOperationAction(ISD::FMA, MVT::f16, Legal); |
619 | if (STI.hasMadF16()) |
620 | setOperationAction(ISD::FMAD, MVT::f16, Legal); |
621 | |
622 | for (MVT VT : |
623 | {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16, |
624 | MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, |
625 | MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) { |
626 | for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { |
627 | switch (Op) { |
628 | case ISD::LOAD: |
629 | case ISD::STORE: |
630 | case ISD::BUILD_VECTOR: |
631 | case ISD::BITCAST: |
632 | case ISD::UNDEF: |
633 | case ISD::EXTRACT_VECTOR_ELT: |
634 | case ISD::INSERT_VECTOR_ELT: |
635 | case ISD::INSERT_SUBVECTOR: |
636 | case ISD::EXTRACT_SUBVECTOR: |
637 | case ISD::SCALAR_TO_VECTOR: |
638 | case ISD::IS_FPCLASS: |
639 | break; |
640 | case ISD::CONCAT_VECTORS: |
641 | setOperationAction(Op, VT, Custom); |
642 | break; |
643 | default: |
644 | setOperationAction(Op, VT, Expand); |
645 | break; |
646 | } |
647 | } |
648 | } |
649 | |
650 | // v_perm_b32 can handle either of these. |
651 | setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal); |
652 | setOperationAction(ISD::BSWAP, MVT::v4i16, Custom); |
653 | |
654 | // XXX - Do these do anything? Vector constants turn into build_vector. |
655 | setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal); |
656 | |
657 | setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, |
658 | Legal); |
659 | |
660 | setOperationAction(ISD::STORE, MVT::v2i16, Promote); |
661 | AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32); |
662 | setOperationAction(ISD::STORE, MVT::v2f16, Promote); |
663 | AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32); |
664 | |
665 | setOperationAction(ISD::LOAD, MVT::v2i16, Promote); |
666 | AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32); |
667 | setOperationAction(ISD::LOAD, MVT::v2f16, Promote); |
668 | AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32); |
669 | |
670 | setOperationAction(ISD::AND, MVT::v2i16, Promote); |
671 | AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32); |
672 | setOperationAction(ISD::OR, MVT::v2i16, Promote); |
673 | AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32); |
674 | setOperationAction(ISD::XOR, MVT::v2i16, Promote); |
675 | AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32); |
676 | |
677 | setOperationAction(ISD::LOAD, MVT::v4i16, Promote); |
678 | AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32); |
679 | setOperationAction(ISD::LOAD, MVT::v4f16, Promote); |
680 | AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32); |
681 | setOperationAction(ISD::LOAD, MVT::v4bf16, Promote); |
682 | AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32); |
683 | |
684 | setOperationAction(ISD::STORE, MVT::v4i16, Promote); |
685 | AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); |
686 | setOperationAction(ISD::STORE, MVT::v4f16, Promote); |
687 | AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); |
688 | setOperationAction(ISD::STORE, MVT::v4bf16, Promote); |
689 | AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32); |
690 | |
691 | setOperationAction(ISD::LOAD, MVT::v8i16, Promote); |
692 | AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32); |
693 | setOperationAction(ISD::LOAD, MVT::v8f16, Promote); |
694 | AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32); |
695 | setOperationAction(ISD::LOAD, MVT::v8bf16, Promote); |
696 | AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32); |
697 | |
698 | setOperationAction(ISD::STORE, MVT::v4i16, Promote); |
699 | AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); |
700 | setOperationAction(ISD::STORE, MVT::v4f16, Promote); |
701 | AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); |
702 | |
703 | setOperationAction(ISD::STORE, MVT::v8i16, Promote); |
704 | AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32); |
705 | setOperationAction(ISD::STORE, MVT::v8f16, Promote); |
706 | AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32); |
707 | setOperationAction(ISD::STORE, MVT::v8bf16, Promote); |
708 | AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32); |
709 | |
710 | setOperationAction(ISD::LOAD, MVT::v16i16, Promote); |
711 | AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32); |
712 | setOperationAction(ISD::LOAD, MVT::v16f16, Promote); |
713 | AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32); |
714 | setOperationAction(ISD::LOAD, MVT::v16bf16, Promote); |
715 | AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32); |
716 | |
717 | setOperationAction(ISD::STORE, MVT::v16i16, Promote); |
718 | AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32); |
719 | setOperationAction(ISD::STORE, MVT::v16f16, Promote); |
720 | AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32); |
721 | setOperationAction(ISD::STORE, MVT::v16bf16, Promote); |
722 | AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32); |
723 | |
724 | setOperationAction(ISD::LOAD, MVT::v32i16, Promote); |
725 | AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32); |
726 | setOperationAction(ISD::LOAD, MVT::v32f16, Promote); |
727 | AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32); |
728 | setOperationAction(ISD::LOAD, MVT::v32bf16, Promote); |
729 | AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32); |
730 | |
731 | setOperationAction(ISD::STORE, MVT::v32i16, Promote); |
732 | AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32); |
733 | setOperationAction(ISD::STORE, MVT::v32f16, Promote); |
734 | AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32); |
735 | setOperationAction(ISD::STORE, MVT::v32bf16, Promote); |
736 | AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32); |
737 | |
738 | setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, |
739 | MVT::v2i32, Expand); |
740 | setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); |
741 | |
742 | setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, |
743 | MVT::v4i32, Expand); |
744 | |
745 | setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, |
746 | MVT::v8i32, Expand); |
747 | |
748 | if (!Subtarget->hasVOP3PInsts()) |
749 | setOperationAction(ISD::BUILD_VECTOR, |
750 | {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom); |
751 | |
752 | setOperationAction(ISD::FNEG, MVT::v2f16, Legal); |
753 | // This isn't really legal, but this avoids the legalizer unrolling it (and |
754 | // allows matching fneg (fabs x) patterns) |
755 | setOperationAction(ISD::FABS, MVT::v2f16, Legal); |
756 | |
757 | setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); |
758 | setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal); |
759 | |
760 | setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, |
761 | {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, |
762 | Custom); |
763 | |
764 | setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, |
765 | {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, |
766 | Expand); |
767 | |
768 | for (MVT Vec16 : |
769 | {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16, |
770 | MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) { |
771 | setOperationAction( |
772 | {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR}, |
773 | Vec16, Custom); |
774 | setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand); |
775 | } |
776 | } |
777 | |
778 | if (Subtarget->hasVOP3PInsts()) { |
779 | setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL, |
780 | ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX, |
781 | ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT}, |
782 | MVT::v2i16, Legal); |
783 | |
784 | setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE, |
785 | ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE}, |
786 | MVT::v2f16, Legal); |
787 | |
788 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, |
789 | Custom); |
790 | |
791 | setOperationAction(ISD::VECTOR_SHUFFLE, |
792 | {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16, |
793 | MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16}, |
794 | Custom); |
795 | |
796 | for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16}) |
797 | // Split vector operations. |
798 | setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB, |
799 | ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, |
800 | ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT, |
801 | ISD::SSUBSAT}, |
802 | VT, Custom); |
803 | |
804 | for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}) |
805 | // Split vector operations. |
806 | setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE}, |
807 | VT, Custom); |
808 | |
809 | setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16}, |
810 | Custom); |
811 | |
812 | setOperationAction(ISD::FEXP, MVT::v2f16, Custom); |
813 | setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16}, |
814 | Custom); |
815 | |
816 | if (Subtarget->hasPackedFP32Ops()) { |
817 | setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG}, |
818 | MVT::v2f32, Legal); |
819 | setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA}, |
820 | {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32}, |
821 | Custom); |
822 | } |
823 | } |
824 | |
825 | setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom); |
826 | |
827 | if (Subtarget->has16BitInsts()) { |
828 | setOperationAction(ISD::SELECT, MVT::v2i16, Promote); |
829 | AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); |
830 | setOperationAction(ISD::SELECT, MVT::v2f16, Promote); |
831 | AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); |
832 | } else { |
833 | // Legalization hack. |
834 | setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom); |
835 | |
836 | setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom); |
837 | } |
838 | |
839 | setOperationAction(ISD::SELECT, |
840 | {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8, |
841 | MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16, |
842 | MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16, |
843 | MVT::v32f16, MVT::v32bf16}, |
844 | Custom); |
845 | |
846 | setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); |
847 | |
848 | if (Subtarget->hasScalarSMulU64()) |
849 | setOperationAction(ISD::MUL, MVT::i64, Custom); |
850 | |
851 | if (Subtarget->hasMad64_32()) |
852 | setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); |
853 | |
854 | if (Subtarget->hasPrefetch()) |
855 | setOperationAction(ISD::PREFETCH, MVT::Other, Custom); |
856 | |
857 | if (Subtarget->hasIEEEMinMax()) |
858 | setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, |
859 | {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal); |
860 | |
861 | setOperationAction(ISD::INTRINSIC_WO_CHAIN, |
862 | {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, |
863 | MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8}, |
864 | Custom); |
865 | |
866 | setOperationAction(ISD::INTRINSIC_W_CHAIN, |
867 | {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16, |
868 | MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16, |
869 | MVT::i16, MVT::i8, MVT::i128}, |
870 | Custom); |
871 | |
872 | setOperationAction(ISD::INTRINSIC_VOID, |
873 | {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16, |
874 | MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16, |
875 | MVT::i8, MVT::i128}, |
876 | Custom); |
877 | |
878 | setOperationAction(ISD::STACKSAVE, MVT::Other, Custom); |
879 | setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); |
880 | setOperationAction(ISD::GET_FPENV, MVT::i64, Custom); |
881 | setOperationAction(ISD::SET_FPENV, MVT::i64, Custom); |
882 | |
883 | // TODO: Could move this to custom lowering, could benefit from combines on |
884 | // extract of relevant bits. |
885 | setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal); |
886 | |
887 | setOperationAction(ISD::MUL, MVT::i1, Promote); |
888 | |
889 | setTargetDAGCombine({ISD::ADD, |
890 | ISD::UADDO_CARRY, |
891 | ISD::SUB, |
892 | ISD::USUBO_CARRY, |
893 | ISD::FADD, |
894 | ISD::FSUB, |
895 | ISD::FDIV, |
896 | ISD::FMINNUM, |
897 | ISD::FMAXNUM, |
898 | ISD::FMINNUM_IEEE, |
899 | ISD::FMAXNUM_IEEE, |
900 | ISD::FMINIMUM, |
901 | ISD::FMAXIMUM, |
902 | ISD::FMA, |
903 | ISD::SMIN, |
904 | ISD::SMAX, |
905 | ISD::UMIN, |
906 | ISD::UMAX, |
907 | ISD::SETCC, |
908 | ISD::AND, |
909 | ISD::OR, |
910 | ISD::XOR, |
911 | ISD::FSHR, |
912 | ISD::SINT_TO_FP, |
913 | ISD::UINT_TO_FP, |
914 | ISD::FCANONICALIZE, |
915 | ISD::SCALAR_TO_VECTOR, |
916 | ISD::ZERO_EXTEND, |
917 | ISD::SIGN_EXTEND_INREG, |
918 | ISD::EXTRACT_VECTOR_ELT, |
919 | ISD::INSERT_VECTOR_ELT, |
920 | ISD::FCOPYSIGN}); |
921 | |
922 | if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16()) |
923 | setTargetDAGCombine(ISD::FP_ROUND); |
924 | |
925 | // All memory operations. Some folding on the pointer operand is done to help |
926 | // matching the constant offsets in the addressing modes. |
927 | setTargetDAGCombine({ISD::LOAD, |
928 | ISD::STORE, |
929 | ISD::ATOMIC_LOAD, |
930 | ISD::ATOMIC_STORE, |
931 | ISD::ATOMIC_CMP_SWAP, |
932 | ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, |
933 | ISD::ATOMIC_SWAP, |
934 | ISD::ATOMIC_LOAD_ADD, |
935 | ISD::ATOMIC_LOAD_SUB, |
936 | ISD::ATOMIC_LOAD_AND, |
937 | ISD::ATOMIC_LOAD_OR, |
938 | ISD::ATOMIC_LOAD_XOR, |
939 | ISD::ATOMIC_LOAD_NAND, |
940 | ISD::ATOMIC_LOAD_MIN, |
941 | ISD::ATOMIC_LOAD_MAX, |
942 | ISD::ATOMIC_LOAD_UMIN, |
943 | ISD::ATOMIC_LOAD_UMAX, |
944 | ISD::ATOMIC_LOAD_FADD, |
945 | ISD::ATOMIC_LOAD_UINC_WRAP, |
946 | ISD::ATOMIC_LOAD_UDEC_WRAP, |
947 | ISD::INTRINSIC_VOID, |
948 | ISD::INTRINSIC_W_CHAIN}); |
949 | |
950 | // FIXME: In other contexts we pretend this is a per-function property. |
951 | setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32); |
952 | |
953 | setSchedulingPreference(Sched::RegPressure); |
954 | } |
955 | |
/// Return the GCN subtarget this lowering was created for.
const GCNSubtarget *SITargetLowering::getSubtarget() const {
  return Subtarget;
}
959 | |
960 | //===----------------------------------------------------------------------===// |
961 | // TargetLowering queries |
962 | //===----------------------------------------------------------------------===// |
963 | |
// v_mad_mix* support a conversion from f16 to f32.
//
// When denormals are enabled there is only one special case where this is OK
// to use, and we don't currently handle it.
968 | bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, |
969 | EVT DestVT, EVT SrcVT) const { |
970 | return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || |
971 | (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && |
972 | DestVT.getScalarType() == MVT::f32 && |
973 | SrcVT.getScalarType() == MVT::f16 && |
974 | // TODO: This probably only requires no input flushing? |
975 | denormalModeIsFlushAllF32(DAG.getMachineFunction()); |
976 | } |
977 | |
978 | bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, |
979 | LLT DestTy, LLT SrcTy) const { |
980 | return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) || |
981 | (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) && |
982 | DestTy.getScalarSizeInBits() == 32 && |
983 | SrcTy.getScalarSizeInBits() == 16 && |
984 | // TODO: This probably only requires no input flushing? |
985 | denormalModeIsFlushAllF32(MF: *MI.getMF()); |
986 | } |
987 | |
/// Report every shuffle mask as illegal so the legalizer scalarizes instead.
bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}
993 | |
994 | MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, |
995 | CallingConv::ID CC, |
996 | EVT VT) const { |
997 | if (CC == CallingConv::AMDGPU_KERNEL) |
998 | return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); |
999 | |
1000 | if (VT.isVector()) { |
1001 | EVT ScalarVT = VT.getScalarType(); |
1002 | unsigned Size = ScalarVT.getSizeInBits(); |
1003 | if (Size == 16) { |
1004 | if (Subtarget->has16BitInsts()) { |
1005 | if (VT.isInteger()) |
1006 | return MVT::v2i16; |
1007 | return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16); |
1008 | } |
1009 | return VT.isInteger() ? MVT::i32 : MVT::f32; |
1010 | } |
1011 | |
1012 | if (Size < 16) |
1013 | return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32; |
1014 | return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32; |
1015 | } |
1016 | |
1017 | if (VT.getSizeInBits() > 32) |
1018 | return MVT::i32; |
1019 | |
1020 | return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); |
1021 | } |
1022 | |
1023 | unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, |
1024 | CallingConv::ID CC, |
1025 | EVT VT) const { |
1026 | if (CC == CallingConv::AMDGPU_KERNEL) |
1027 | return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); |
1028 | |
1029 | if (VT.isVector()) { |
1030 | unsigned NumElts = VT.getVectorNumElements(); |
1031 | EVT ScalarVT = VT.getScalarType(); |
1032 | unsigned Size = ScalarVT.getSizeInBits(); |
1033 | |
1034 | // FIXME: Should probably promote 8-bit vectors to i16. |
1035 | if (Size == 16 && Subtarget->has16BitInsts()) |
1036 | return (NumElts + 1) / 2; |
1037 | |
1038 | if (Size <= 32) |
1039 | return NumElts; |
1040 | |
1041 | if (Size > 32) |
1042 | return NumElts * ((Size + 31) / 32); |
1043 | } else if (VT.getSizeInBits() > 32) |
1044 | return (VT.getSizeInBits() + 31) / 32; |
1045 | |
1046 | return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); |
1047 | } |
1048 | |
/// Decompose a non-kernel vector argument \p VT into \p NumIntermediates
/// pieces of type \p IntermediateVT, each carried in a \p RegisterVT
/// register. The Size-based cases mirror getRegisterTypeForCallingConv and
/// getNumRegistersForCallingConv above.
unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
  LLVMContext &Context, CallingConv::ID CC,
  EVT VT, EVT &IntermediateVT,
  unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // inconsistent.
    if (Size == 16 && Subtarget->has16BitInsts()) {
      // Pack 16-bit elements pairwise; a bf16 pair is carried in an i32
      // register.
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
      } else {
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      }
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }

    if (Size == 32) {
      // 32-bit elements map 1:1 onto registers.
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size < 16 && Subtarget->has16BitInsts()) {
      // FIXME: Should probably form v2i16 pieces
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }


    if (Size != 16 && Size <= 32) {
      // Sub-32-bit elements without 16-bit instruction support widen to i32.
      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size > 32) {
      // Wide elements are split into i32 pieces.
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
    Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
1106 | |
1107 | static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) { |
1108 | assert(MaxNumLanes != 0); |
1109 | |
1110 | if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) { |
1111 | unsigned NumElts = std::min(a: MaxNumLanes, b: VT->getNumElements()); |
1112 | return EVT::getVectorVT(Context&: Ty->getContext(), |
1113 | VT: EVT::getEVT(Ty: VT->getElementType()), |
1114 | NumElements: NumElts); |
1115 | } |
1116 | |
1117 | return EVT::getEVT(Ty); |
1118 | } |
1119 | |
1120 | // Peek through TFE struct returns to only use the data size. |
1121 | static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) { |
1122 | auto *ST = dyn_cast<StructType>(Val: Ty); |
1123 | if (!ST) |
1124 | return memVTFromLoadIntrData(Ty, MaxNumLanes); |
1125 | |
1126 | // TFE intrinsics return an aggregate type. |
1127 | assert(ST->getNumContainedTypes() == 2 && |
1128 | ST->getContainedType(1)->isIntegerTy(32)); |
1129 | return memVTFromLoadIntrData(Ty: ST->getContainedType(i: 0), MaxNumLanes); |
1130 | } |
1131 | |
1132 | /// Map address space 7 to MVT::v5i32 because that's its in-memory |
1133 | /// representation. This return value is vector-typed because there is no |
1134 | /// MVT::i160 and it is not clear if one can be added. While this could |
1135 | /// cause issues during codegen, these address space 7 pointers will be |
1136 | /// rewritten away by then. Therefore, we can return MVT::v5i32 in order |
1137 | /// to allow pre-codegen passes that query TargetTransformInfo, often for cost |
1138 | /// modeling, to work. |
1139 | MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const { |
1140 | if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160) |
1141 | return MVT::v5i32; |
1142 | if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS && |
1143 | DL.getPointerSizeInBits(AS) == 192) |
1144 | return MVT::v6i32; |
1145 | return AMDGPUTargetLowering::getPointerTy(DL, AS); |
1146 | } |
1147 | /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka |
1148 | /// v8i32 when padding is added. |
1149 | /// The in-memory representation of a p9 is {p8, i32, i32}, which is |
1150 | /// also v8i32 with padding. |
1151 | MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { |
1152 | if ((AMDGPUAS::BUFFER_FAT_POINTER == AS && |
1153 | DL.getPointerSizeInBits(AS) == 160) || |
1154 | (AMDGPUAS::BUFFER_STRIDED_POINTER == AS && |
1155 | DL.getPointerSizeInBits(AS) == 192)) |
1156 | return MVT::v8i32; |
1157 | return AMDGPUTargetLowering::getPointerMemTy(DL, AS); |
1158 | } |
1159 | |
/// Fill \p Info with the memory-operand details (opcode, memVT, pointer
/// value, flags) for target memory intrinsic \p IntrID called by \p CI.
/// Returns false when the call needs no MachineMemOperand.
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {
  Info.flags = MachineMemOperand::MONone;
  if (CI.hasMetadata(KindID: LLVMContext::MD_invariant_load))
    Info.flags |= MachineMemOperand::MOInvariant;

  // Buffer/image (rsrc) intrinsics are classified by their declared memory
  // effects rather than case-by-case.
  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
          AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) {
    AttributeList Attr = Intrinsic::getAttributes(C&: CI.getContext(),
                                                  id: (Intrinsic::ID)IntrID);
    MemoryEffects ME = Attr.getMemoryEffects();
    if (ME.doesNotAccessMemory())
      return false;

    // TODO: Should images get their own address space?
    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;

    if (RsrcIntr->IsImage)
      Info.align.reset();

    Value *RsrcArg = CI.getArgOperand(i: RsrcIntr->RsrcArg);
    if (auto *RsrcPtrTy = dyn_cast<PointerType>(Val: RsrcArg->getType())) {
      if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
        // We conservatively set the memory operand of a buffer intrinsic to the
        // base resource pointer, so that we can access alias information about
        // those pointers. Cases like "this points at the same value
        // but with a different offset" are handled in
        // areMemAccessesTriviallyDisjoint.
        Info.ptrVal = RsrcArg;
    }

    // The last argument carries the cache-policy bits; honor its volatile bit.
    auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 1));
    if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
      Info.flags |= MachineMemOperand::MOVolatile;
    Info.flags |= MachineMemOperand::MODereferenceable;
    if (ME.onlyReadsMemory()) {
      unsigned MaxNumLanes = 4;

      if (RsrcIntr->IsImage) {
        const AMDGPU::ImageDimIntrinsicInfo *Intr
          = AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID);
        const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
            AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);

        if (!BaseOpcode->Gather4) {
          // If this isn't a gather, we may have excess loaded elements in the
          // IR type. Check the dmask for the real number of elements loaded.
          unsigned DMask
            = cast<ConstantInt>(Val: CI.getArgOperand(i: 0))->getZExtValue();
          MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
        }
      }

      Info.memVT = memVTFromLoadIntrReturn(Ty: CI.getType(), MaxNumLanes);

      // FIXME: What does alignment mean for an image?
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.flags |= MachineMemOperand::MOLoad;
    } else if (ME.onlyWritesMemory()) {
      Info.opc = ISD::INTRINSIC_VOID;

      Type *DataTy = CI.getArgOperand(i: 0)->getType();
      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(Val: CI.getArgOperand(i: 1))->getZExtValue();
        unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
        Info.memVT = memVTFromLoadIntrData(Ty: DataTy, MaxNumLanes: DMaskLanes);
      } else
        Info.memVT = EVT::getEVT(Ty: DataTy);

      Info.flags |= MachineMemOperand::MOStore;
    } else {
      // Atomic
      Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
                                            ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(Ty: CI.getArgOperand(i: 0)->getType());
      Info.flags |= MachineMemOperand::MOLoad |
                    MachineMemOperand::MOStore |
                    MachineMemOperand::MODereferenceable;

      switch (IntrID) {
      default:
        // XXX - Should this be volatile without known ordering?
        Info.flags |= MachineMemOperand::MOVolatile;
        break;
      case Intrinsic::amdgcn_raw_buffer_load_lds:
      case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
      case Intrinsic::amdgcn_struct_buffer_load_lds:
      case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
        // buffer-to-LDS copies: the access width is a byte-count immediate.
        unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
        Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
        Info.ptrVal = CI.getArgOperand(i: 1);
        return true;
      }
      }
    }
    return true;
  }

  // Non-rsrc target memory intrinsics are handled individually.
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(Ty: CI.getType());
    Info.ptrVal = CI.getOperand(i_nocapture: 0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    // Operand 4 is the volatile flag.
    const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 4));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_buffer_atomic_fadd: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(Ty: CI.getOperand(i_nocapture: 0)->getType());
    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    // Treat a non-constant or nonzero volatile operand as volatile.
    const ConstantInt *Vol = dyn_cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 4));
    if (!Vol || !Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(Ty: CI.getOperand(i_nocapture: 0)->getType());
    Info.ptrVal = nullptr;
    Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(Ty: CI.getType());
    Info.ptrVal = CI.getOperand(i_nocapture: 0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    // Operand 1 is the volatile flag.
    const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 1));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_global_atomic_csub: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(Ty: CI.getType());
    Info.ptrVal = CI.getOperand(i_nocapture: 0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad |
                  MachineMemOperand::MOStore |
                  MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(Ty: CI.getType()); // XXX: what is correct VT?

    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad |
                  MachineMemOperand::MODereferenceable;
    return true;
  }
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(Ty: CI.getType());
    Info.ptrVal = CI.getOperand(i_nocapture: 0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad |
                  MachineMemOperand::MOStore |
                  MachineMemOperand::MODereferenceable |
                  MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(Ty: CI.getType());
    Info.ptrVal = CI.getOperand(i_nocapture: 0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.opc = ISD::INTRINSIC_VOID;

    const GCNTargetMachine &TM =
      static_cast<const GCNTargetMachine &>(getTargetMachine());

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Info.ptrVal = MFI->getGWSPSV(TM);

    // This is an abstract access, but we need to specify a type and size.
    Info.memVT = MVT::i32;
    Info.size = 4;
    Info.align = Align(4);

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
      Info.flags |= MachineMemOperand::MOLoad;
    else
      Info.flags |= MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::amdgcn_global_load_lds: {
    Info.opc = ISD::INTRINSIC_VOID;
    // The access width is a byte-count immediate (operand 2).
    unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
    Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
    Info.ptrVal = CI.getArgOperand(i: 1);
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;

    const GCNTargetMachine &TM =
      static_cast<const GCNTargetMachine &>(getTargetMachine());

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Info.ptrVal = MFI->getGWSPSV(TM);

    // This is an abstract access, but we need to specify a type and size.
    Info.memVT = MVT::i32;
    Info.size = 4;
    Info.align = Align(4);

    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  }
  default:
    return false;
  }
}
1421 | |
1422 | void SITargetLowering::CollectTargetIntrinsicOperands( |
1423 | const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const { |
1424 | switch (cast<IntrinsicInst>(Val: I).getIntrinsicID()) { |
1425 | case Intrinsic::amdgcn_addrspacecast_nonnull: { |
1426 | // The DAG's ValueType loses the addrspaces. |
1427 | // Add them as 2 extra Constant operands "from" and "to". |
1428 | unsigned SrcAS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace(); |
1429 | unsigned DstAS = I.getType()->getPointerAddressSpace(); |
1430 | Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32)); |
1431 | Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32)); |
1432 | break; |
1433 | } |
1434 | default: |
1435 | break; |
1436 | } |
1437 | } |
1438 | |
/// For memory intrinsic \p II, record its pointer operand in \p Ops and the
/// accessed type in \p AccessTy so addressing-mode queries (e.g. from LSR)
/// can reason about it. Returns false for unrecognized intrinsics.
bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                            SmallVectorImpl<Value*> &Ops,
                                            Type *&AccessTy) const {
  Value *Ptr = nullptr;
  switch (II->getIntrinsicID()) {
  // For these intrinsics the pointer is the first argument.
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
    Ptr = II->getArgOperand(i: 0);
    break;
  // Here the global pointer is the second argument.
  case Intrinsic::amdgcn_global_load_lds:
    Ptr = II->getArgOperand(i: 1);
    break;
  default:
    return false;
  }
  AccessTy = II->getType();
  Ops.push_back(Elt: Ptr);
  return true;
}
1480 | |
1481 | bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM, |
1482 | unsigned AddrSpace, |
1483 | uint64_t FlatVariant) const { |
1484 | if (!Subtarget->hasFlatInstOffsets()) { |
1485 | // Flat instructions do not have offsets, and only have the register |
1486 | // address. |
1487 | return AM.BaseOffs == 0 && AM.Scale == 0; |
1488 | } |
1489 | |
1490 | return AM.Scale == 0 && |
1491 | (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( |
1492 | Offset: AM.BaseOffs, AddrSpace, FlatVariant)); |
1493 | } |
1494 | |
1495 | bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { |
1496 | if (Subtarget->hasFlatGlobalInsts()) |
1497 | return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS, |
1498 | FlatVariant: SIInstrFlags::FlatGlobal); |
1499 | |
1500 | if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { |
1501 | // Assume the we will use FLAT for all global memory accesses |
1502 | // on VI. |
1503 | // FIXME: This assumption is currently wrong. On VI we still use |
1504 | // MUBUF instructions for the r + i addressing mode. As currently |
1505 | // implemented, the MUBUF instructions only work on buffer < 4GB. |
1506 | // It may be possible to support > 4GB buffers with MUBUF instructions, |
1507 | // by setting the stride value in the resource descriptor which would |
1508 | // increase the size limit to (stride * 4GB). However, this is risky, |
1509 | // because it has never been validated. |
1510 | return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS, |
1511 | FlatVariant: SIInstrFlags::FLAT); |
1512 | } |
1513 | |
1514 | return isLegalMUBUFAddressingMode(AM); |
1515 | } |
1516 | |
1517 | bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { |
1518 | // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and |
1519 | // additionally can do r + r + i with addr64. 32-bit has more addressing |
1520 | // mode options. Depending on the resource constant, it can also do |
1521 | // (i64 r0) + (i32 r1) * (i14 i). |
1522 | // |
1523 | // Private arrays end up using a scratch buffer most of the time, so also |
1524 | // assume those use MUBUF instructions. Scratch loads / stores are currently |
1525 | // implemented as mubuf instructions with offen bit set, so slightly |
1526 | // different than the normal addr64. |
1527 | const SIInstrInfo *TII = Subtarget->getInstrInfo(); |
1528 | if (!TII->isLegalMUBUFImmOffset(Imm: AM.BaseOffs)) |
1529 | return false; |
1530 | |
1531 | // FIXME: Since we can split immediate into soffset and immediate offset, |
1532 | // would it make sense to allow any immediate? |
1533 | |
1534 | switch (AM.Scale) { |
1535 | case 0: // r + i or just i, depending on HasBaseReg. |
1536 | return true; |
1537 | case 1: |
1538 | return true; // We have r + r or r + i. |
1539 | case 2: |
1540 | if (AM.HasBaseReg) { |
1541 | // Reject 2 * r + r. |
1542 | return false; |
1543 | } |
1544 | |
1545 | // Allow 2 * r as r + r |
1546 | // Or 2 * r + i is allowed as r + r + i. |
1547 | return true; |
1548 | default: // Don't allow n * r |
1549 | return false; |
1550 | } |
1551 | } |
1552 | |
/// Top-level legality check for an addressing mode, dispatching on the
/// address space to the FLAT / MUBUF / SMRD / DS-specific rules.
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS, Instruction *I) const {
  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return isLegalGlobalAddressingMode(AM);

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
      AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
    // Constant-like accesses are expected to go through scalar (SMRD/SMEM)
    // loads, whose offset encodings vary widely by generation below.
    // If the offset isn't a multiple of 4, it probably isn't going to be
    // correctly aligned.
    // FIXME: Can we get the real alignment here?
    if (AM.BaseOffs % 4 != 0)
      return isLegalMUBUFAddressingMode(AM);

    if (!Subtarget->hasScalarSubwordLoads()) {
      // There are no SMRD extloads, so if we have to do a small type access we
      // will use a MUBUF load.
      // FIXME?: We also need to do this if unaligned, but we don't know the
      // alignment here.
      if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
        return isLegalGlobalAddressingMode(AM);
    }

    // Per-generation scalar-load immediate offset ranges.
    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
      if (!isUInt<8>(x: AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
      // in 8-bits, it can use a smaller encoding.
      if (!isUInt<32>(x: AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
      if (!isUInt<20>(x: AM.BaseOffs))
        return false;
    } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
      // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
      // for S_BUFFER_* instructions).
      if (!isInt<21>(x: AM.BaseOffs))
        return false;
    } else {
      // On GFX12, all offsets are signed 24-bit in bytes.
      if (!isInt<24>(x: AM.BaseOffs))
        return false;
    }

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }

  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return Subtarget->enableFlatScratch()
               ? isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
                                           FlatVariant: SIInstrFlags::FlatScratch)
               : isLegalMUBUFAddressingMode(AM);

  if (AS == AMDGPUAS::LOCAL_ADDRESS ||
      (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
    // field.
    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
    // an 8-bit dword offset but we don't know the alignment here.
    if (!isUInt<16>(x: AM.BaseOffs))
      return false;

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }

  if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
    // For an unknown address space, this usually means that this is for some
    // reason being used for pure arithmetic, and not based on some addressing
    // computation. We don't have instructions that compute pointers with any
    // addressing modes, so treat them as having no offset like flat
    // instructions.
    return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS,
                                     FlatVariant: SIInstrFlags::FLAT);
  }

  // Assume a user alias of global for unknown address spaces.
  return isLegalGlobalAddressingMode(AM);
}
1652 | |
1653 | bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, |
1654 | const MachineFunction &MF) const { |
1655 | if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) { |
1656 | return (MemVT.getSizeInBits() <= 4 * 32); |
1657 | } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { |
1658 | unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); |
1659 | return (MemVT.getSizeInBits() <= MaxPrivateBits); |
1660 | } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { |
1661 | return (MemVT.getSizeInBits() <= 2 * 32); |
1662 | } |
1663 | return true; |
1664 | } |
1665 | |
/// Decide whether an access of \p Size bits at \p Alignment in \p AddrSpace
/// is allowed, and optionally report a relative "speed rank" via \p IsFast
/// (see the comments inside for how ranks are compared; 0/1 mean slow).
bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
    unsigned Size, unsigned AddrSpace, Align Alignment,
    MachineMemOperand::Flags Flags, unsigned *IsFast) const {
  if (IsFast)
    *IsFast = 0;

  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
    // Check if alignment requirements for ds_read/write instructions are
    // disabled.
    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
      return false;

    Align RequiredAlignment(PowerOf2Ceil(A: Size/8)); // Natural alignment.
    if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
        Alignment < RequiredAlignment)
      return false;

    // Either, the alignment requirements are "enabled", or there is an
    // unaligned LDS access related hardware bug though alignment requirements
    // are "disabled". In either case, we need to check for proper alignment
    // requirements.
    //
    switch (Size) {
    case 64:
      // SI has a hardware bug in the LDS / GDS bounds checking: if the base
      // address is negative, then the instruction is incorrectly treated as
      // out-of-bounds even if base + offsets is in bounds. Split vectorized
      // loads here to avoid emitting ds_read2_b32. We may re-combine the
      // load later in the SILoadStoreOptimizer.
      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
        return false;

      // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
      // can do a 4 byte aligned, 8 byte access in a single operation using
      // ds_read2/write2_b32 with adjacent offsets.
      RequiredAlignment = Align(4);

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
        // ds_write2_b32 depending on the alignment. In either case with either
        // alignment there is no faster way of doing this.

        // The numbers returned here and below are not additive, it is a 'speed
        // rank'. They are just meant to be compared to decide if a certain way
        // of lowering an operation is faster than another. For that purpose
        // naturally aligned operation gets it bitsize to indicate that "it
        // operates with a speed comparable to N-bit wide load". With the full
        // alignment ds128 is slower than ds96 for example. If underaligned it
        // is comparable to a speed of a single dword access, which would then
        // mean 32 < 128 and it is faster to issue a wide load regardless.
        // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
        // wider load which will not be aligned anymore the latter is slower.
        if (IsFast)
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4))         ? 32
                                                     : 1;
        return true;
      }

      break;
    case 96:
      if (!Subtarget->hasDS96AndDS128())
        return false;

      // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
      // gfx8 and older.

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        // Naturally aligned access is fastest. However, also report it is Fast
        // if memory is aligned less than DWORD. A narrow load or store will be
        // be equally slow as a single ds_read_b96/ds_write_b96, but there will
        // be more of them, so overall we will pay less penalty issuing a single
        // instruction.

        // See comment on the values above.
        if (IsFast)
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4))         ? 32
                                                     : 1;
        return true;
      }

      break;
    case 128:
      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
        return false;

      // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
      // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
      // single operation using ds_read2/write2_b64.
      RequiredAlignment = Align(8);

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        // Naturally aligned access is fastest. However, also report it is Fast
        // if memory is aligned less than DWORD. A narrow load or store will be
        // be equally slow as a single ds_read_b128/ds_write_b128, but there
        // will be more of them, so overall we will pay less penalty issuing a
        // single instruction.

        // See comment on the values above.
        if (IsFast)
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4))         ? 32
                                                     : 1;
        return true;
      }

      break;
    default:
      if (Size > 32)
        return false;

      break;
    }

    // See comment on the values above.
    // Note that we have a single-dword or sub-dword here, so if underaligned
    // it is a slowest possible access, hence returned value is 0.
    if (IsFast)
      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();
  }

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    bool AlignedBy4 = Alignment >= Align(4);
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4 ||
           Subtarget->enableFlatScratch() ||
           Subtarget->hasUnalignedScratchAccess();
  }

  // FIXME: We have to be conservative here and assume that flat operations
  // will access scratch.  If we had access to the IR function, then we
  // could determine if any private memory was used in the function.
  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasUnalignedScratchAccess()) {
    bool AlignedBy4 = Alignment >= Align(4);
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4;
  }

  // So long as they are correct, wide global memory operations perform better
  // than multiple smaller memory ops -- even when misaligned
  if (AMDGPU::isExtendedGlobalAddrSpace(AS: AddrSpace)) {
    if (IsFast)
      *IsFast = Size;

    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();
  }

  // Smaller than dword value must be aligned.
  if (Size < 32)
    return false;

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.
  if (IsFast)
    *IsFast = 1;

  return Size >= 32 && Alignment >= Align(4);
}
1836 | |
1837 | bool SITargetLowering::allowsMisalignedMemoryAccesses( |
1838 | EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, |
1839 | unsigned *IsFast) const { |
1840 | return allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace, |
1841 | Alignment, Flags, IsFast); |
1842 | } |
1843 | |
1844 | EVT SITargetLowering::getOptimalMemOpType( |
1845 | const MemOp &Op, const AttributeList &FuncAttributes) const { |
1846 | // FIXME: Should account for address space here. |
1847 | |
1848 | // The default fallback uses the private pointer size as a guess for a type to |
1849 | // use. Make sure we switch these to 64-bit accesses. |
1850 | |
1851 | if (Op.size() >= 16 && |
1852 | Op.isDstAligned(Align(4))) // XXX: Should only do for global |
1853 | return MVT::v4i32; |
1854 | |
1855 | if (Op.size() >= 8 && Op.isDstAligned(Align(4))) |
1856 | return MVT::v2i32; |
1857 | |
1858 | // Use the default. |
1859 | return MVT::Other; |
1860 | } |
1861 | |
1862 | bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { |
1863 | const MemSDNode *MemNode = cast<MemSDNode>(Val: N); |
1864 | return MemNode->getMemOperand()->getFlags() & MONoClobber; |
1865 | } |
1866 | |
1867 | bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) { |
1868 | return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS || |
1869 | AS == AMDGPUAS::PRIVATE_ADDRESS; |
1870 | } |
1871 | |
1872 | bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS, |
1873 | unsigned DestAS) const { |
1874 | // Flat -> private/local is a simple truncate. |
1875 | // Flat -> global is no-op |
1876 | if (SrcAS == AMDGPUAS::FLAT_ADDRESS) |
1877 | return true; |
1878 | |
1879 | const GCNTargetMachine &TM = |
1880 | static_cast<const GCNTargetMachine &>(getTargetMachine()); |
1881 | return TM.isNoopAddrSpaceCast(SrcAS, DestAS); |
1882 | } |
1883 | |
1884 | bool SITargetLowering::isMemOpUniform(const SDNode *N) const { |
1885 | const MemSDNode *MemNode = cast<MemSDNode>(Val: N); |
1886 | |
1887 | return AMDGPUInstrInfo::isUniformMMO(MMO: MemNode->getMemOperand()); |
1888 | } |
1889 | |
1890 | TargetLoweringBase::LegalizeTypeAction |
1891 | SITargetLowering::getPreferredVectorAction(MVT VT) const { |
1892 | if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && |
1893 | VT.getScalarType().bitsLE(MVT::i16)) |
1894 | return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector; |
1895 | return TargetLoweringBase::getPreferredVectorAction(VT); |
1896 | } |
1897 | |
/// Always prefer materializing a constant as an immediate over loading it
/// from a constant pool.
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
  // FIXME: Could be smarter if called for vector constants.
  return true;
}
1903 | |
1904 | bool SITargetLowering::(EVT ResVT, EVT SrcVT, |
1905 | unsigned Index) const { |
1906 | if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT)) |
1907 | return false; |
1908 | |
1909 | // TODO: Add more cases that are cheap. |
1910 | return Index == 0; |
1911 | } |
1912 | |
1913 | bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { |
1914 | if (Subtarget->has16BitInsts() && VT == MVT::i16) { |
1915 | switch (Op) { |
1916 | case ISD::LOAD: |
1917 | case ISD::STORE: |
1918 | |
1919 | // These operations are done with 32-bit instructions anyway. |
1920 | case ISD::AND: |
1921 | case ISD::OR: |
1922 | case ISD::XOR: |
1923 | case ISD::SELECT: |
1924 | // TODO: Extensions? |
1925 | return true; |
1926 | default: |
1927 | return false; |
1928 | } |
1929 | } |
1930 | |
1931 | // SimplifySetCC uses this function to determine whether or not it should |
1932 | // create setcc with i1 operands. We don't have instructions for i1 setcc. |
1933 | if (VT == MVT::i1 && Op == ISD::SETCC) |
1934 | return false; |
1935 | |
1936 | return TargetLowering::isTypeDesirableForOp(Op, VT); |
1937 | } |
1938 | |
/// Build a pointer into the kernarg segment at byte \p Offset: the preloaded
/// kernarg-segment base pointer plus the offset, or just the offset as a
/// constant when no kernarg segment register was allocated.
SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
                                                   const SDLoc &SL,
                                                   SDValue Chain,
                                                   uint64_t Offset) const {
  const DataLayout &DL = DAG.getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *InputPtrReg;
  const TargetRegisterClass *RC;
  LLT ArgTy;
  // Kernarg pointers live in the constant address space.
  MVT PtrVT = getPointerTy(DL, AS: AMDGPUAS::CONSTANT_ADDRESS);

  std::tie(args&: InputPtrReg, args&: RC, args&: ArgTy) =
    Info->getPreloadedValue(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);

  // We may not have the kernarg segment argument if we have no kernel
  // arguments.
  if (!InputPtrReg)
    return DAG.getConstant(Val: Offset, DL: SL, VT: PtrVT);

  // Read the preloaded base pointer from its live-in virtual register.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  SDValue BasePtr = DAG.getCopyFromReg(Chain, dl: SL,
    Reg: MRI.getLiveInVirtReg(PReg: InputPtrReg->getRegister()), VT: PtrVT);

  return DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Offset));
}
1966 | |
1967 | SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, |
1968 | const SDLoc &SL) const { |
1969 | uint64_t Offset = getImplicitParameterOffset(MF: DAG.getMachineFunction(), |
1970 | Param: FIRST_IMPLICIT); |
1971 | return lowerKernArgParameterPtr(DAG, SL, Chain: DAG.getEntryNode(), Offset); |
1972 | } |
1973 | |
1974 | SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG, |
1975 | const SDLoc &SL) const { |
1976 | |
1977 | Function &F = DAG.getMachineFunction().getFunction(); |
1978 | std::optional<uint32_t> KnownSize = |
1979 | AMDGPUMachineFunction::getLDSKernelIdMetadata(F); |
1980 | if (KnownSize.has_value()) |
1981 | return DAG.getConstant(*KnownSize, SL, MVT::i32); |
1982 | return SDValue(); |
1983 | } |
1984 | |
/// Convert a loaded argument value of in-memory type \p MemVT to the expected
/// argument type \p VT, narrowing widened vectors and applying sign/zero
/// extension or FP conversion as needed.
SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Val,
                                         bool Signed,
                                         const ISD::InputArg *Arg) const {
  // First, if it is a widened vector, narrow it.
  if (VT.isVector() &&
      VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
    EVT NarrowedVT =
        EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(),
                         NumElements: VT.getVectorNumElements());
    Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
                      DAG.getConstant(0, SL, MVT::i32));
  }

  // Then convert the vector elements or scalar value.
  // Record the caller-requested extension kind so later truncation keeps the
  // known bits.
  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
      VT.bitsLT(VT: MemVT)) {
    unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
    Val = DAG.getNode(Opcode: Opc, DL: SL, VT: MemVT, N1: Val, N2: DAG.getValueType(VT));
  }

  // FP types use fpext/fpround; integers extend or truncate per signedness.
  if (MemVT.isFloatingPoint())
    Val = getFPExtOrFPRound(DAG, Op: Val, DL: SL, VT);
  else if (Signed)
    Val = DAG.getSExtOrTrunc(Op: Val, DL: SL, VT);
  else
    Val = DAG.getZExtOrTrunc(Op: Val, DL: SL, VT);

  return Val;
}
2015 | |
2016 | SDValue SITargetLowering::lowerKernargMemParameter( |
2017 | SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, |
2018 | uint64_t Offset, Align Alignment, bool Signed, |
2019 | const ISD::InputArg *Arg) const { |
2020 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
2021 | |
2022 | // Try to avoid using an extload by loading earlier than the argument address, |
2023 | // and extracting the relevant bits. The load should hopefully be merged with |
2024 | // the previous argument. |
2025 | if (MemVT.getStoreSize() < 4 && Alignment < 4) { |
2026 | // TODO: Handle align < 4 and size >= 4 (can happen with packed structs). |
2027 | int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4); |
2028 | int64_t OffsetDiff = Offset - AlignDownOffset; |
2029 | |
2030 | EVT IntVT = MemVT.changeTypeToInteger(); |
2031 | |
2032 | // TODO: If we passed in the base kernel offset we could have a better |
2033 | // alignment than 4, but we don't really need it. |
2034 | SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset: AlignDownOffset); |
2035 | SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4), |
2036 | MachineMemOperand::MODereferenceable | |
2037 | MachineMemOperand::MOInvariant); |
2038 | |
2039 | SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32); |
2040 | SDValue = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt); |
2041 | |
2042 | SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: IntVT, Operand: Extract); |
2043 | ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MemVT, Operand: ArgVal); |
2044 | ArgVal = convertArgType(DAG, VT, MemVT, SL, Val: ArgVal, Signed, Arg); |
2045 | |
2046 | |
2047 | return DAG.getMergeValues(Ops: { ArgVal, Load.getValue(R: 1) }, dl: SL); |
2048 | } |
2049 | |
2050 | SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); |
2051 | SDValue Load = DAG.getLoad(VT: MemVT, dl: SL, Chain, Ptr, PtrInfo, Alignment, |
2052 | MMOFlags: MachineMemOperand::MODereferenceable | |
2053 | MachineMemOperand::MOInvariant); |
2054 | |
2055 | SDValue Val = convertArgType(DAG, VT, MemVT, SL, Val: Load, Signed, Arg); |
2056 | return DAG.getMergeValues(Ops: { Val, Load.getValue(R: 1) }, dl: SL); |
2057 | } |
2058 | |
/// Lower an incoming stack-passed argument: byval arguments become a frame
/// index, others become a (possibly extending) load from a fixed stack slot.
SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
                                              const SDLoc &SL, SDValue Chain,
                                              const ISD::InputArg &Arg) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  if (Arg.Flags.isByVal()) {
    // byval: the caller already placed the object on the stack; just hand
    // back its address as a frame index.
    unsigned Size = Arg.Flags.getByValSize();
    int FrameIdx = MFI.CreateFixedObject(Size, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
    return DAG.getFrameIndex(FrameIdx, MVT::i32);
  }

  unsigned ArgOffset = VA.getLocMemOffset();
  unsigned ArgSize = VA.getValVT().getStoreSize();

  int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: true);

  // Create load nodes to retrieve arguments from the stack.
  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
  SDValue ArgValue;

  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
  MVT MemVT = VA.getValVT();

  // Map the calling-convention location info to the matching load extension.
  switch (VA.getLocInfo()) {
  default:
    break;
  case CCValAssign::BCvt:
    MemVT = VA.getLocVT();
    break;
  case CCValAssign::SExt:
    ExtType = ISD::SEXTLOAD;
    break;
  case CCValAssign::ZExt:
    ExtType = ISD::ZEXTLOAD;
    break;
  case CCValAssign::AExt:
    ExtType = ISD::EXTLOAD;
    break;
  }

  ArgValue = DAG.getExtLoad(
      ExtType, dl: SL, VT: VA.getLocVT(), Chain, Ptr: FIN,
      PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI),
      MemVT);
  return ArgValue;
}
2107 | |
/// Return the SDValue for a preloaded special input \p PVID (workgroup ids,
/// kernarg pointer, ...), preferring architected TTMP SGPRs when available.
SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
                                            const SIMachineFunctionInfo &MFI,
                                            EVT VT,
                                            AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;
  LLT Ty;

  CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
  // With architected SGPRs the workgroup ids live in fixed TTMP registers:
  // X in TTMP9, Y/Z packed into the low/high halves of TTMP7.
  const ArgDescriptor WorkGroupIDX =
      ArgDescriptor::createRegister(AMDGPU::TTMP9);
  // If GridZ is not programmed in an entry function then the hardware will set
  // it to all zeros, so there is no need to mask the GridY value in the low
  // order bits.
  const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
      AMDGPU::TTMP7,
      AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
  const ArgDescriptor WorkGroupIDZ =
      ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
  if (Subtarget->hasArchitectedSGPRs() &&
      (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
    switch (PVID) {
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
      Reg = &WorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
      Reg = &WorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
      Reg = &WorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    default:
      break;
    }
  }

  // Fall back to the regular preloaded-argument bookkeeping.
  if (!Reg)
    std::tie(args&: Reg, args&: RC, args&: Ty) = MFI.getPreloadedValue(Value: PVID);
  if (!Reg) {
    if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
      // It's possible for a kernarg intrinsic call to appear in a kernel with
      // no allocated segment, in which case we do not add the user sgpr
      // argument, so just return null.
      return DAG.getConstant(Val: 0, DL: SDLoc(), VT);
    }

    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    return DAG.getUNDEF(VT);
  }

  return loadInputValue(DAG, RC, VT, SL: SDLoc(DAG.getEntryNode()), Arg: *Reg);
}
2167 | |
/// Walk the incoming PS arguments, marking which PS input slots are
/// allocated/enabled and collecting into \p Splits the arguments that are
/// actually used (unused PS inputs are recorded in \p Skipped and dropped).
static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
                               CallingConv::ID CallConv,
                               ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
                               FunctionType *FType,
                               SIMachineFunctionInfo *Info) {
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
    const ISD::InputArg *Arg = &Ins[I];

    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
           "vector type argument should have been split" );

    // First check if it's a PS input addr.
    if (CallConv == CallingConv::AMDGPU_PS &&
        !Arg->Flags.isInReg() && PSInputNum <= 15) {
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(Index: PSInputNum);

      // Inconveniently only the first part of the split is marked as isSplit,
      // so skip to the end. We only want to increment PSInputNum once for the
      // entire split argument.
      if (Arg->Flags.isSplit()) {
        while (!Arg->Flags.isSplitEnd()) {
          assert((!Arg->VT.isVector() ||
                  Arg->VT.getScalarSizeInBits() == 16) &&
                 "unexpected vector split in ps argument type" );
          if (!SkipArg)
            Splits.push_back(Elt: *Arg);
          Arg = &Ins[++I];
        }
      }

      if (SkipArg) {
        // We can safely skip PS inputs.
        Skipped.set(Arg->getOrigArgIndex());
        ++PSInputNum;
        continue;
      }

      Info->markPSInputAllocated(Index: PSInputNum);
      if (Arg->Used)
        Info->markPSInputEnabled(Index: PSInputNum);

      ++PSInputNum;
    }

    Splits.push_back(Elt: *Arg);
  }
}
2215 | |
2216 | // Allocate special inputs passed in VGPRs. |
// Allocate special inputs passed in VGPRs.
void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
                                                      MachineFunction &MF,
                                                      const SIRegisterInfo &TRI,
                                                      SIMachineFunctionInfo &Info) const {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (Info.hasWorkItemIDX()) {
    // Workitem id X always arrives in VGPR0. With packed TIDs and a Y id
    // also present, only the low 10 bits belong to X.
    Register Reg = AMDGPU::VGPR0;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

    CCInfo.AllocateReg(Reg);
    unsigned Mask = (Subtarget->hasPackedTID() &&
                     Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
  }

  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      // Packed layout: Y occupies bits 10-19 of VGPR0.
      Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
                                                        0x3ff << 10));
    } else {
      // Unpacked layout: Y gets its own register, VGPR1.
      unsigned Reg = AMDGPU::VGPR1;
      MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

      CCInfo.AllocateReg(Reg);
      Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
    }
  }

  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      // Packed layout: Z occupies bits 20-29 of VGPR0.
      Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
                                                        0x3ff << 20));
    } else {
      // Unpacked layout: Z gets its own register, VGPR2.
      unsigned Reg = AMDGPU::VGPR2;
      MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

      CCInfo.AllocateReg(Reg);
      Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
    }
  }
}
2262 | |
// Try to allocate a VGPR at the end of the argument list, or if no argument
// VGPRs are left allocating a stack slot.
// If \p Mask is given it indicates bitfield position in the register.
// If \p Arg is given use it with new \p Mask instead of allocating a new one.
2267 | static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u, |
2268 | ArgDescriptor Arg = ArgDescriptor()) { |
2269 | if (Arg.isSet()) |
2270 | return ArgDescriptor::createArg(Arg, Mask); |
2271 | |
2272 | ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32); |
2273 | unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgVGPRs); |
2274 | if (RegIdx == ArgVGPRs.size()) { |
2275 | // Spill to stack required. |
2276 | int64_t Offset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4)); |
2277 | |
2278 | return ArgDescriptor::createStack(Offset, Mask); |
2279 | } |
2280 | |
2281 | unsigned Reg = ArgVGPRs[RegIdx]; |
2282 | Reg = CCInfo.AllocateReg(Reg); |
2283 | assert(Reg != AMDGPU::NoRegister); |
2284 | |
2285 | MachineFunction &MF = CCInfo.getMachineFunction(); |
2286 | Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); |
2287 | MF.getRegInfo().setType(VReg: LiveInVReg, Ty: LLT::scalar(SizeInBits: 32)); |
2288 | return ArgDescriptor::createRegister(Reg, Mask); |
2289 | } |
2290 | |
2291 | static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, |
2292 | const TargetRegisterClass *RC, |
2293 | unsigned NumArgRegs) { |
2294 | ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32); |
2295 | unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgSGPRs); |
2296 | if (RegIdx == ArgSGPRs.size()) |
2297 | report_fatal_error(reason: "ran out of SGPRs for arguments" ); |
2298 | |
2299 | unsigned Reg = ArgSGPRs[RegIdx]; |
2300 | Reg = CCInfo.AllocateReg(Reg); |
2301 | assert(Reg != AMDGPU::NoRegister); |
2302 | |
2303 | MachineFunction &MF = CCInfo.getMachineFunction(); |
2304 | MF.addLiveIn(PReg: Reg, RC); |
2305 | return ArgDescriptor::createRegister(Reg); |
2306 | } |
2307 | |
2308 | // If this has a fixed position, we still should allocate the register in the |
2309 | // CCInfo state. Technically we could get away with this for values passed |
2310 | // outside of the normal argument range. |
2311 | static void allocateFixedSGPRInputImpl(CCState &CCInfo, |
2312 | const TargetRegisterClass *RC, |
2313 | MCRegister Reg) { |
2314 | Reg = CCInfo.AllocateReg(Reg); |
2315 | assert(Reg != AMDGPU::NoRegister); |
2316 | MachineFunction &MF = CCInfo.getMachineFunction(); |
2317 | MF.addLiveIn(PReg: Reg, RC); |
2318 | } |
2319 | |
2320 | static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) { |
2321 | if (Arg) { |
2322 | allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, |
2323 | Arg.getRegister()); |
2324 | } else |
2325 | Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); |
2326 | } |
2327 | |
2328 | static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) { |
2329 | if (Arg) { |
2330 | allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, |
2331 | Arg.getRegister()); |
2332 | } else |
2333 | Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); |
2334 | } |
2335 | |
2336 | /// Allocate implicit function VGPR arguments at the end of allocated user |
2337 | /// arguments. |
2338 | void SITargetLowering::allocateSpecialInputVGPRs( |
2339 | CCState &CCInfo, MachineFunction &MF, |
2340 | const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { |
2341 | const unsigned Mask = 0x3ff; |
2342 | ArgDescriptor Arg; |
2343 | |
2344 | if (Info.hasWorkItemIDX()) { |
2345 | Arg = allocateVGPR32Input(CCInfo, Mask); |
2346 | Info.setWorkItemIDX(Arg); |
2347 | } |
2348 | |
2349 | if (Info.hasWorkItemIDY()) { |
2350 | Arg = allocateVGPR32Input(CCInfo, Mask: Mask << 10, Arg); |
2351 | Info.setWorkItemIDY(Arg); |
2352 | } |
2353 | |
2354 | if (Info.hasWorkItemIDZ()) |
2355 | Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask: Mask << 20, Arg)); |
2356 | } |
2357 | |
2358 | /// Allocate implicit function VGPR arguments in fixed registers. |
2359 | void SITargetLowering::allocateSpecialInputVGPRsFixed( |
2360 | CCState &CCInfo, MachineFunction &MF, |
2361 | const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { |
2362 | Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31); |
2363 | if (!Reg) |
2364 | report_fatal_error(reason: "failed to allocated VGPR for implicit arguments" ); |
2365 | |
2366 | const unsigned Mask = 0x3ff; |
2367 | Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); |
2368 | Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask: Mask << 10)); |
2369 | Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask: Mask << 20)); |
2370 | } |
2371 | |
// Allocate SGPRs for the special implicit inputs (dispatch/queue/implicit-arg
// pointers, dispatch id, workgroup IDs, LDS kernel id) of a callable
// (non-entry) function. Allocation is stateful: unless a descriptor is
// already fixed, each helper call hands out the next free argument SGPR(s),
// so the order of the checks below determines register placement.
void SITargetLowering::allocateSpecialInputSGPRs(
    CCState &CCInfo,
    MachineFunction &MF,
    const SIRegisterInfo &TRI,
    SIMachineFunctionInfo &Info) const {
  auto &ArgInfo = Info.getArgInfo();
  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();

  // TODO: Unify handling with private memory pointers.
  if (UserSGPRInfo.hasDispatchPtr())
    allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchPtr);

  // The queue pointer is only an explicit input for code object versions
  // before v5.
  const Module *M = MF.getFunction().getParent();
  if (UserSGPRInfo.hasQueuePtr() &&
      AMDGPU::getAMDHSACodeObjectVersion(M: *M) < AMDGPU::AMDHSA_COV5)
    allocateSGPR64Input(CCInfo, Arg&: ArgInfo.QueuePtr);

  // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
  // constant offset from the kernarg segment.
  if (Info.hasImplicitArgPtr())
    allocateSGPR64Input(CCInfo, Arg&: ArgInfo.ImplicitArgPtr);

  if (UserSGPRInfo.hasDispatchID())
    allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchID);

  // flat_scratch_init is not applicable for non-kernel functions.

  if (Info.hasWorkGroupIDX())
    allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDX);

  if (Info.hasWorkGroupIDY())
    allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDY);

  if (Info.hasWorkGroupIDZ())
    allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDZ);

  if (Info.hasLDSKernelId())
    allocateSGPR32Input(CCInfo, Arg&: ArgInfo.LDSKernelId);
}
2411 | |
2412 | // Allocate special inputs passed in user SGPRs. |
void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
                                            MachineFunction &MF,
                                            const SIRegisterInfo &TRI,
                                            SIMachineFunctionInfo &Info) const {
  // Each Info.add* call below reserves the next user SGPR(s); the textual
  // order of these checks therefore fixes the user-SGPR layout.
  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
  if (UserSGPRInfo.hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg);
  }

  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg);
  }

  if (UserSGPRInfo.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: DispatchPtrReg);
  }

  // The queue pointer is only an explicit user SGPR for code object versions
  // before v5.
  const Module *M = MF.getFunction().getParent();
  if (UserSGPRInfo.hasQueuePtr() &&
      AMDGPU::getAMDHSACodeObjectVersion(M: *M) < AMDGPU::AMDHSA_COV5) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: QueuePtrReg);
  }

  if (UserSGPRInfo.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    CCInfo.AllocateReg(Reg: InputPtrReg);

    // Give the live-in vreg a constant-address-space pointer type so
    // GlobalISel sees the correct LLT for the kernarg segment pointer.
    Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
    MRI.setType(VReg, Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
  }

  if (UserSGPRInfo.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: DispatchIDReg);
  }

  // flat_scratch_init is skipped on amdpal — presumably handled by the PAL
  // ABI/runtime instead; TODO confirm.
  if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}
2469 | |
// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
// sequential starting from the first argument.
void SITargetLowering::allocatePreloadKernArgSGPRs(
    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
    const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
    const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
  Function &F = MF.getFunction();
  // Byte offset just past the previous preloaded argument; starts at the
  // base explicit kernarg offset and is used to compute alignment padding.
  unsigned LastExplicitArgOffset =
      MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
  GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
  // Preloading stops at the first argument that cannot be preloaded; nothing
  // after it is considered (the sequence must be a prefix of the arg list).
  bool InPreloadSequence = true;
  unsigned InIdx = 0;  // Index into Ins/ArgLocs (one entry per split part).
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())
      break;

    int ArgIdx = Arg.getArgNo();
    // Don't preload non-original args or parts not in the current preload
    // sequence.
    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
      break;

    // Walk every split part of this IR argument.
    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
         InIdx++) {
      // NOTE(review): the assert indexes ArgLocs with ArgIdx while the body
      // uses InIdx — these differ once any earlier argument was split into
      // multiple parts; verify the intended index.
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      const Align KernelArgBaseAlign = Align(16);
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset: ArgOffset);
      unsigned NumAllocSGPRs =
          alignTo(Value: ArgLoc.getLocVT().getFixedSizeInBits(), Align: 32) / 32;

      // Arg is preloaded into the previous SGPR.
      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Elt: Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
        continue;
      }

      // Alignment gaps in the kernarg segment consume (and waste) whole
      // padding SGPRs.
      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Value: Padding, Align: 4) / 4;
      // Check for free user SGPRs for preloading.
      if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
          SGPRInfo.getNumFreeUserSGPRs()) {
        InPreloadSequence = false;
        break;
      }

      // Preload this argument.
      const TargetRegisterClass *RC =
          TRI.getSGPRClassForBitWidth(BitWidth: NumAllocSGPRs * 32);
      SmallVectorImpl<MCRegister> *PreloadRegs =
          Info.addPreloadedKernArg(TRI, RC, AllocSizeDWord: NumAllocSGPRs, KernArgIdx: InIdx, PaddingSGPRs);

      // If the argument could not be placed in a single aligned tuple it was
      // split across individual 32-bit SGPRs.
      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {
        assert(Reg);
        MF.addLiveIn(Reg, RC);
        CCInfo.AllocateReg(Reg);
      }

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
    }
  }
}
2538 | |
2539 | void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, |
2540 | const SIRegisterInfo &TRI, |
2541 | SIMachineFunctionInfo &Info) const { |
2542 | // Always allocate this last since it is a synthetic preload. |
2543 | if (Info.hasLDSKernelId()) { |
2544 | Register Reg = Info.addLDSKernelId(); |
2545 | MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); |
2546 | CCInfo.AllocateReg(Reg); |
2547 | } |
2548 | } |
2549 | |
2550 | // Allocate special input registers that are initialized per-wave. |
// Allocate special input registers that are initialized per-wave: workgroup
// IDs/info and the private segment wave byte offset. Allocation order here
// fixes the system-SGPR layout.
void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
                                           MachineFunction &MF,
                                           SIMachineFunctionInfo &Info,
                                           CallingConv::ID CallConv,
                                           bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
    // Note: user SGPRs are handled by the front-end for graphics shaders
    // Pad up the used user SGPRs with dead inputs.

    // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
    // before enabling architected SGPRs for workgroup IDs.
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget" );

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
    // rely on it to reach 16 since if we end up having no stack usage, it will
    // not really be added.
    unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
                                      Info.hasWorkGroupIDY() +
                                      Info.hasWorkGroupIDZ() +
                                      Info.hasWorkGroupInfo();
    // Reserve dead user SGPRs until user + required system SGPRs reach 16.
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);
    }
  }

  // Workgroup IDs only need system SGPRs when the hardware does not provide
  // them in architected SGPRs.
  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);
    }

    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);
    }

    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);
    }
  }

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (IsShader) {
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      // This is true if the scratch wave byte offset doesn't have a fixed
      // location.
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      }
    } else
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(Reg: PrivateSegmentWaveByteOffsetReg);
  }

  // The padding loop above must have brought the preload count to at least 16
  // on subtargets with the user-SGPR init bug.
  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
}
2630 | |
// Decide which physical registers hold the scratch resource descriptor, the
// stack pointer and the frame pointer for an entry function, and record them
// in \p Info. Expects argument live-ins to already be registered (it queries
// MRI.isLiveIn below).
static void reservePrivateMemoryRegs(const TargetMachine &TM,
                                     MachineFunction &MF,
                                     const SIRegisterInfo &TRI,
                                     SIMachineFunctionInfo &Info) {
  // Now that we've figured out where the scratch register inputs are, see if
  // should reserve the arguments and use them directly.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool HasStackObjects = MFI.hasStackObjects();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

  // Everything live out of a block is spilled with fast regalloc, so it's
  // almost certain that spilling will be required.
  if (TM.getOptLevel() == CodeGenOptLevel::None)
    HasStackObjects = true;

  // For now assume stack access is needed in any callee functions, so we need
  // the scratch registers to pass in.
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  // With flat scratch there is no buffer resource to set up at all.
  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      // If we have stack objects, we unquestionably need the private buffer
      // resource. For the Code Object V2 ABI, this will be the first 4 user
      // SGPR inputs. We can reserve those and use them directly.

      Register PrivateSegmentBufferReg =
          Info.getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
    } else {
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      // We tentatively reserve the last registers (skipping the last registers
      // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
      // we'll replace these with the ones immediately after those which were
      // really allocated. In the prologue copies will be inserted from the
      // argument to these reserved registers.

      // Without HSA, relocations are used for the scratch pointer and the
      // buffer resource setup is always inserted in the prologue. Scratch wave
      // offset is still in an input SGPR.
      Info.setScratchRSrcReg(ReservedBufferReg);
    }
  }

  MachineRegisterInfo &MRI = MF.getRegInfo();

  // For entry functions we have to set up the stack pointer if we use it,
  // whereas non-entry functions get this "for free". This means there is no
  // intrinsic advantage to using S32 over S34 in cases where we do not have
  // calls but do need a frame pointer (i.e. if we are requested to have one
  // because frame pointer elimination is disabled). To keep things simple we
  // only ever use S32 as the call ABI stack pointer, and so using it does not
  // imply we need a separate frame pointer.
  //
  // Try to use s32 as the SP, but move it if it would interfere with input
  // arguments. This won't work with calls though.
  //
  // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
  // registers.
  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
  } else {
    assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));

    if (MFI.hasCalls())
      report_fatal_error(reason: "call in graphics shader with too many input SGPRs" );

    // Fall back to the first SGPR that is not already an input argument.
    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
      if (!MRI.isLiveIn(Reg)) {
        Info.setStackPtrOffsetReg(Reg);
        break;
      }
    }

    if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
      report_fatal_error(reason: "failed to find register for SP" );
  }

  // hasFP should be accurate for entry functions even before the frame is
  // finalized, because it does not rely on the known stack size, only
  // properties like whether variable sized objects are present.
  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
  }
}
2720 | |
2721 | bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const { |
2722 | const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); |
2723 | return !Info->isEntryFunction(); |
2724 | } |
2725 | |
void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Intentionally a no-op: the split-CSR copies are created in
  // insertCopiesSplitCSR().
}
2729 | |
2730 | void SITargetLowering::insertCopiesSplitCSR( |
2731 | MachineBasicBlock *Entry, |
2732 | const SmallVectorImpl<MachineBasicBlock *> &Exits) const { |
2733 | const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); |
2734 | |
2735 | const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent()); |
2736 | if (!IStart) |
2737 | return; |
2738 | |
2739 | const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
2740 | MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); |
2741 | MachineBasicBlock::iterator MBBI = Entry->begin(); |
2742 | for (const MCPhysReg *I = IStart; *I; ++I) { |
2743 | const TargetRegisterClass *RC = nullptr; |
2744 | if (AMDGPU::SReg_64RegClass.contains(*I)) |
2745 | RC = &AMDGPU::SGPR_64RegClass; |
2746 | else if (AMDGPU::SReg_32RegClass.contains(*I)) |
2747 | RC = &AMDGPU::SGPR_32RegClass; |
2748 | else |
2749 | llvm_unreachable("Unexpected register class in CSRsViaCopy!" ); |
2750 | |
2751 | Register NewVR = MRI->createVirtualRegister(RegClass: RC); |
2752 | // Create copy from CSR to a virtual register. |
2753 | Entry->addLiveIn(PhysReg: *I); |
2754 | BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR) |
2755 | .addReg(RegNo: *I); |
2756 | |
2757 | // Insert the copy-back instructions right before the terminator. |
2758 | for (auto *Exit : Exits) |
2759 | BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(), |
2760 | MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I) |
2761 | .addReg(RegNo: NewVR); |
2762 | } |
2763 | } |
2764 | |
2765 | SDValue SITargetLowering::LowerFormalArguments( |
2766 | SDValue Chain, CallingConv::ID CallConv, bool isVarArg, |
2767 | const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, |
2768 | SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { |
2769 | const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); |
2770 | |
2771 | MachineFunction &MF = DAG.getMachineFunction(); |
2772 | const Function &Fn = MF.getFunction(); |
2773 | FunctionType *FType = MF.getFunction().getFunctionType(); |
2774 | SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
2775 | |
2776 | if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CC: CallConv)) { |
2777 | DiagnosticInfoUnsupported NoGraphicsHSA( |
2778 | Fn, "unsupported non-compute shaders with HSA" , DL.getDebugLoc()); |
2779 | DAG.getContext()->diagnose(DI: NoGraphicsHSA); |
2780 | return DAG.getEntryNode(); |
2781 | } |
2782 | |
2783 | SmallVector<ISD::InputArg, 16> Splits; |
2784 | SmallVector<CCValAssign, 16> ArgLocs; |
2785 | BitVector Skipped(Ins.size()); |
2786 | CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, |
2787 | *DAG.getContext()); |
2788 | |
2789 | bool IsGraphics = AMDGPU::isGraphics(CC: CallConv); |
2790 | bool IsKernel = AMDGPU::isKernel(CC: CallConv); |
2791 | bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC: CallConv); |
2792 | |
2793 | if (IsGraphics) { |
2794 | const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo(); |
2795 | assert(!UserSGPRInfo.hasDispatchPtr() && |
2796 | !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() && |
2797 | !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && |
2798 | !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); |
2799 | (void)UserSGPRInfo; |
2800 | if (!Subtarget->enableFlatScratch()) |
2801 | assert(!UserSGPRInfo.hasFlatScratchInit()); |
2802 | if ((CallConv != CallingConv::AMDGPU_CS && |
2803 | CallConv != CallingConv::AMDGPU_Gfx) || |
2804 | !Subtarget->hasArchitectedSGPRs()) |
2805 | assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && |
2806 | !Info->hasWorkGroupIDZ()); |
2807 | } |
2808 | |
2809 | if (CallConv == CallingConv::AMDGPU_PS) { |
2810 | processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info); |
2811 | |
2812 | // At least one interpolation mode must be enabled or else the GPU will |
2813 | // hang. |
2814 | // |
2815 | // Check PSInputAddr instead of PSInputEnable. The idea is that if the user |
2816 | // set PSInputAddr, the user wants to enable some bits after the compilation |
2817 | // based on run-time states. Since we can't know what the final PSInputEna |
2818 | // will look like, so we shouldn't do anything here and the user should take |
2819 | // responsibility for the correct programming. |
2820 | // |
2821 | // Otherwise, the following restrictions apply: |
2822 | // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. |
2823 | // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be |
2824 | // enabled too. |
2825 | if ((Info->getPSInputAddr() & 0x7F) == 0 || |
2826 | ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(Index: 11))) { |
2827 | CCInfo.AllocateReg(AMDGPU::VGPR0); |
2828 | CCInfo.AllocateReg(AMDGPU::VGPR1); |
2829 | Info->markPSInputAllocated(Index: 0); |
2830 | Info->markPSInputEnabled(Index: 0); |
2831 | } |
2832 | if (Subtarget->isAmdPalOS()) { |
2833 | // For isAmdPalOS, the user does not enable some bits after compilation |
2834 | // based on run-time states; the register values being generated here are |
2835 | // the final ones set in hardware. Therefore we need to apply the |
2836 | // workaround to PSInputAddr and PSInputEnable together. (The case where |
2837 | // a bit is set in PSInputAddr but not PSInputEnable is where the |
2838 | // frontend set up an input arg for a particular interpolation mode, but |
2839 | // nothing uses that input arg. Really we should have an earlier pass |
2840 | // that removes such an arg.) |
2841 | unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); |
2842 | if ((PsInputBits & 0x7F) == 0 || |
2843 | ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1))) |
2844 | Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr())); |
2845 | } |
2846 | } else if (IsKernel) { |
2847 | assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); |
2848 | } else { |
2849 | Splits.append(in_start: Ins.begin(), in_end: Ins.end()); |
2850 | } |
2851 | |
2852 | if (IsKernel) |
2853 | analyzeFormalArgumentsCompute(State&: CCInfo, Ins); |
2854 | |
2855 | if (IsEntryFunc) { |
2856 | allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info); |
2857 | allocateHSAUserSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info); |
2858 | if (IsKernel && Subtarget->hasKernargPreload()) |
2859 | allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, TRI: *TRI, Info&: *Info); |
2860 | |
2861 | allocateLDSKernelId(CCInfo, MF, TRI: *TRI, Info&: *Info); |
2862 | } else if (!IsGraphics) { |
2863 | // For the fixed ABI, pass workitem IDs in the last argument register. |
2864 | allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: *TRI, Info&: *Info); |
2865 | |
2866 | // FIXME: Sink this into allocateSpecialInputSGPRs |
2867 | if (!Subtarget->enableFlatScratch()) |
2868 | CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg()); |
2869 | |
2870 | allocateSpecialInputSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info); |
2871 | } |
2872 | |
2873 | if (!IsKernel) { |
2874 | CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: isVarArg); |
2875 | CCInfo.AnalyzeFormalArguments(Ins: Splits, Fn: AssignFn); |
2876 | } |
2877 | |
2878 | SmallVector<SDValue, 16> Chains; |
2879 | |
2880 | // FIXME: This is the minimum kernel argument alignment. We should improve |
2881 | // this to the maximum alignment of the arguments. |
2882 | // |
2883 | // FIXME: Alignment of explicit arguments totally broken with non-0 explicit |
2884 | // kern arg offset. |
2885 | const Align KernelArgBaseAlign = Align(16); |
2886 | |
2887 | for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { |
2888 | const ISD::InputArg &Arg = Ins[i]; |
2889 | if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) { |
2890 | InVals.push_back(Elt: DAG.getUNDEF(VT: Arg.VT)); |
2891 | continue; |
2892 | } |
2893 | |
2894 | CCValAssign &VA = ArgLocs[ArgIdx++]; |
2895 | MVT VT = VA.getLocVT(); |
2896 | |
2897 | if (IsEntryFunc && VA.isMemLoc()) { |
2898 | VT = Ins[i].VT; |
2899 | EVT MemVT = VA.getLocVT(); |
2900 | |
2901 | const uint64_t Offset = VA.getLocMemOffset(); |
2902 | Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset); |
2903 | |
2904 | if (Arg.Flags.isByRef()) { |
2905 | SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain, Offset); |
2906 | |
2907 | const GCNTargetMachine &TM = |
2908 | static_cast<const GCNTargetMachine &>(getTargetMachine()); |
2909 | if (!TM.isNoopAddrSpaceCast(SrcAS: AMDGPUAS::CONSTANT_ADDRESS, |
2910 | DestAS: Arg.Flags.getPointerAddrSpace())) { |
2911 | Ptr = DAG.getAddrSpaceCast(dl: DL, VT, Ptr, SrcAS: AMDGPUAS::CONSTANT_ADDRESS, |
2912 | DestAS: Arg.Flags.getPointerAddrSpace()); |
2913 | } |
2914 | |
2915 | InVals.push_back(Elt: Ptr); |
2916 | continue; |
2917 | } |
2918 | |
2919 | SDValue NewArg; |
2920 | if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(Val: i)) { |
2921 | if (MemVT.getStoreSize() < 4 && Alignment < 4) { |
2922 | // In this case the argument is packed into the previous preload SGPR. |
2923 | int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4); |
2924 | int64_t OffsetDiff = Offset - AlignDownOffset; |
2925 | EVT IntVT = MemVT.changeTypeToInteger(); |
2926 | |
2927 | const SIMachineFunctionInfo *Info = |
2928 | MF.getInfo<SIMachineFunctionInfo>(); |
2929 | MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); |
2930 | Register Reg = |
2931 | Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs[0]; |
2932 | |
2933 | assert(Reg); |
2934 | Register VReg = MRI.getLiveInVirtReg(PReg: Reg); |
2935 | SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); |
2936 | |
2937 | SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32); |
2938 | SDValue = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt); |
2939 | |
2940 | SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Extract); |
2941 | ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MemVT, Operand: ArgVal); |
2942 | NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: ArgVal, |
2943 | Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]); |
2944 | |
2945 | NewArg = DAG.getMergeValues(Ops: {NewArg, Copy.getValue(R: 1)}, dl: DL); |
2946 | } else { |
2947 | const SIMachineFunctionInfo *Info = |
2948 | MF.getInfo<SIMachineFunctionInfo>(); |
2949 | MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); |
2950 | const SmallVectorImpl<MCRegister> &PreloadRegs = |
2951 | Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs; |
2952 | |
2953 | SDValue Copy; |
2954 | if (PreloadRegs.size() == 1) { |
2955 | Register VReg = MRI.getLiveInVirtReg(PReg: PreloadRegs[0]); |
2956 | const TargetRegisterClass *RC = MRI.getRegClass(Reg: VReg); |
2957 | NewArg = DAG.getCopyFromReg( |
2958 | Chain, DL, VReg, |
2959 | EVT::getIntegerVT(Context&: *DAG.getContext(), |
2960 | BitWidth: TRI->getRegSizeInBits(*RC))); |
2961 | |
2962 | } else { |
2963 | // If the kernarg alignment does not match the alignment of the SGPR |
2964 | // tuple RC that can accommodate this argument, it will be built up |
        // via copies from the individual SGPRs that the argument was
2966 | // preloaded to. |
2967 | SmallVector<SDValue, 4> Elts; |
2968 | for (auto Reg : PreloadRegs) { |
2969 | Register VReg = MRI.getLiveInVirtReg(PReg: Reg); |
2970 | Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); |
2971 | Elts.push_back(Elt: Copy); |
2972 | } |
2973 | NewArg = |
2974 | DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32, |
2975 | PreloadRegs.size()), |
2976 | DL, Elts); |
2977 | } |
2978 | |
2979 | SDValue CMemVT; |
2980 | if (VT.isScalarInteger() && VT.bitsLT(VT: NewArg.getSimpleValueType())) |
2981 | CMemVT = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: NewArg); |
2982 | else |
2983 | CMemVT = DAG.getBitcast(VT: MemVT, V: NewArg); |
2984 | NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: CMemVT, |
2985 | Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]); |
2986 | NewArg = DAG.getMergeValues(Ops: {NewArg, Chain}, dl: DL); |
2987 | } |
2988 | } else { |
2989 | NewArg = |
2990 | lowerKernargMemParameter(DAG, VT, MemVT, SL: DL, Chain, Offset, |
2991 | Alignment, Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]); |
2992 | } |
2993 | Chains.push_back(Elt: NewArg.getValue(R: 1)); |
2994 | |
2995 | auto *ParamTy = |
2996 | dyn_cast<PointerType>(Val: FType->getParamType(i: Ins[i].getOrigArgIndex())); |
2997 | if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && |
2998 | ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || |
2999 | ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) { |
3000 | // On SI local pointers are just offsets into LDS, so they are always |
3001 | // less than 16-bits. On CI and newer they could potentially be |
3002 | // real pointers, so we can't guarantee their size. |
3003 | NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg, |
3004 | DAG.getValueType(MVT::i16)); |
3005 | } |
3006 | |
3007 | InVals.push_back(Elt: NewArg); |
3008 | continue; |
3009 | } else if (!IsEntryFunc && VA.isMemLoc()) { |
3010 | SDValue Val = lowerStackParameter(DAG, VA, SL: DL, Chain, Arg); |
3011 | InVals.push_back(Elt: Val); |
3012 | if (!Arg.Flags.isByVal()) |
3013 | Chains.push_back(Elt: Val.getValue(R: 1)); |
3014 | continue; |
3015 | } |
3016 | |
3017 | assert(VA.isRegLoc() && "Parameter must be in a register!" ); |
3018 | |
3019 | Register Reg = VA.getLocReg(); |
3020 | const TargetRegisterClass *RC = nullptr; |
3021 | if (AMDGPU::VGPR_32RegClass.contains(Reg)) |
3022 | RC = &AMDGPU::VGPR_32RegClass; |
3023 | else if (AMDGPU::SGPR_32RegClass.contains(Reg)) |
3024 | RC = &AMDGPU::SGPR_32RegClass; |
3025 | else |
3026 | llvm_unreachable("Unexpected register class in LowerFormalArguments!" ); |
3027 | EVT ValVT = VA.getValVT(); |
3028 | |
3029 | Reg = MF.addLiveIn(PReg: Reg, RC); |
3030 | SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT); |
3031 | |
3032 | if (Arg.Flags.isSRet()) { |
3033 | // The return object should be reasonably addressable. |
3034 | |
3035 | // FIXME: This helps when the return is a real sret. If it is a |
3036 | // automatically inserted sret (i.e. CanLowerReturn returns false), an |
3037 | // extra copy is inserted in SelectionDAGBuilder which obscures this. |
3038 | unsigned NumBits |
3039 | = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex(); |
3040 | Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT, N1: Val, |
3041 | N2: DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: NumBits))); |
3042 | } |
3043 | |
3044 | // If this is an 8 or 16-bit value, it is really passed promoted |
3045 | // to 32 bits. Insert an assert[sz]ext to capture this, then |
3046 | // truncate to the right size. |
3047 | switch (VA.getLocInfo()) { |
3048 | case CCValAssign::Full: |
3049 | break; |
3050 | case CCValAssign::BCvt: |
3051 | Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ValVT, Operand: Val); |
3052 | break; |
3053 | case CCValAssign::SExt: |
3054 | Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT, N1: Val, |
3055 | N2: DAG.getValueType(ValVT)); |
3056 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val); |
3057 | break; |
3058 | case CCValAssign::ZExt: |
3059 | Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT, N1: Val, |
3060 | N2: DAG.getValueType(ValVT)); |
3061 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val); |
3062 | break; |
3063 | case CCValAssign::AExt: |
3064 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val); |
3065 | break; |
3066 | default: |
3067 | llvm_unreachable("Unknown loc info!" ); |
3068 | } |
3069 | |
3070 | InVals.push_back(Elt: Val); |
3071 | } |
3072 | |
3073 | // Start adding system SGPRs. |
3074 | if (IsEntryFunc) |
3075 | allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv, IsShader: IsGraphics); |
3076 | |
3077 | auto &ArgUsageInfo = |
3078 | DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); |
3079 | ArgUsageInfo.setFuncArgInfo(F: Fn, ArgInfo: Info->getArgInfo()); |
3080 | |
3081 | unsigned StackArgSize = CCInfo.getStackSize(); |
3082 | Info->setBytesInStackArgArea(StackArgSize); |
3083 | |
3084 | return Chains.empty() ? Chain : |
3085 | DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); |
3086 | } |
3087 | |
3088 | // TODO: If return values can't fit in registers, we should return as many as |
3089 | // possible in registers before passing on stack. |
3090 | bool SITargetLowering::CanLowerReturn( |
3091 | CallingConv::ID CallConv, |
3092 | MachineFunction &MF, bool IsVarArg, |
3093 | const SmallVectorImpl<ISD::OutputArg> &Outs, |
3094 | LLVMContext &Context) const { |
3095 | // Replacing returns with sret/stack usage doesn't make sense for shaders. |
3096 | // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn |
3097 | // for shaders. Vector types should be explicitly handled by CC. |
3098 | if (AMDGPU::isEntryFunctionCC(CC: CallConv)) |
3099 | return true; |
3100 | |
3101 | SmallVector<CCValAssign, 16> RVLocs; |
3102 | CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); |
3103 | if (!CCInfo.CheckReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg))) |
3104 | return false; |
3105 | |
3106 | // We must use the stack if return would require unavailable registers. |
3107 | unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF); |
3108 | unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); |
3109 | for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) |
3110 | if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i))) |
3111 | return false; |
3112 | |
3113 | return true; |
3114 | } |
3115 | |
/// Lower an outgoing return for SI.
///
/// Kernels are delegated to the generic AMDGPU lowering. For shaders and
/// callable functions, each return value is copied (glued) into the register
/// assigned by the return calling convention, and the function terminates
/// with ENDPGM (void shader / wave end), RETURN_TO_EPILOG (shader with
/// outputs) or RET_GLUE (ordinary callable function).
SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // Kernel returns are handled by the base class lowering.
  if (AMDGPU::isKernel(CC: CallConv)) {
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);
  }

  bool IsShader = AMDGPU::isShader(CC: CallConv);

  Info->setIfReturnsVoid(Outs.empty());
  // A shader that returns nothing can simply end the wave.
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;
  SmallVector<ISD::OutputArg, 48> Splits;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg: isVarArg));

  SDValue Glue;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Elt: Chain); // Operand #0 = Chain (updated below)

  // Copy the result values into the output registers.
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");
    // TODO: Partially return in registers if return values don't fit.
    SDValue Arg = OutVals[RealRVLocIdx];

    // Apply the promotion the calling convention demands for this location.
    // Copied from other backends.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    // Glue the copies together so they are emitted back-to-back right before
    // the return node.
    Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: VA.getLocReg(), N: Arg, Glue);
    Glue = Chain.getValue(R: 1);
    RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
  }

  // FIXME: Does sret work properly?
  if (!Info->isEntryFunction()) {
    // Add callee-saved registers returned via copies as implicit uses of the
    // return so they are kept live across the function body.
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    const MCPhysReg *I =
        TRI->getCalleeSavedRegsViaCopy(MF: &DAG.getMachineFunction());
    if (I) {
      for (; *I; ++I) {
        if (AMDGPU::SReg_64RegClass.contains(*I))
          RetOps.push_back(DAG.getRegister(*I, MVT::i64));
        else if (AMDGPU::SReg_32RegClass.contains(*I))
          RetOps.push_back(DAG.getRegister(*I, MVT::i32));
        else
          llvm_unreachable("Unexpected register class in CSRsViaCopy!");
      }
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Glue.getNode())
    RetOps.push_back(Elt: Glue);

  // Select the terminator pseudo (see function comment for the three cases).
  unsigned Opc = AMDGPUISD::ENDPGM;
  if (!IsWaveEnd)
    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
3210 | |
3211 | SDValue SITargetLowering::LowerCallResult( |
3212 | SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg, |
3213 | const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, |
3214 | SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn, |
3215 | SDValue ThisVal) const { |
3216 | CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv, IsVarArg); |
3217 | |
3218 | // Assign locations to each value returned by this call. |
3219 | SmallVector<CCValAssign, 16> RVLocs; |
3220 | CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, |
3221 | *DAG.getContext()); |
3222 | CCInfo.AnalyzeCallResult(Ins, Fn: RetCC); |
3223 | |
3224 | // Copy all of the result registers out of their specified physreg. |
3225 | for (unsigned i = 0; i != RVLocs.size(); ++i) { |
3226 | CCValAssign VA = RVLocs[i]; |
3227 | SDValue Val; |
3228 | |
3229 | if (VA.isRegLoc()) { |
3230 | Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue); |
3231 | Chain = Val.getValue(R: 1); |
3232 | InGlue = Val.getValue(R: 2); |
3233 | } else if (VA.isMemLoc()) { |
3234 | report_fatal_error(reason: "TODO: return values in memory" ); |
3235 | } else |
3236 | llvm_unreachable("unknown argument location type" ); |
3237 | |
3238 | switch (VA.getLocInfo()) { |
3239 | case CCValAssign::Full: |
3240 | break; |
3241 | case CCValAssign::BCvt: |
3242 | Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val); |
3243 | break; |
3244 | case CCValAssign::ZExt: |
3245 | Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: VA.getLocVT(), N1: Val, |
3246 | N2: DAG.getValueType(VA.getValVT())); |
3247 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val); |
3248 | break; |
3249 | case CCValAssign::SExt: |
3250 | Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT: VA.getLocVT(), N1: Val, |
3251 | N2: DAG.getValueType(VA.getValVT())); |
3252 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val); |
3253 | break; |
3254 | case CCValAssign::AExt: |
3255 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val); |
3256 | break; |
3257 | default: |
3258 | llvm_unreachable("Unknown loc info!" ); |
3259 | } |
3260 | |
3261 | InVals.push_back(Elt: Val); |
3262 | } |
3263 | |
3264 | return Chain; |
3265 | } |
3266 | |
// Add code to pass special inputs required depending on used features separate
// from the explicit user arguments present in the IR.
//
// For each ABI "special" input (dispatch pointer, queue pointer, workgroup
// IDs, workitem IDs, ...) that the callee may need, this either forwards the
// caller's incoming value, synthesizes one, or allocates the register/stack
// slot the fixed ABI reserves for it.
void SITargetLowering::passSpecialInputs(
    CallLoweringInfo &CLI,
    CCState &CCInfo,
    const SIMachineFunctionInfo &Info,
    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
    SmallVectorImpl<SDValue> &MemOpChains,
    SDValue Chain) const {
  // If we don't have a call site, this was a call inserted by
  // legalization. These can never use special inputs.
  if (!CLI.CB)
    return;

  SelectionDAG &DAG = CLI.DAG;
  const SDLoc &DL = CLI.DL;
  const Function &F = DAG.getMachineFunction().getFunction();

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();

  // Without usage info for a known callee, conservatively assume the
  // fixed-ABI layout for every special input.
  const AMDGPUFunctionArgInfo *CalleeArgInfo
    = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
  if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
    auto &ArgUsageInfo =
      DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(F: *CalleeFunc);
  }

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  // Each implicit input is paired with the call-site attribute that, when
  // present, proves the callee does not need that input.
  static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
                             StringLiteral> ImplicitAttrs[] = {
    {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
    {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr"},
    {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
    {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
    {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
    {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
    {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
    {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
  };

  for (auto Attr : ImplicitAttrs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;
    LLT ArgTy;

    AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;

    // If the callee does not use the attribute value, skip copying the value.
    if (CLI.CB->hasFnAttr(Kind: Attr.second))
      continue;

    std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
      CalleeArgInfo->getPreloadedValue(Value: InputID);
    if (!OutgoingArg)
      continue;

    // Where (if anywhere) the caller received the same input.
    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    LLT Ty;
    std::tie(args&: IncomingArg, args&: IncomingArgRC, args&: Ty) =
      CallerArgInfo.getPreloadedValue(Value: InputID);
    assert(IncomingArgRC == ArgRC);

    // All special arguments are ints for now.
    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
    SDValue InputReg;

    if (IncomingArg) {
      InputReg = loadInputValue(DAG, RC: ArgRC, VT: ArgVT, SL: DL, Arg: *IncomingArg);
    } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
      // The implicit arg ptr is special because it doesn't have a corresponding
      // input for kernels, and is computed from the kernarg segment pointer.
      InputReg = getImplicitArgPtr(DAG, SL: DL);
    } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
      // The LDS kernel id can be materialized from module metadata when no
      // incoming value exists.
      std::optional<uint32_t> Id =
        AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
      if (Id.has_value()) {
        InputReg = DAG.getConstant(Val: *Id, DL, VT: ArgVT);
      } else {
        InputReg = DAG.getUNDEF(VT: ArgVT);
      }
    } else {
      // We may have proven the input wasn't needed, although the ABI is
      // requiring it. We just need to allocate the register appropriately.
      InputReg = DAG.getUNDEF(VT: ArgVT);
    }

    if (OutgoingArg->isRegister()) {
      // Reserve the ABI register so user arguments cannot be assigned to it.
      RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
      if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
        report_fatal_error(reason: "failed to allocate implicit input argument");
    } else {
      // Stack-passed special input: allocate its slot and record the store.
      unsigned SpecialArgOffset =
        CCInfo.AllocateStack(Size: ArgVT.getStoreSize(), Alignment: Align(4));
      SDValue ArgStore = storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg,
                                              Offset: SpecialArgOffset);
      MemOpChains.push_back(Elt: ArgStore);
    }
  }

  // Pack workitem IDs into a single register or pass it as is if already
  // packed.
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;
  LLT Ty;

  // Find whichever workitem-ID component the callee expects (X, then Y,
  // then Z); all components travel packed in one outgoing register.
  std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
    CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
      CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
      CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return;

  const ArgDescriptor *IncomingArgX = std::get<0>(
    t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X));
  const ArgDescriptor *IncomingArgY = std::get<0>(
    t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
  const ArgDescriptor *IncomingArgZ = std::get<0>(
    t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z));

  SDValue InputReg;
  SDLoc SL;

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z");

  // If incoming ids are not packed we need to pack them.
  // X occupies bits [9:0].
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
      NeedWorkItemIDX) {
    if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
      InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
    } else {
      InputReg = DAG.getConstant(0, DL, MVT::i32);
    }
  }

  // Y occupies bits [19:10].
  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
    SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
    Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
                    DAG.getShiftAmountConstant(10, MVT::i32, SL));
    InputReg = InputReg.getNode() ?
                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
  }

  // Z occupies bits [29:20].
  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
    SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
    Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
                    DAG.getShiftAmountConstant(20, MVT::i32, SL));
    InputReg = InputReg.getNode() ?
                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
  }

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
      // We're in a situation where the outgoing function requires the workitem
      // ID, but the calling function does not have it (e.g a graphics function
      // calling a C calling convention function). This is illegal, but we need
      // to produce something.
      InputReg = DAG.getUNDEF(MVT::i32);
    } else {
      // Workitem ids are already packed, any of present incoming arguments
      // will carry all required fields.
      ArgDescriptor IncomingArg = ArgDescriptor::createArg(
        Arg: IncomingArgX ? *IncomingArgX :
        IncomingArgY ? *IncomingArgY :
        *IncomingArgZ, Mask: ~0u);
      InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
    }
  }

  if (OutgoingArg->isRegister()) {
    if (InputReg)
      RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);

    // Reserve the register even when no value is passed so the CC does not
    // reuse it for a user argument.
    CCInfo.AllocateReg(Reg: OutgoingArg->getRegister());
  } else {
    unsigned SpecialArgOffset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
    if (InputReg) {
      SDValue ArgStore = storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg,
                                              Offset: SpecialArgOffset);
      MemOpChains.push_back(Elt: ArgStore);
    }
  }
}
3462 | |
3463 | static bool canGuaranteeTCO(CallingConv::ID CC) { |
3464 | return CC == CallingConv::Fast; |
3465 | } |
3466 | |
3467 | /// Return true if we might ever do TCO for calls with this calling convention. |
3468 | static bool mayTailCallThisCC(CallingConv::ID CC) { |
3469 | switch (CC) { |
3470 | case CallingConv::C: |
3471 | case CallingConv::AMDGPU_Gfx: |
3472 | return true; |
3473 | default: |
3474 | return canGuaranteeTCO(CC); |
3475 | } |
3476 | } |
3477 | |
3478 | bool SITargetLowering::isEligibleForTailCallOptimization( |
3479 | SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg, |
3480 | const SmallVectorImpl<ISD::OutputArg> &Outs, |
3481 | const SmallVectorImpl<SDValue> &OutVals, |
3482 | const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { |
3483 | if (AMDGPU::isChainCC(CC: CalleeCC)) |
3484 | return true; |
3485 | |
3486 | if (!mayTailCallThisCC(CC: CalleeCC)) |
3487 | return false; |
3488 | |
3489 | // For a divergent call target, we need to do a waterfall loop over the |
3490 | // possible callees which precludes us from using a simple jump. |
3491 | if (Callee->isDivergent()) |
3492 | return false; |
3493 | |
3494 | MachineFunction &MF = DAG.getMachineFunction(); |
3495 | const Function &CallerF = MF.getFunction(); |
3496 | CallingConv::ID CallerCC = CallerF.getCallingConv(); |
3497 | const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); |
3498 | const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); |
3499 | |
3500 | // Kernels aren't callable, and don't have a live in return address so it |
3501 | // doesn't make sense to do a tail call with entry functions. |
3502 | if (!CallerPreserved) |
3503 | return false; |
3504 | |
3505 | bool CCMatch = CallerCC == CalleeCC; |
3506 | |
3507 | if (DAG.getTarget().Options.GuaranteedTailCallOpt) { |
3508 | if (canGuaranteeTCO(CC: CalleeCC) && CCMatch) |
3509 | return true; |
3510 | return false; |
3511 | } |
3512 | |
3513 | // TODO: Can we handle var args? |
3514 | if (IsVarArg) |
3515 | return false; |
3516 | |
3517 | for (const Argument &Arg : CallerF.args()) { |
3518 | if (Arg.hasByValAttr()) |
3519 | return false; |
3520 | } |
3521 | |
3522 | LLVMContext &Ctx = *DAG.getContext(); |
3523 | |
3524 | // Check that the call results are passed in the same way. |
3525 | if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C&: Ctx, Ins, |
3526 | CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg), |
3527 | CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg))) |
3528 | return false; |
3529 | |
3530 | // The callee has to preserve all registers the caller needs to preserve. |
3531 | if (!CCMatch) { |
3532 | const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); |
3533 | if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) |
3534 | return false; |
3535 | } |
3536 | |
3537 | // Nothing more to check if the callee is taking no arguments. |
3538 | if (Outs.empty()) |
3539 | return true; |
3540 | |
3541 | SmallVector<CCValAssign, 16> ArgLocs; |
3542 | CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx); |
3543 | |
3544 | CCInfo.AnalyzeCallOperands(Outs, Fn: CCAssignFnForCall(CC: CalleeCC, IsVarArg)); |
3545 | |
3546 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
3547 | // If the stack arguments for this call do not fit into our own save area then |
3548 | // the call cannot be made tail. |
3549 | // TODO: Is this really necessary? |
3550 | if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) |
3551 | return false; |
3552 | |
3553 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3554 | return parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals); |
3555 | } |
3556 | |
3557 | bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { |
3558 | if (!CI->isTailCall()) |
3559 | return false; |
3560 | |
3561 | const Function *ParentFn = CI->getParent()->getParent(); |
3562 | if (AMDGPU::isEntryFunctionCC(CC: ParentFn->getCallingConv())) |
3563 | return false; |
3564 | return true; |
3565 | } |
3566 | |
3567 | // The wave scratch offset register is used as the global base pointer. |
3568 | SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, |
3569 | SmallVectorImpl<SDValue> &InVals) const { |
3570 | CallingConv::ID CallConv = CLI.CallConv; |
3571 | bool IsChainCallConv = AMDGPU::isChainCC(CC: CallConv); |
3572 | |
3573 | SelectionDAG &DAG = CLI.DAG; |
3574 | |
3575 | TargetLowering::ArgListEntry RequestedExec; |
3576 | if (IsChainCallConv) { |
3577 | // The last argument should be the value that we need to put in EXEC. |
3578 | // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we |
3579 | // don't treat it like the rest of the arguments. |
3580 | RequestedExec = CLI.Args.back(); |
3581 | assert(RequestedExec.Node && "No node for EXEC" ); |
3582 | |
3583 | if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize())) |
3584 | return lowerUnhandledCall(CLI, InVals, Reason: "Invalid value for EXEC" ); |
3585 | |
3586 | assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg" ); |
3587 | CLI.Outs.pop_back(); |
3588 | CLI.OutVals.pop_back(); |
3589 | |
3590 | if (RequestedExec.Ty->isIntegerTy(Bitwidth: 64)) { |
3591 | assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up" ); |
3592 | CLI.Outs.pop_back(); |
3593 | CLI.OutVals.pop_back(); |
3594 | } |
3595 | |
3596 | assert(CLI.Outs.back().OrigArgIndex != 2 && |
3597 | "Haven't popped all the pieces of the EXEC mask" ); |
3598 | } |
3599 | |
3600 | const SDLoc &DL = CLI.DL; |
3601 | SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; |
3602 | SmallVector<SDValue, 32> &OutVals = CLI.OutVals; |
3603 | SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; |
3604 | SDValue Chain = CLI.Chain; |
3605 | SDValue Callee = CLI.Callee; |
3606 | bool &IsTailCall = CLI.IsTailCall; |
3607 | bool IsVarArg = CLI.IsVarArg; |
3608 | bool IsSibCall = false; |
3609 | MachineFunction &MF = DAG.getMachineFunction(); |
3610 | |
3611 | if (Callee.isUndef() || isNullConstant(V: Callee)) { |
3612 | if (!CLI.IsTailCall) { |
3613 | for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I) |
3614 | InVals.push_back(Elt: DAG.getUNDEF(VT: CLI.Ins[I].VT)); |
3615 | } |
3616 | |
3617 | return Chain; |
3618 | } |
3619 | |
3620 | if (IsVarArg) { |
3621 | return lowerUnhandledCall(CLI, InVals, |
3622 | Reason: "unsupported call to variadic function " ); |
3623 | } |
3624 | |
3625 | if (!CLI.CB) |
3626 | report_fatal_error(reason: "unsupported libcall legalization" ); |
3627 | |
3628 | if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) { |
3629 | return lowerUnhandledCall(CLI, InVals, |
3630 | Reason: "unsupported required tail call to function " ); |
3631 | } |
3632 | |
3633 | if (IsTailCall) { |
3634 | IsTailCall = isEligibleForTailCallOptimization( |
3635 | Callee, CalleeCC: CallConv, IsVarArg, Outs, OutVals, Ins, DAG); |
3636 | if (!IsTailCall && |
3637 | ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) { |
3638 | report_fatal_error(reason: "failed to perform tail call elimination on a call " |
3639 | "site marked musttail or on llvm.amdgcn.cs.chain" ); |
3640 | } |
3641 | |
3642 | bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; |
3643 | |
3644 | // A sibling call is one where we're under the usual C ABI and not planning |
3645 | // to change that but can still do a tail call: |
3646 | if (!TailCallOpt && IsTailCall) |
3647 | IsSibCall = true; |
3648 | |
3649 | if (IsTailCall) |
3650 | ++NumTailCalls; |
3651 | } |
3652 | |
3653 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
3654 | SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; |
3655 | SmallVector<SDValue, 8> MemOpChains; |
3656 | |
3657 | // Analyze operands of the call, assigning locations to each operand. |
3658 | SmallVector<CCValAssign, 16> ArgLocs; |
3659 | CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); |
3660 | CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg); |
3661 | |
3662 | if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CC: CallConv)) { |
3663 | // With a fixed ABI, allocate fixed registers before user arguments. |
3664 | passSpecialInputs(CLI, CCInfo, Info: *Info, RegsToPass, MemOpChains, Chain); |
3665 | } |
3666 | |
3667 | CCInfo.AnalyzeCallOperands(Outs, Fn: AssignFn); |
3668 | |
3669 | // Get a count of how many bytes are to be pushed on the stack. |
3670 | unsigned NumBytes = CCInfo.getStackSize(); |
3671 | |
3672 | if (IsSibCall) { |
3673 | // Since we're not changing the ABI to make this a tail call, the memory |
3674 | // operands are already available in the caller's incoming argument space. |
3675 | NumBytes = 0; |
3676 | } |
3677 | |
3678 | // FPDiff is the byte offset of the call's argument area from the callee's. |
3679 | // Stores to callee stack arguments will be placed in FixedStackSlots offset |
3680 | // by this amount for a tail call. In a sibling call it must be 0 because the |
3681 | // caller will deallocate the entire stack and the callee still expects its |
3682 | // arguments to begin at SP+0. Completely unused for non-tail calls. |
3683 | int32_t FPDiff = 0; |
3684 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
3685 | |
3686 | // Adjust the stack pointer for the new arguments... |
3687 | // These operations are automatically eliminated by the prolog/epilog pass |
3688 | if (!IsSibCall) |
3689 | Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL); |
3690 | |
3691 | if (!IsSibCall || IsChainCallConv) { |
3692 | if (!Subtarget->enableFlatScratch()) { |
3693 | SmallVector<SDValue, 4> CopyFromChains; |
3694 | |
3695 | // In the HSA case, this should be an identity copy. |
3696 | SDValue ScratchRSrcReg |
3697 | = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); |
3698 | RegsToPass.emplace_back(IsChainCallConv |
3699 | ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51 |
3700 | : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, |
3701 | ScratchRSrcReg); |
3702 | CopyFromChains.push_back(Elt: ScratchRSrcReg.getValue(R: 1)); |
3703 | Chain = DAG.getTokenFactor(DL, Vals&: CopyFromChains); |
3704 | } |
3705 | } |
3706 | |
3707 | MVT PtrVT = MVT::i32; |
3708 | |
3709 | // Walk the register/memloc assignments, inserting copies/loads. |
3710 | for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { |
3711 | CCValAssign &VA = ArgLocs[i]; |
3712 | SDValue Arg = OutVals[i]; |
3713 | |
3714 | // Promote the value if needed. |
3715 | switch (VA.getLocInfo()) { |
3716 | case CCValAssign::Full: |
3717 | break; |
3718 | case CCValAssign::BCvt: |
3719 | Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg); |
3720 | break; |
3721 | case CCValAssign::ZExt: |
3722 | Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
3723 | break; |
3724 | case CCValAssign::SExt: |
3725 | Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
3726 | break; |
3727 | case CCValAssign::AExt: |
3728 | Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
3729 | break; |
3730 | case CCValAssign::FPExt: |
3731 | Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
3732 | break; |
3733 | default: |
3734 | llvm_unreachable("Unknown loc info!" ); |
3735 | } |
3736 | |
3737 | if (VA.isRegLoc()) { |
3738 | RegsToPass.push_back(Elt: std::pair(VA.getLocReg(), Arg)); |
3739 | } else { |
3740 | assert(VA.isMemLoc()); |
3741 | |
3742 | SDValue DstAddr; |
3743 | MachinePointerInfo DstInfo; |
3744 | |
3745 | unsigned LocMemOffset = VA.getLocMemOffset(); |
3746 | int32_t Offset = LocMemOffset; |
3747 | |
3748 | SDValue PtrOff = DAG.getConstant(Val: Offset, DL, VT: PtrVT); |
3749 | MaybeAlign Alignment; |
3750 | |
3751 | if (IsTailCall) { |
3752 | ISD::ArgFlagsTy Flags = Outs[i].Flags; |
3753 | unsigned OpSize = Flags.isByVal() ? |
3754 | Flags.getByValSize() : VA.getValVT().getStoreSize(); |
3755 | |
3756 | // FIXME: We can have better than the minimum byval required alignment. |
3757 | Alignment = |
3758 | Flags.isByVal() |
3759 | ? Flags.getNonZeroByValAlign() |
3760 | : commonAlignment(A: Subtarget->getStackAlignment(), Offset); |
3761 | |
3762 | Offset = Offset + FPDiff; |
3763 | int FI = MFI.CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true); |
3764 | |
3765 | DstAddr = DAG.getFrameIndex(FI, VT: PtrVT); |
3766 | DstInfo = MachinePointerInfo::getFixedStack(MF, FI); |
3767 | |
3768 | // Make sure any stack arguments overlapping with where we're storing |
3769 | // are loaded before this eventual operation. Otherwise they'll be |
3770 | // clobbered. |
3771 | |
3772 | // FIXME: Why is this really necessary? This seems to just result in a |
3773 | // lot of code to copy the stack and write them back to the same |
3774 | // locations, which are supposed to be immutable? |
3775 | Chain = addTokenForArgument(Chain, DAG, MFI, ClobberedFI: FI); |
3776 | } else { |
3777 | // Stores to the argument stack area are relative to the stack pointer. |
3778 | SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(), |
3779 | MVT::i32); |
3780 | DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff); |
3781 | DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset); |
3782 | Alignment = |
3783 | commonAlignment(A: Subtarget->getStackAlignment(), Offset: LocMemOffset); |
3784 | } |
3785 | |
3786 | if (Outs[i].Flags.isByVal()) { |
3787 | SDValue SizeNode = |
3788 | DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32); |
3789 | SDValue Cpy = |
3790 | DAG.getMemcpy(Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode, |
3791 | Alignment: Outs[i].Flags.getNonZeroByValAlign(), |
3792 | /*isVol = */ false, /*AlwaysInline = */ true, |
3793 | /*isTailCall = */ false, DstPtrInfo: DstInfo, |
3794 | SrcPtrInfo: MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS)); |
3795 | |
3796 | MemOpChains.push_back(Elt: Cpy); |
3797 | } else { |
3798 | SDValue Store = |
3799 | DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo, Alignment); |
3800 | MemOpChains.push_back(Elt: Store); |
3801 | } |
3802 | } |
3803 | } |
3804 | |
3805 | if (!MemOpChains.empty()) |
3806 | Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); |
3807 | |
3808 | // Build a sequence of copy-to-reg nodes chained together with token chain |
3809 | // and flag operands which copy the outgoing args into the appropriate regs. |
3810 | SDValue InGlue; |
3811 | for (auto &RegToPass : RegsToPass) { |
3812 | Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RegToPass.first, |
3813 | N: RegToPass.second, Glue: InGlue); |
3814 | InGlue = Chain.getValue(R: 1); |
3815 | } |
3816 | |
3817 | |
3818 | // We don't usually want to end the call-sequence here because we would tidy |
3819 | // the frame up *after* the call, however in the ABI-changing tail-call case |
3820 | // we've carefully laid out the parameters so that when sp is reset they'll be |
3821 | // in the correct location. |
3822 | if (IsTailCall && !IsSibCall) { |
3823 | Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: 0, Glue: InGlue, DL); |
3824 | InGlue = Chain.getValue(R: 1); |
3825 | } |
3826 | |
3827 | std::vector<SDValue> Ops; |
3828 | Ops.push_back(x: Chain); |
3829 | Ops.push_back(x: Callee); |
3830 | // Add a redundant copy of the callee global which will not be legalized, as |
3831 | // we need direct access to the callee later. |
3832 | if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) { |
3833 | const GlobalValue *GV = GSD->getGlobal(); |
3834 | Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64)); |
3835 | } else { |
3836 | Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); |
3837 | } |
3838 | |
3839 | if (IsTailCall) { |
3840 | // Each tail call may have to adjust the stack by a different amount, so |
3841 | // this information must travel along with the operation for eventual |
3842 | // consumption by emitEpilogue. |
3843 | Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); |
3844 | } |
3845 | |
3846 | if (IsChainCallConv) |
3847 | Ops.push_back(x: RequestedExec.Node); |
3848 | |
3849 | // Add argument registers to the end of the list so that they are known live |
3850 | // into the call. |
3851 | for (auto &RegToPass : RegsToPass) { |
3852 | Ops.push_back(x: DAG.getRegister(Reg: RegToPass.first, |
3853 | VT: RegToPass.second.getValueType())); |
3854 | } |
3855 | |
3856 | // Add a register mask operand representing the call-preserved registers. |
3857 | auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); |
3858 | const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); |
3859 | assert(Mask && "Missing call preserved mask for calling convention" ); |
3860 | Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask)); |
3861 | |
3862 | if (InGlue.getNode()) |
3863 | Ops.push_back(x: InGlue); |
3864 | |
3865 | // NOTE: This potentially results in *two* glue operands, and the wrong one |
3866 | // might possibly show up where the other was intended. In particular, |
3867 | // Emitter::EmitMachineNode() expects only the glued convergence token if it |
3868 | // exists. Similarly, the selection of the call expects to match only the |
3869 | // InGlue operand if it exists. |
3870 | if (SDValue Token = CLI.ConvergenceControlToken) { |
3871 | Ops.push_back(SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, |
3872 | DL, MVT::Glue, Token), |
3873 | 0)); |
3874 | } |
3875 | |
3876 | SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); |
3877 | |
3878 | // If we're doing a tall call, use a TC_RETURN here rather than an |
3879 | // actual call instruction. |
3880 | if (IsTailCall) { |
3881 | MFI.setHasTailCall(); |
3882 | unsigned OPC = AMDGPUISD::TC_RETURN; |
3883 | switch (CallConv) { |
3884 | case CallingConv::AMDGPU_Gfx: |
3885 | OPC = AMDGPUISD::TC_RETURN_GFX; |
3886 | break; |
3887 | case CallingConv::AMDGPU_CS_Chain: |
3888 | case CallingConv::AMDGPU_CS_ChainPreserve: |
3889 | OPC = AMDGPUISD::TC_RETURN_CHAIN; |
3890 | break; |
3891 | } |
3892 | |
3893 | return DAG.getNode(Opcode: OPC, DL, VTList: NodeTys, Ops); |
3894 | } |
3895 | |
3896 | // Returns a chain and a flag for retval copy to use. |
3897 | SDValue Call = DAG.getNode(Opcode: AMDGPUISD::CALL, DL, VTList: NodeTys, Ops); |
3898 | Chain = Call.getValue(R: 0); |
3899 | InGlue = Call.getValue(R: 1); |
3900 | |
3901 | uint64_t CalleePopBytes = NumBytes; |
3902 | Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: CalleePopBytes, Glue: InGlue, DL); |
3903 | if (!Ins.empty()) |
3904 | InGlue = Chain.getValue(R: 1); |
3905 | |
3906 | // Handle result values, copying them out of physregs into vregs that we |
3907 | // return. |
3908 | return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG, |
3909 | InVals, /*IsThisReturn=*/false, ThisVal: SDValue()); |
3910 | } |
3911 | |
3912 | // This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC, |
3913 | // except for applying the wave size scale to the increment amount. |
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
    SDValue Op, SelectionDAG &DAG) const {
  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  // Operands of DYNAMIC_STACKALLOC: (chain, size, align).
  SDValue Tmp1 = Op;
  SDValue Tmp2 = Op.getValue(R: 1);
  SDValue Tmp3 = Op.getOperand(i: 2);
  SDValue Chain = Tmp1.getOperand(i: 0);

  Register SPReg = Info->getStackPtrOffsetReg();

  // Chain the dynamic stack allocation so that it doesn't modify the stack
  // pointer when other instructions are using the stack.
  Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL: dl);

  SDValue Size = Tmp2.getOperand(i: 1);
  SDValue SP = DAG.getCopyFromReg(Chain, dl, Reg: SPReg, VT);
  Chain = SP.getValue(R: 1);
  // The align operand is always a constant node.
  MaybeAlign Alignment = cast<ConstantSDNode>(Val&: Tmp3)->getMaybeAlignValue();
  const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
  unsigned Opc =
      TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
      ISD::ADD : ISD::SUB;

  // Scale the per-lane allocation size by the wave size; this is the only
  // difference from the generic ExpandDYNAMIC_STACKALLOC expansion (see the
  // comment above the function).
  SDValue ScaledSize = DAG.getNode(
      ISD::SHL, dl, VT, Size,
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

  Align StackAlign = TFL->getStackAlign();
  Tmp1 = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: SP, N2: ScaledSize); // Value
  if (Alignment && *Alignment > StackAlign) {
    // Round the new SP down to the requested alignment. The mask is scaled by
    // the wave size for the same reason the size increment is.
    Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
                       DAG.getConstant(-(uint64_t)Alignment->value()
                                           << Subtarget->getWavefrontSizeLog2(),
                                       dl, VT));
  }

  Chain = DAG.getCopyToReg(Chain, dl, Reg: SPReg, N: Tmp1); // Output chain
  Tmp2 = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL: dl);

  // Result 0 is the new SP value, result 1 is the output chain.
  return DAG.getMergeValues(Ops: {Tmp1, Tmp2}, dl);
}
3959 | |
3960 | SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, |
3961 | SelectionDAG &DAG) const { |
3962 | // We only handle constant sizes here to allow non-entry block, static sized |
3963 | // allocas. A truly dynamic value is more difficult to support because we |
3964 | // don't know if the size value is uniform or not. If the size isn't uniform, |
3965 | // we would need to do a wave reduction to get the maximum size to know how |
3966 | // much to increment the uniform stack pointer. |
3967 | SDValue Size = Op.getOperand(i: 1); |
3968 | if (isa<ConstantSDNode>(Val: Size)) |
3969 | return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion. |
3970 | |
3971 | return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG); |
3972 | } |
3973 | |
3974 | SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const { |
3975 | if (Op.getValueType() != MVT::i32) |
3976 | return Op; // Defer to cannot select error. |
3977 | |
3978 | Register SP = getStackPointerRegisterToSaveRestore(); |
3979 | SDLoc SL(Op); |
3980 | |
3981 | SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32); |
3982 | |
3983 | // Convert from wave uniform to swizzled vector address. This should protect |
3984 | // from any edge cases where the stacksave result isn't directly used with |
3985 | // stackrestore. |
3986 | SDValue VectorAddress = |
3987 | DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP); |
3988 | return DAG.getMergeValues(Ops: {VectorAddress, CopyFromSP.getValue(R: 1)}, dl: SL); |
3989 | } |
3990 | |
SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  assert(Op.getValueType() == MVT::i32);

  // Read MODE[3:0], i.e. both 2-bit round-mode fields, with one s_getreg.
  uint32_t BothRoundHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
  SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);

  SDValue IntrinID =
      DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
  SDValue GetReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList: Op->getVTList(),
                               N1: Op.getOperand(i: 0), N2: IntrinID, N3: GetRoundBothImm);

  // There are two rounding modes, one for f32 and one for f64/f16. We only
  // report in the standard value range if both are the same.
  //
  // The raw values also differ from the expected FLT_ROUNDS values. Nearest
  // ties away from zero is not supported, and the other values are rotated by
  // 1.
  //
  // If the two rounding modes are not the same, report a target defined value.

  // Mode register rounding mode fields:
  //
  // [1:0] Single-precision round mode.
  // [3:2] Double/Half-precision round mode.
  //
  // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
  //
  //             Hardware Spec
  // Toward-0        3    0
  // Nearest Even    0    1
  // +Inf            1    2
  // -Inf            2    3
  // NearestAway0   N/A   4
  //
  // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
  // table we can index by the raw hardware mode.
  //
  // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf

  SDValue BitTable =
      DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);

  // Each table entry is 4 bits wide, so the raw mode value is scaled by 4
  // (shift left by 2) to form the shift amount into the table.
  SDValue Two = DAG.getConstant(2, SL, MVT::i32);
  SDValue RoundModeTimesNumBits =
      DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);

  // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
  // knew only one mode was demanded.
  SDValue TableValue =
      DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
  SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);

  // Keep only the selected 4-bit table entry.
  SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
  SDValue TableEntry =
      DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);

  // There's a gap in the 4-bit encoded table and actual enum values, so offset
  // if it's an extended value.
  SDValue Four = DAG.getConstant(4, SL, MVT::i32);
  SDValue IsStandardValue =
      DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
  SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
  SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
                               TableEntry, EnumOffset);

  // Result 0 is the FLT_ROUNDS value, result 1 is the chain from the read.
  return DAG.getMergeValues(Ops: {Result, GetReg.getValue(R: 1)}, dl: SL);
}
4061 | |
4062 | SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { |
4063 | if (Op->isDivergent()) |
4064 | return SDValue(); |
4065 | |
4066 | switch (cast<MemSDNode>(Val&: Op)->getAddressSpace()) { |
4067 | case AMDGPUAS::FLAT_ADDRESS: |
4068 | case AMDGPUAS::GLOBAL_ADDRESS: |
4069 | case AMDGPUAS::CONSTANT_ADDRESS: |
4070 | case AMDGPUAS::CONSTANT_ADDRESS_32BIT: |
4071 | break; |
4072 | default: |
4073 | return SDValue(); |
4074 | } |
4075 | |
4076 | return Op; |
4077 | } |
4078 | |
4079 | // Work around DAG legality rules only based on the result type. |
4080 | SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { |
4081 | bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND; |
4082 | SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0); |
4083 | EVT SrcVT = Src.getValueType(); |
4084 | |
4085 | if (SrcVT.getScalarType() != MVT::bf16) |
4086 | return Op; |
4087 | |
4088 | SDLoc SL(Op); |
4089 | SDValue BitCast = |
4090 | DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcVT.changeTypeToInteger(), Operand: Src); |
4091 | |
4092 | EVT DstVT = Op.getValueType(); |
4093 | if (IsStrict) |
4094 | llvm_unreachable("Need STRICT_BF16_TO_FP" ); |
4095 | |
4096 | return DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: SL, VT: DstVT, Operand: BitCast); |
4097 | } |
4098 | |
SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  // Only the i64 form is handled here; defer anything else.
  if (Op.getValueType() != MVT::i64)
    return Op;

  // Hardware register encodings: MODE bits [22:0] and TRAPSTS bits [4:0].
  uint32_t ModeHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
  SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
  uint32_t TrapHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
  SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);

  // Read both registers with s_getreg, each chained on the incoming chain.
  SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue IntrinID =
      DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
  SDValue GetModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
                                   N1: Op.getOperand(i: 0), N2: IntrinID, N3: ModeHwRegImm);
  SDValue GetTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
                                   N1: Op.getOperand(i: 0), N2: IntrinID, N3: TrapHwRegImm);
  // Merge the two output chains for the caller.
  SDValue TokenReg =
      DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
                  GetTrapReg.getValue(1));

  // Pack the two 32-bit reads into an i64: MODE in the low half, TRAPSTS in
  // the high half.
  SDValue CvtPtr =
      DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);

  return DAG.getMergeValues(Ops: {Result, TokenReg}, dl: SL);
}
4128 | |
SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  // Only the i64 form is handled here; defer anything else.
  if (Op.getOperand(1).getValueType() != MVT::i64)
    return Op;

  // Split the i64 environment value into its two 32-bit halves: element 0 is
  // the MODE value, element 1 the TRAPSTS value (matching lowerGET_FPENV).
  SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
  SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
                                   DAG.getConstant(0, SL, MVT::i32));
  SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
                                   DAG.getConstant(1, SL, MVT::i32));

  // Force both values through readfirstlane to get wave-uniform scalar
  // values, since the writes go to scalar hardware registers.
  SDValue ReadFirstLaneID =
      DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
  NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
                           ReadFirstLaneID, NewModeReg);
  NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
                           ReadFirstLaneID, NewTrapReg);

  // Hardware register encodings: MODE bits [22:0] and TRAPSTS bits [4:0].
  unsigned ModeHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
  SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
  unsigned TrapHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
  SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);

  // Write both registers with s_setreg; both writes chain on the incoming
  // chain and are joined with a TokenFactor for the caller.
  SDValue IntrinID =
      DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
  SDValue SetModeReg =
      DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
                  IntrinID, ModeHwRegImm, NewModeReg);
  SDValue SetTrapReg =
      DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
                  IntrinID, TrapHwRegImm, NewTrapReg);
  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
}
4164 | |
4165 | Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, |
4166 | const MachineFunction &MF) const { |
4167 | Register Reg = StringSwitch<Register>(RegName) |
4168 | .Case("m0" , AMDGPU::M0) |
4169 | .Case("exec" , AMDGPU::EXEC) |
4170 | .Case("exec_lo" , AMDGPU::EXEC_LO) |
4171 | .Case("exec_hi" , AMDGPU::EXEC_HI) |
4172 | .Case("flat_scratch" , AMDGPU::FLAT_SCR) |
4173 | .Case("flat_scratch_lo" , AMDGPU::FLAT_SCR_LO) |
4174 | .Case("flat_scratch_hi" , AMDGPU::FLAT_SCR_HI) |
4175 | .Default(Register()); |
4176 | |
4177 | if (Reg == AMDGPU::NoRegister) { |
4178 | report_fatal_error(reason: Twine("invalid register name \"" |
4179 | + StringRef(RegName) + "\"." )); |
4180 | |
4181 | } |
4182 | |
4183 | if (!Subtarget->hasFlatScrRegister() && |
4184 | Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) { |
4185 | report_fatal_error(reason: Twine("invalid register \"" |
4186 | + StringRef(RegName) + "\" for subtarget." )); |
4187 | } |
4188 | |
4189 | switch (Reg) { |
4190 | case AMDGPU::M0: |
4191 | case AMDGPU::EXEC_LO: |
4192 | case AMDGPU::EXEC_HI: |
4193 | case AMDGPU::FLAT_SCR_LO: |
4194 | case AMDGPU::FLAT_SCR_HI: |
4195 | if (VT.getSizeInBits() == 32) |
4196 | return Reg; |
4197 | break; |
4198 | case AMDGPU::EXEC: |
4199 | case AMDGPU::FLAT_SCR: |
4200 | if (VT.getSizeInBits() == 64) |
4201 | return Reg; |
4202 | break; |
4203 | default: |
4204 | llvm_unreachable("missing register type checking" ); |
4205 | } |
4206 | |
4207 | report_fatal_error(reason: Twine("invalid type for register \"" |
4208 | + StringRef(RegName) + "\"." )); |
4209 | } |
4210 | |
4211 | // If kill is not the last instruction, split the block so kill is always a |
4212 | // proper terminator. |
4213 | MachineBasicBlock * |
4214 | SITargetLowering::splitKillBlock(MachineInstr &MI, |
4215 | MachineBasicBlock *BB) const { |
4216 | MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/); |
4217 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
4218 | MI.setDesc(TII->getKillTerminatorFromPseudo(Opcode: MI.getOpcode())); |
4219 | return SplitBB; |
4220 | } |
4221 | |
4222 | // Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true, |
4223 | // \p MI will be the only instruction in the loop body block. Otherwise, it will |
4224 | // be the first instruction in the remainder block. |
4225 | // |
4226 | /// \returns { LoopBody, Remainder } |
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
  MachineFunction *MF = MBB.getParent();
  MachineBasicBlock::iterator I(&MI);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  // Place both new blocks immediately after MBB in layout order.
  MF->insert(MBBI, MBB: LoopBB);
  MF->insert(MBBI, MBB: RemainderBB);

  // The loop block branches back to itself and falls through to the
  // remainder.
  LoopBB->addSuccessor(Succ: LoopBB);
  LoopBB->addSuccessor(Succ: RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);

  if (InstInLoop) {
    auto Next = std::next(x: I);

    // Move instruction to loop body.
    LoopBB->splice(Where: LoopBB->begin(), Other: &MBB, From: I, To: Next);

    // Move the rest of the block.
    RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Next, To: MBB.end());
  } else {
    // MI and everything after it become the remainder block.
    RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: I, To: MBB.end());
  }

  MBB.addSuccessor(Succ: LoopBB);

  return std::pair(LoopBB, RemainderBB);
}
4264 | |
4265 | /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it. |
4266 | void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const { |
4267 | MachineBasicBlock *MBB = MI.getParent(); |
4268 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
4269 | auto I = MI.getIterator(); |
4270 | auto E = std::next(x: I); |
4271 | |
4272 | BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) |
4273 | .addImm(0); |
4274 | |
4275 | MIBundleBuilder Bundler(*MBB, I, E); |
4276 | finalizeBundle(MBB&: *MBB, FirstMI: Bundler.begin()); |
4277 | } |
4278 | |
// Wrap the GWS instruction \p MI in a loop that retries it until
// TRAP_STS.MEM_VIOL reads back clear. Returns the remainder block that
// follows the loop.
MachineBasicBlock *
SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
  const DebugLoc &DL = MI.getDebugLoc();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

  MachineBasicBlock *LoopBB;
  MachineBasicBlock *RemainderBB;
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Apparently kill flags are only valid if the def is in the same block?
  if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
    Src->setIsKill(false);

  // MI becomes the sole instruction of the loop body (InstInLoop = true).
  std::tie(args&: LoopBB, args&: RemainderBB) = splitBlockForLoop(MI, MBB&: *BB, InstInLoop: true);

  MachineBasicBlock::iterator I = LoopBB->end();

  // TRAPSTS.MEM_VIOL is a single bit at OFFSET_MEM_VIOL.
  const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
      AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);

  // Clear TRAP_STS.MEM_VIOL
  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
      .addImm(0)
      .addImm(EncodedReg);

  // Bundle MI with an s_waitcnt 0 so the status read below sees its effect.
  bundleInstWithWaitcnt(MI);

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  // Load and check TRAP_STS.MEM_VIOL
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
      .addImm(EncodedReg);

  // FIXME: Do we need to use an isel pseudo that may clobber scc?
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
      .addReg(Reg, RegState::Kill)
      .addImm(0);
  // Loop back while MEM_VIOL is still set (compare result nonzero).
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
      .addMBB(LoopBB);

  return RemainderBB;
}
4323 | |
4324 | // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the |
4325 | // wavefront. If the value is uniform and just happens to be in a VGPR, this |
4326 | // will only do one iteration. In the worst case, this will loop 64 times. |
4327 | // |
4328 | // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value. |
// Emit the body of the waterfall loop in \p LoopBB: pick one lane's index,
// mask EXEC down to the lanes sharing that index, make the index available
// (either in \p SGPRIdxReg or in M0), then loop until all lanes are done.
// Returns an iterator positioned before the terminators, where per-iteration
// work should be inserted.
static MachineBasicBlock::iterator
emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
                       MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
                       const DebugLoc &DL, const MachineOperand &Idx,
                       unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                       unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
                       Register &SGPRIdxReg) {

  MachineFunction *MF = OrigBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineBasicBlock::iterator I = LoopBB.begin();

  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
  Register PhiExec = MRI.createVirtualRegister(RegClass: BoolRC);
  Register NewExec = MRI.createVirtualRegister(RegClass: BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(RegClass: BoolRC);

  // Result PHI: InitReg on entry from OrigBB, ResultReg on the back-edge.
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
    .addReg(InitReg)
    .addMBB(&OrigBB)
    .addReg(ResultReg)
    .addMBB(&LoopBB);

  // Exec-mask PHI tracking which lanes still need an iteration.
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&OrigBB)
    .addReg(NewExec)
    .addMBB(&LoopBB);

  // Read the next variant <- also loop target.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
    .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));

  // Compare the just read M0 value to all possible Idx values.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
    .addReg(CurrentIdxReg)
    .addReg(Idx.getReg(), 0, Idx.getSubReg());

  // Update EXEC, save the original EXEC value to VCC.
  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                                : AMDGPU::S_AND_SAVEEXEC_B64),
          NewExec)
    .addReg(CondReg, RegState::Kill);

  // Hint the allocator that NewExec may share a register with CondReg.
  MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg);

  if (UseGPRIdxMode) {
    // GPR-index mode: hand the (optionally offset) index back in an SGPR.
    if (Offset == 0) {
      SGPRIdxReg = CurrentIdxReg;
    } else {
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
          .addReg(CurrentIdxReg, RegState::Kill)
          .addImm(Offset);
    }
  } else {
    // Move index from VCC into M0
    if (Offset == 0) {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(CurrentIdxReg, RegState::Kill);
    } else {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(CurrentIdxReg, RegState::Kill)
        .addImm(Offset);
    }
  }

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  MachineInstr *InsertPt =
    BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                                  : AMDGPU::S_XOR_B64_term), Exec)
      .addReg(Exec)
      .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);

  return InsertPt->getIterator();
}
4415 | |
4416 | // This has slightly sub-optimal regalloc when the source vector is killed by |
4417 | // the read. The register allocator does not understand that the kill is |
4418 | // per-workitem, so is kept alive for the whole loop so we end up not re-using a |
4419 | // subregister from it, using 1 more VGPR than necessary. This was saved when |
4420 | // this was expanded after register allocation. |
4421 | static MachineBasicBlock::iterator |
4422 | loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, |
4423 | unsigned InitResultReg, unsigned PhiReg, int Offset, |
4424 | bool UseGPRIdxMode, Register &SGPRIdxReg) { |
4425 | MachineFunction *MF = MBB.getParent(); |
4426 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
4427 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
4428 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
4429 | const DebugLoc &DL = MI.getDebugLoc(); |
4430 | MachineBasicBlock::iterator I(&MI); |
4431 | |
4432 | const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); |
4433 | Register DstReg = MI.getOperand(i: 0).getReg(); |
4434 | Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); |
4435 | Register TmpExec = MRI.createVirtualRegister(BoolXExecRC); |
4436 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
4437 | unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
4438 | |
4439 | BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec); |
4440 | |
4441 | // Save the EXEC mask |
4442 | BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec) |
4443 | .addReg(Exec); |
4444 | |
4445 | MachineBasicBlock *LoopBB; |
4446 | MachineBasicBlock *RemainderBB; |
4447 | std::tie(args&: LoopBB, args&: RemainderBB) = splitBlockForLoop(MI, MBB, InstInLoop: false); |
4448 | |
4449 | const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); |
4450 | |
4451 | auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, OrigBB&: MBB, LoopBB&: *LoopBB, DL, Idx: *Idx, |
4452 | InitReg: InitResultReg, ResultReg: DstReg, PhiReg, InitSaveExecReg: TmpExec, |
4453 | Offset, UseGPRIdxMode, SGPRIdxReg); |
4454 | |
4455 | MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock(); |
4456 | MachineFunction::iterator MBBI(LoopBB); |
4457 | ++MBBI; |
4458 | MF->insert(MBBI, MBB: LandingPad); |
4459 | LoopBB->removeSuccessor(Succ: RemainderBB); |
4460 | LandingPad->addSuccessor(Succ: RemainderBB); |
4461 | LoopBB->addSuccessor(Succ: LandingPad); |
4462 | MachineBasicBlock::iterator First = LandingPad->begin(); |
4463 | BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec) |
4464 | .addReg(SaveExec); |
4465 | |
4466 | return InsPt; |
4467 | } |
4468 | |
4469 | // Returns subreg index, offset |
4470 | static std::pair<unsigned, int> |
4471 | computeIndirectRegAndOffset(const SIRegisterInfo &TRI, |
4472 | const TargetRegisterClass *SuperRC, |
4473 | unsigned VecReg, |
4474 | int Offset) { |
4475 | int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32; |
4476 | |
4477 | // Skip out of bounds offsets, or else we would end up using an undefined |
4478 | // register. |
4479 | if (Offset >= NumElts || Offset < 0) |
4480 | return std::pair(AMDGPU::sub0, Offset); |
4481 | |
4482 | return std::pair(SIRegisterInfo::getSubRegFromChannel(Channel: Offset), 0); |
4483 | } |
4484 | |
4485 | static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, |
4486 | MachineRegisterInfo &MRI, MachineInstr &MI, |
4487 | int Offset) { |
4488 | MachineBasicBlock *MBB = MI.getParent(); |
4489 | const DebugLoc &DL = MI.getDebugLoc(); |
4490 | MachineBasicBlock::iterator I(&MI); |
4491 | |
4492 | const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); |
4493 | |
4494 | assert(Idx->getReg() != AMDGPU::NoRegister); |
4495 | |
4496 | if (Offset == 0) { |
4497 | BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx); |
4498 | } else { |
4499 | BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) |
4500 | .add(*Idx) |
4501 | .addImm(Offset); |
4502 | } |
4503 | } |
4504 | |
4505 | static Register getIndirectSGPRIdx(const SIInstrInfo *TII, |
4506 | MachineRegisterInfo &MRI, MachineInstr &MI, |
4507 | int Offset) { |
4508 | MachineBasicBlock *MBB = MI.getParent(); |
4509 | const DebugLoc &DL = MI.getDebugLoc(); |
4510 | MachineBasicBlock::iterator I(&MI); |
4511 | |
4512 | const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); |
4513 | |
4514 | if (Offset == 0) |
4515 | return Idx->getReg(); |
4516 | |
4517 | Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
4518 | BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) |
4519 | .add(*Idx) |
4520 | .addImm(Offset); |
4521 | return Tmp; |
4522 | } |
4523 | |
4524 | static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, |
4525 | MachineBasicBlock &MBB, |
4526 | const GCNSubtarget &ST) { |
4527 | const SIInstrInfo *TII = ST.getInstrInfo(); |
4528 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
4529 | MachineFunction *MF = MBB.getParent(); |
4530 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
4531 | |
4532 | Register Dst = MI.getOperand(i: 0).getReg(); |
4533 | const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); |
4534 | Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); |
4535 | int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); |
4536 | |
4537 | const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcReg); |
4538 | const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg()); |
4539 | |
4540 | unsigned SubReg; |
4541 | std::tie(args&: SubReg, args&: Offset) |
4542 | = computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcReg, Offset); |
4543 | |
4544 | const bool UseGPRIdxMode = ST.useVGPRIndexMode(); |
4545 | |
4546 | // Check for a SGPR index. |
4547 | if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) { |
4548 | MachineBasicBlock::iterator I(&MI); |
4549 | const DebugLoc &DL = MI.getDebugLoc(); |
4550 | |
4551 | if (UseGPRIdxMode) { |
4552 | // TODO: Look at the uses to avoid the copy. This may require rescheduling |
4553 | // to avoid interfering with other uses, so probably requires a new |
4554 | // optimization pass. |
4555 | Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset); |
4556 | |
4557 | const MCInstrDesc &GPRIDXDesc = |
4558 | TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(*VecRC), IsIndirectSrc: true); |
4559 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst) |
4560 | .addReg(RegNo: SrcReg) |
4561 | .addReg(RegNo: Idx) |
4562 | .addImm(Val: SubReg); |
4563 | } else { |
4564 | setM0ToIndexFromSGPR(TII, MRI, MI, Offset); |
4565 | |
4566 | BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) |
4567 | .addReg(SrcReg, 0, SubReg) |
4568 | .addReg(SrcReg, RegState::Implicit); |
4569 | } |
4570 | |
4571 | MI.eraseFromParent(); |
4572 | |
4573 | return &MBB; |
4574 | } |
4575 | |
4576 | // Control flow needs to be inserted if indexing with a VGPR. |
4577 | const DebugLoc &DL = MI.getDebugLoc(); |
4578 | MachineBasicBlock::iterator I(&MI); |
4579 | |
4580 | Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4581 | Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4582 | |
4583 | BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); |
4584 | |
4585 | Register SGPRIdxReg; |
4586 | auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: InitReg, PhiReg, Offset, |
4587 | UseGPRIdxMode, SGPRIdxReg); |
4588 | |
4589 | MachineBasicBlock *LoopBB = InsPt->getParent(); |
4590 | |
4591 | if (UseGPRIdxMode) { |
4592 | const MCInstrDesc &GPRIDXDesc = |
4593 | TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(*VecRC), IsIndirectSrc: true); |
4594 | |
4595 | BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst) |
4596 | .addReg(RegNo: SrcReg) |
4597 | .addReg(RegNo: SGPRIdxReg) |
4598 | .addImm(Val: SubReg); |
4599 | } else { |
4600 | BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) |
4601 | .addReg(SrcReg, 0, SubReg) |
4602 | .addReg(SrcReg, RegState::Implicit); |
4603 | } |
4604 | |
4605 | MI.eraseFromParent(); |
4606 | |
4607 | return LoopBB; |
4608 | } |
4609 | |
4610 | static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, |
4611 | MachineBasicBlock &MBB, |
4612 | const GCNSubtarget &ST) { |
4613 | const SIInstrInfo *TII = ST.getInstrInfo(); |
4614 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
4615 | MachineFunction *MF = MBB.getParent(); |
4616 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
4617 | |
4618 | Register Dst = MI.getOperand(i: 0).getReg(); |
4619 | const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); |
4620 | const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); |
4621 | const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); |
4622 | int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); |
4623 | const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcVec->getReg()); |
4624 | const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg()); |
4625 | |
4626 | // This can be an immediate, but will be folded later. |
4627 | assert(Val->getReg()); |
4628 | |
4629 | unsigned SubReg; |
4630 | std::tie(args&: SubReg, args&: Offset) = computeIndirectRegAndOffset(TRI, SuperRC: VecRC, |
4631 | VecReg: SrcVec->getReg(), |
4632 | Offset); |
4633 | const bool UseGPRIdxMode = ST.useVGPRIndexMode(); |
4634 | |
4635 | if (Idx->getReg() == AMDGPU::NoRegister) { |
4636 | MachineBasicBlock::iterator I(&MI); |
4637 | const DebugLoc &DL = MI.getDebugLoc(); |
4638 | |
4639 | assert(Offset == 0); |
4640 | |
4641 | BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst) |
4642 | .add(*SrcVec) |
4643 | .add(*Val) |
4644 | .addImm(SubReg); |
4645 | |
4646 | MI.eraseFromParent(); |
4647 | return &MBB; |
4648 | } |
4649 | |
4650 | // Check for a SGPR index. |
4651 | if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) { |
4652 | MachineBasicBlock::iterator I(&MI); |
4653 | const DebugLoc &DL = MI.getDebugLoc(); |
4654 | |
4655 | if (UseGPRIdxMode) { |
4656 | Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset); |
4657 | |
4658 | const MCInstrDesc &GPRIDXDesc = |
4659 | TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(*VecRC), IsIndirectSrc: false); |
4660 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst) |
4661 | .addReg(RegNo: SrcVec->getReg()) |
4662 | .add(MO: *Val) |
4663 | .addReg(RegNo: Idx) |
4664 | .addImm(Val: SubReg); |
4665 | } else { |
4666 | setM0ToIndexFromSGPR(TII, MRI, MI, Offset); |
4667 | |
4668 | const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( |
4669 | VecSize: TRI.getRegSizeInBits(*VecRC), EltSize: 32, IsSGPR: false); |
4670 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: MovRelDesc, DestReg: Dst) |
4671 | .addReg(RegNo: SrcVec->getReg()) |
4672 | .add(MO: *Val) |
4673 | .addImm(Val: SubReg); |
4674 | } |
4675 | MI.eraseFromParent(); |
4676 | return &MBB; |
4677 | } |
4678 | |
4679 | // Control flow needs to be inserted if indexing with a VGPR. |
4680 | if (Val->isReg()) |
4681 | MRI.clearKillFlags(Reg: Val->getReg()); |
4682 | |
4683 | const DebugLoc &DL = MI.getDebugLoc(); |
4684 | |
4685 | Register PhiReg = MRI.createVirtualRegister(RegClass: VecRC); |
4686 | |
4687 | Register SGPRIdxReg; |
4688 | auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: SrcVec->getReg(), PhiReg, Offset, |
4689 | UseGPRIdxMode, SGPRIdxReg); |
4690 | MachineBasicBlock *LoopBB = InsPt->getParent(); |
4691 | |
4692 | if (UseGPRIdxMode) { |
4693 | const MCInstrDesc &GPRIDXDesc = |
4694 | TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(*VecRC), IsIndirectSrc: false); |
4695 | |
4696 | BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst) |
4697 | .addReg(PhiReg) |
4698 | .add(*Val) |
4699 | .addReg(SGPRIdxReg) |
4700 | .addImm(AMDGPU::sub0); |
4701 | } else { |
4702 | const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( |
4703 | VecSize: TRI.getRegSizeInBits(*VecRC), EltSize: 32, IsSGPR: false); |
4704 | BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst) |
4705 | .addReg(PhiReg) |
4706 | .add(*Val) |
4707 | .addImm(AMDGPU::sub0); |
4708 | } |
4709 | |
4710 | MI.eraseFromParent(); |
4711 | return LoopBB; |
4712 | } |
4713 | |
4714 | static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, |
4715 | MachineBasicBlock &BB, |
4716 | const GCNSubtarget &ST, |
4717 | unsigned Opc) { |
4718 | MachineRegisterInfo &MRI = BB.getParent()->getRegInfo(); |
4719 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
4720 | const DebugLoc &DL = MI.getDebugLoc(); |
4721 | const SIInstrInfo *TII = ST.getInstrInfo(); |
4722 | |
4723 | // Reduction operations depend on whether the input operand is SGPR or VGPR. |
4724 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
4725 | bool isSGPR = TRI->isSGPRClass(RC: MRI.getRegClass(Reg: SrcReg)); |
4726 | Register DstReg = MI.getOperand(i: 0).getReg(); |
4727 | MachineBasicBlock *RetBB = nullptr; |
4728 | if (isSGPR) { |
4729 | // These operations with a uniform value i.e. SGPR are idempotent. |
4730 | // Reduced value will be same as given sgpr. |
4731 | BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg); |
4732 | RetBB = &BB; |
4733 | } else { |
4734 | // TODO: Implement DPP Strategy and switch based on immediate strategy |
4735 | // operand. For now, for all the cases (default, Iterative and DPP we use |
4736 | // iterative approach by default.) |
4737 | |
4738 | // To reduce the VGPR using iterative approach, we need to iterate |
4739 | // over all the active lanes. Lowering consists of ComputeLoop, |
4740 | // which iterate over only active lanes. We use copy of EXEC register |
4741 | // as induction variable and every active lane modifies it using bitset0 |
4742 | // so that we will get the next active lane for next iteration. |
4743 | MachineBasicBlock::iterator I = BB.end(); |
4744 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
4745 | |
4746 | // Create Control flow for loop |
4747 | // Split MI's Machine Basic block into For loop |
4748 | auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, MBB&: BB, InstInLoop: true); |
4749 | |
4750 | // Create virtual registers required for lowering. |
4751 | const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); |
4752 | const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg); |
4753 | Register LoopIterator = MRI.createVirtualRegister(RegClass: WaveMaskRegClass); |
4754 | Register InitalValReg = MRI.createVirtualRegister(RegClass: DstRegClass); |
4755 | |
4756 | Register AccumulatorReg = MRI.createVirtualRegister(RegClass: DstRegClass); |
4757 | Register ActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass); |
4758 | Register NewActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass); |
4759 | |
4760 | Register FF1Reg = MRI.createVirtualRegister(RegClass: DstRegClass); |
4761 | Register LaneValueReg = MRI.createVirtualRegister(RegClass: DstRegClass); |
4762 | |
4763 | bool IsWave32 = ST.isWave32(); |
4764 | unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
4765 | unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
4766 | |
4767 | // Create initail values of induction variable from Exec, Accumulator and |
4768 | // insert branch instr to newly created ComputeBlockk |
4769 | uint32_t InitalValue = |
4770 | (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0; |
4771 | auto TmpSReg = |
4772 | BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); |
4773 | BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg) |
4774 | .addImm(InitalValue); |
4775 | BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop); |
4776 | |
4777 | // Start constructing ComputeLoop |
4778 | I = ComputeLoop->end(); |
4779 | auto Accumulator = |
4780 | BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg) |
4781 | .addReg(InitalValReg) |
4782 | .addMBB(&BB); |
4783 | auto ActiveBits = |
4784 | BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg) |
4785 | .addReg(TmpSReg->getOperand(0).getReg()) |
4786 | .addMBB(&BB); |
4787 | |
4788 | // Perform the computations |
4789 | unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64; |
4790 | auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg) |
4791 | .addReg(ActiveBits->getOperand(0).getReg()); |
4792 | auto LaneValue = BuildMI(*ComputeLoop, I, DL, |
4793 | TII->get(AMDGPU::V_READLANE_B32), LaneValueReg) |
4794 | .addReg(SrcReg) |
4795 | .addReg(FF1->getOperand(0).getReg()); |
4796 | auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) |
4797 | .addReg(Accumulator->getOperand(0).getReg()) |
4798 | .addReg(LaneValue->getOperand(0).getReg()); |
4799 | |
4800 | // Manipulate the iterator to get the next active lane |
4801 | unsigned BITSETOpc = |
4802 | IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64; |
4803 | auto NewActiveBits = |
4804 | BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg) |
4805 | .addReg(FF1->getOperand(0).getReg()) |
4806 | .addReg(ActiveBits->getOperand(0).getReg()); |
4807 | |
4808 | // Add phi nodes |
4809 | Accumulator.addReg(NewAccumulator->getOperand(0).getReg()) |
4810 | .addMBB(ComputeLoop); |
4811 | ActiveBits.addReg(NewActiveBits->getOperand(0).getReg()) |
4812 | .addMBB(ComputeLoop); |
4813 | |
4814 | // Creating branching |
4815 | unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64; |
4816 | BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc)) |
4817 | .addReg(NewActiveBits->getOperand(0).getReg()) |
4818 | .addImm(0); |
4819 | BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) |
4820 | .addMBB(ComputeLoop); |
4821 | |
4822 | RetBB = ComputeEnd; |
4823 | } |
4824 | MI.eraseFromParent(); |
4825 | return RetBB; |
4826 | } |
4827 | |
4828 | MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( |
4829 | MachineInstr &MI, MachineBasicBlock *BB) const { |
4830 | |
4831 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
4832 | MachineFunction *MF = BB->getParent(); |
4833 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
4834 | |
4835 | switch (MI.getOpcode()) { |
4836 | case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32: |
4837 | return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32); |
4838 | case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32: |
4839 | return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32); |
4840 | case AMDGPU::S_UADDO_PSEUDO: |
4841 | case AMDGPU::S_USUBO_PSEUDO: { |
4842 | const DebugLoc &DL = MI.getDebugLoc(); |
4843 | MachineOperand &Dest0 = MI.getOperand(i: 0); |
4844 | MachineOperand &Dest1 = MI.getOperand(i: 1); |
4845 | MachineOperand &Src0 = MI.getOperand(i: 2); |
4846 | MachineOperand &Src1 = MI.getOperand(i: 3); |
4847 | |
4848 | unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO) |
4849 | ? AMDGPU::S_ADD_I32 |
4850 | : AMDGPU::S_SUB_I32; |
4851 | BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1); |
4852 | |
4853 | BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg()) |
4854 | .addImm(1) |
4855 | .addImm(0); |
4856 | |
4857 | MI.eraseFromParent(); |
4858 | return BB; |
4859 | } |
4860 | case AMDGPU::S_ADD_U64_PSEUDO: |
4861 | case AMDGPU::S_SUB_U64_PSEUDO: { |
4862 | // For targets older than GFX12, we emit a sequence of 32-bit operations. |
4863 | // For GFX12, we emit s_add_u64 and s_sub_u64. |
4864 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
4865 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
4866 | const DebugLoc &DL = MI.getDebugLoc(); |
4867 | MachineOperand &Dest = MI.getOperand(i: 0); |
4868 | MachineOperand &Src0 = MI.getOperand(i: 1); |
4869 | MachineOperand &Src1 = MI.getOperand(i: 2); |
4870 | bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); |
4871 | if (Subtarget->hasScalarAddSub64()) { |
4872 | unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64; |
4873 | BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg()) |
4874 | .add(Src0) |
4875 | .add(Src1); |
4876 | } else { |
4877 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
4878 | const TargetRegisterClass *BoolRC = TRI->getBoolRC(); |
4879 | |
4880 | Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
4881 | Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
4882 | |
4883 | MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( |
4884 | MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); |
4885 | MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( |
4886 | MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); |
4887 | |
4888 | MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( |
4889 | MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); |
4890 | MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( |
4891 | MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); |
4892 | |
4893 | unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; |
4894 | unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; |
4895 | BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) |
4896 | .add(Src0Sub0) |
4897 | .add(Src1Sub0); |
4898 | BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) |
4899 | .add(Src0Sub1) |
4900 | .add(Src1Sub1); |
4901 | BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) |
4902 | .addReg(DestSub0) |
4903 | .addImm(AMDGPU::sub0) |
4904 | .addReg(DestSub1) |
4905 | .addImm(AMDGPU::sub1); |
4906 | } |
4907 | MI.eraseFromParent(); |
4908 | return BB; |
4909 | } |
4910 | case AMDGPU::V_ADD_U64_PSEUDO: |
4911 | case AMDGPU::V_SUB_U64_PSEUDO: { |
4912 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
4913 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
4914 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
4915 | const DebugLoc &DL = MI.getDebugLoc(); |
4916 | |
4917 | bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO); |
4918 | |
4919 | MachineOperand &Dest = MI.getOperand(i: 0); |
4920 | MachineOperand &Src0 = MI.getOperand(i: 1); |
4921 | MachineOperand &Src1 = MI.getOperand(i: 2); |
4922 | |
4923 | if (IsAdd && ST.hasLshlAddB64()) { |
4924 | auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64), |
4925 | Dest.getReg()) |
4926 | .add(Src0) |
4927 | .addImm(0) |
4928 | .add(Src1); |
4929 | TII->legalizeOperands(MI&: *Add); |
4930 | MI.eraseFromParent(); |
4931 | return BB; |
4932 | } |
4933 | |
4934 | const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); |
4935 | |
4936 | Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4937 | Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4938 | |
4939 | Register CarryReg = MRI.createVirtualRegister(CarryRC); |
4940 | Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); |
4941 | |
4942 | const TargetRegisterClass *Src0RC = Src0.isReg() |
4943 | ? MRI.getRegClass(Src0.getReg()) |
4944 | : &AMDGPU::VReg_64RegClass; |
4945 | const TargetRegisterClass *Src1RC = Src1.isReg() |
4946 | ? MRI.getRegClass(Src1.getReg()) |
4947 | : &AMDGPU::VReg_64RegClass; |
4948 | |
4949 | const TargetRegisterClass *Src0SubRC = |
4950 | TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); |
4951 | const TargetRegisterClass *Src1SubRC = |
4952 | TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); |
4953 | |
4954 | MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm( |
4955 | MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); |
4956 | MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm( |
4957 | MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); |
4958 | |
4959 | MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm( |
4960 | MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); |
4961 | MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm( |
4962 | MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); |
4963 | |
4964 | unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; |
4965 | MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) |
4966 | .addReg(CarryReg, RegState::Define) |
4967 | .add(SrcReg0Sub0) |
4968 | .add(SrcReg1Sub0) |
4969 | .addImm(0); // clamp bit |
4970 | |
4971 | unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; |
4972 | MachineInstr *HiHalf = |
4973 | BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) |
4974 | .addReg(DeadCarryReg, RegState::Define | RegState::Dead) |
4975 | .add(SrcReg0Sub1) |
4976 | .add(SrcReg1Sub1) |
4977 | .addReg(CarryReg, RegState::Kill) |
4978 | .addImm(0); // clamp bit |
4979 | |
4980 | BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) |
4981 | .addReg(DestSub0) |
4982 | .addImm(AMDGPU::sub0) |
4983 | .addReg(DestSub1) |
4984 | .addImm(AMDGPU::sub1); |
4985 | TII->legalizeOperands(MI&: *LoHalf); |
4986 | TII->legalizeOperands(MI&: *HiHalf); |
4987 | MI.eraseFromParent(); |
4988 | return BB; |
4989 | } |
4990 | case AMDGPU::S_ADD_CO_PSEUDO: |
4991 | case AMDGPU::S_SUB_CO_PSEUDO: { |
4992 | // This pseudo has a chance to be selected |
4993 | // only from uniform add/subcarry node. All the VGPR operands |
4994 | // therefore assumed to be splat vectors. |
4995 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
4996 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
4997 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
4998 | MachineBasicBlock::iterator MII = MI; |
4999 | const DebugLoc &DL = MI.getDebugLoc(); |
5000 | MachineOperand &Dest = MI.getOperand(i: 0); |
5001 | MachineOperand &CarryDest = MI.getOperand(i: 1); |
5002 | MachineOperand &Src0 = MI.getOperand(i: 2); |
5003 | MachineOperand &Src1 = MI.getOperand(i: 3); |
5004 | MachineOperand &Src2 = MI.getOperand(i: 4); |
5005 | unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) |
5006 | ? AMDGPU::S_ADDC_U32 |
5007 | : AMDGPU::S_SUBB_U32; |
5008 | if (Src0.isReg() && TRI->isVectorRegister(MRI, Reg: Src0.getReg())) { |
5009 | Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
5010 | BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0) |
5011 | .addReg(Src0.getReg()); |
5012 | Src0.setReg(RegOp0); |
5013 | } |
5014 | if (Src1.isReg() && TRI->isVectorRegister(MRI, Reg: Src1.getReg())) { |
5015 | Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
5016 | BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1) |
5017 | .addReg(Src1.getReg()); |
5018 | Src1.setReg(RegOp1); |
5019 | } |
5020 | Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
5021 | if (TRI->isVectorRegister(MRI, Reg: Src2.getReg())) { |
5022 | BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2) |
5023 | .addReg(Src2.getReg()); |
5024 | Src2.setReg(RegOp2); |
5025 | } |
5026 | |
5027 | const TargetRegisterClass *Src2RC = MRI.getRegClass(Reg: Src2.getReg()); |
5028 | unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC); |
5029 | assert(WaveSize == 64 || WaveSize == 32); |
5030 | |
5031 | if (WaveSize == 64) { |
5032 | if (ST.hasScalarCompareEq64()) { |
5033 | BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64)) |
5034 | .addReg(Src2.getReg()) |
5035 | .addImm(0); |
5036 | } else { |
5037 | const TargetRegisterClass *SubRC = |
5038 | TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0); |
5039 | MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm( |
5040 | MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC); |
5041 | MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm( |
5042 | MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC); |
5043 | Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
5044 | |
5045 | BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32) |
5046 | .add(Src2Sub0) |
5047 | .add(Src2Sub1); |
5048 | |
5049 | BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32)) |
5050 | .addReg(Src2_32, RegState::Kill) |
5051 | .addImm(0); |
5052 | } |
5053 | } else { |
5054 | BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32)) |
5055 | .addReg(Src2.getReg()) |
5056 | .addImm(0); |
5057 | } |
5058 | |
5059 | BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1); |
5060 | |
5061 | unsigned SelOpc = |
5062 | (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; |
5063 | |
5064 | BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg()) |
5065 | .addImm(-1) |
5066 | .addImm(0); |
5067 | |
5068 | MI.eraseFromParent(); |
5069 | return BB; |
5070 | } |
5071 | case AMDGPU::SI_INIT_M0: { |
5072 | BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), |
5073 | TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) |
5074 | .add(MI.getOperand(0)); |
5075 | MI.eraseFromParent(); |
5076 | return BB; |
5077 | } |
5078 | case AMDGPU::GET_GROUPSTATICSIZE: { |
5079 | assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || |
5080 | getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL); |
5081 | DebugLoc DL = MI.getDebugLoc(); |
5082 | BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) |
5083 | .add(MI.getOperand(0)) |
5084 | .addImm(MFI->getLDSSize()); |
5085 | MI.eraseFromParent(); |
5086 | return BB; |
5087 | } |
5088 | case AMDGPU::GET_SHADERCYCLESHILO: { |
5089 | assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters()); |
5090 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
5091 | const DebugLoc &DL = MI.getDebugLoc(); |
5092 | // The algorithm is: |
5093 | // |
5094 | // hi1 = getreg(SHADER_CYCLES_HI) |
5095 | // lo1 = getreg(SHADER_CYCLES_LO) |
5096 | // hi2 = getreg(SHADER_CYCLES_HI) |
5097 | // |
5098 | // If hi1 == hi2 then there was no overflow and the result is hi2:lo1. |
5099 | // Otherwise there was overflow and the result is hi2:0. In both cases the |
5100 | // result should represent the actual time at some point during the sequence |
5101 | // of three getregs. |
5102 | using namespace AMDGPU::Hwreg; |
5103 | Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
5104 | BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1) |
5105 | .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32)); |
5106 | Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
5107 | BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1) |
5108 | .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32)); |
5109 | Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
5110 | BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2) |
5111 | .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32)); |
5112 | BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) |
5113 | .addReg(RegHi1) |
5114 | .addReg(RegHi2); |
5115 | Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
5116 | BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo) |
5117 | .addReg(RegLo1) |
5118 | .addImm(0); |
5119 | BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE)) |
5120 | .add(MI.getOperand(0)) |
5121 | .addReg(RegLo) |
5122 | .addImm(AMDGPU::sub0) |
5123 | .addReg(RegHi2) |
5124 | .addImm(AMDGPU::sub1); |
5125 | MI.eraseFromParent(); |
5126 | return BB; |
5127 | } |
5128 | case AMDGPU::SI_INDIRECT_SRC_V1: |
5129 | case AMDGPU::SI_INDIRECT_SRC_V2: |
5130 | case AMDGPU::SI_INDIRECT_SRC_V4: |
5131 | case AMDGPU::SI_INDIRECT_SRC_V8: |
5132 | case AMDGPU::SI_INDIRECT_SRC_V9: |
5133 | case AMDGPU::SI_INDIRECT_SRC_V10: |
5134 | case AMDGPU::SI_INDIRECT_SRC_V11: |
5135 | case AMDGPU::SI_INDIRECT_SRC_V12: |
5136 | case AMDGPU::SI_INDIRECT_SRC_V16: |
5137 | case AMDGPU::SI_INDIRECT_SRC_V32: |
5138 | return emitIndirectSrc(MI, MBB&: *BB, ST: *getSubtarget()); |
5139 | case AMDGPU::SI_INDIRECT_DST_V1: |
5140 | case AMDGPU::SI_INDIRECT_DST_V2: |
5141 | case AMDGPU::SI_INDIRECT_DST_V4: |
5142 | case AMDGPU::SI_INDIRECT_DST_V8: |
5143 | case AMDGPU::SI_INDIRECT_DST_V9: |
5144 | case AMDGPU::SI_INDIRECT_DST_V10: |
5145 | case AMDGPU::SI_INDIRECT_DST_V11: |
5146 | case AMDGPU::SI_INDIRECT_DST_V12: |
5147 | case AMDGPU::SI_INDIRECT_DST_V16: |
5148 | case AMDGPU::SI_INDIRECT_DST_V32: |
5149 | return emitIndirectDst(MI, MBB&: *BB, ST: *getSubtarget()); |
5150 | case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: |
5151 | case AMDGPU::SI_KILL_I1_PSEUDO: |
5152 | return splitKillBlock(MI, BB); |
5153 | case AMDGPU::V_CNDMASK_B64_PSEUDO: { |
5154 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
5155 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
5156 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
5157 | |
5158 | Register Dst = MI.getOperand(i: 0).getReg(); |
5159 | const MachineOperand &Src0 = MI.getOperand(i: 1); |
5160 | const MachineOperand &Src1 = MI.getOperand(i: 2); |
5161 | const DebugLoc &DL = MI.getDebugLoc(); |
5162 | Register SrcCond = MI.getOperand(i: 3).getReg(); |
5163 | |
5164 | Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
5165 | Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
5166 | const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); |
5167 | Register SrcCondCopy = MRI.createVirtualRegister(CondRC); |
5168 | |
5169 | const TargetRegisterClass *Src0RC = Src0.isReg() |
5170 | ? MRI.getRegClass(Src0.getReg()) |
5171 | : &AMDGPU::VReg_64RegClass; |
5172 | const TargetRegisterClass *Src1RC = Src1.isReg() |
5173 | ? MRI.getRegClass(Src1.getReg()) |
5174 | : &AMDGPU::VReg_64RegClass; |
5175 | |
5176 | const TargetRegisterClass *Src0SubRC = |
5177 | TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); |
5178 | const TargetRegisterClass *Src1SubRC = |
5179 | TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); |
5180 | |
5181 | MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( |
5182 | MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); |
5183 | MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( |
5184 | MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); |
5185 | |
5186 | MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( |
5187 | MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); |
5188 | MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( |
5189 | MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); |
5190 | |
5191 | BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy) |
5192 | .addReg(SrcCond); |
5193 | BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo) |
5194 | .addImm(0) |
5195 | .add(Src0Sub0) |
5196 | .addImm(0) |
5197 | .add(Src1Sub0) |
5198 | .addReg(SrcCondCopy); |
5199 | BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) |
5200 | .addImm(0) |
5201 | .add(Src0Sub1) |
5202 | .addImm(0) |
5203 | .add(Src1Sub1) |
5204 | .addReg(SrcCondCopy); |
5205 | |
5206 | BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst) |
5207 | .addReg(DstLo) |
5208 | .addImm(AMDGPU::sub0) |
5209 | .addReg(DstHi) |
5210 | .addImm(AMDGPU::sub1); |
5211 | MI.eraseFromParent(); |
5212 | return BB; |
5213 | } |
5214 | case AMDGPU::SI_BR_UNDEF: { |
5215 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
5216 | const DebugLoc &DL = MI.getDebugLoc(); |
5217 | MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) |
5218 | .add(MI.getOperand(0)); |
5219 | Br->getOperand(i: 1).setIsUndef(); // read undef SCC |
5220 | MI.eraseFromParent(); |
5221 | return BB; |
5222 | } |
5223 | case AMDGPU::ADJCALLSTACKUP: |
5224 | case AMDGPU::ADJCALLSTACKDOWN: { |
5225 | const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); |
5226 | MachineInstrBuilder MIB(*MF, &MI); |
5227 | MIB.addReg(RegNo: Info->getStackPtrOffsetReg(), flags: RegState::ImplicitDefine) |
5228 | .addReg(RegNo: Info->getStackPtrOffsetReg(), flags: RegState::Implicit); |
5229 | return BB; |
5230 | } |
5231 | case AMDGPU::SI_CALL_ISEL: { |
5232 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
5233 | const DebugLoc &DL = MI.getDebugLoc(); |
5234 | |
5235 | unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(MF: *MF); |
5236 | |
5237 | MachineInstrBuilder MIB; |
5238 | MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg); |
5239 | |
5240 | for (const MachineOperand &MO : MI.operands()) |
5241 | MIB.add(MO); |
5242 | |
5243 | MIB.cloneMemRefs(OtherMI: MI); |
5244 | MI.eraseFromParent(); |
5245 | return BB; |
5246 | } |
5247 | case AMDGPU::V_ADD_CO_U32_e32: |
5248 | case AMDGPU::V_SUB_CO_U32_e32: |
5249 | case AMDGPU::V_SUBREV_CO_U32_e32: { |
5250 | // TODO: Define distinct V_*_I32_Pseudo instructions instead. |
5251 | const DebugLoc &DL = MI.getDebugLoc(); |
5252 | unsigned Opc = MI.getOpcode(); |
5253 | |
5254 | bool NeedClampOperand = false; |
5255 | if (TII->pseudoToMCOpcode(Opcode: Opc) == -1) { |
5256 | Opc = AMDGPU::getVOPe64(Opcode: Opc); |
5257 | NeedClampOperand = true; |
5258 | } |
5259 | |
5260 | auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(i: 0).getReg()); |
5261 | if (TII->isVOP3(*I)) { |
5262 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
5263 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
5264 | I.addReg(TRI->getVCC(), RegState::Define); |
5265 | } |
5266 | I.add(MI.getOperand(i: 1)) |
5267 | .add(MI.getOperand(i: 2)); |
5268 | if (NeedClampOperand) |
5269 | I.addImm(0); // clamp bit for e64 encoding |
5270 | |
5271 | TII->legalizeOperands(MI&: *I); |
5272 | |
5273 | MI.eraseFromParent(); |
5274 | return BB; |
5275 | } |
5276 | case AMDGPU::V_ADDC_U32_e32: |
5277 | case AMDGPU::V_SUBB_U32_e32: |
5278 | case AMDGPU::V_SUBBREV_U32_e32: |
5279 | // These instructions have an implicit use of vcc which counts towards the |
5280 | // constant bus limit. |
5281 | TII->legalizeOperands(MI); |
5282 | return BB; |
5283 | case AMDGPU::DS_GWS_INIT: |
5284 | case AMDGPU::DS_GWS_SEMA_BR: |
5285 | case AMDGPU::DS_GWS_BARRIER: |
5286 | TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0); |
5287 | [[fallthrough]]; |
5288 | case AMDGPU::DS_GWS_SEMA_V: |
5289 | case AMDGPU::DS_GWS_SEMA_P: |
5290 | case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: |
5291 | // A s_waitcnt 0 is required to be the instruction immediately following. |
5292 | if (getSubtarget()->hasGWSAutoReplay()) { |
5293 | bundleInstWithWaitcnt(MI); |
5294 | return BB; |
5295 | } |
5296 | |
5297 | return emitGWSMemViolTestLoop(MI, BB); |
5298 | case AMDGPU::S_SETREG_B32: { |
5299 | // Try to optimize cases that only set the denormal mode or rounding mode. |
5300 | // |
5301 | // If the s_setreg_b32 fully sets all of the bits in the rounding mode or |
5302 | // denormal mode to a constant, we can use s_round_mode or s_denorm_mode |
5303 | // instead. |
5304 | // |
5305 | // FIXME: This could be predicates on the immediate, but tablegen doesn't |
5306 | // allow you to have a no side effect instruction in the output of a |
5307 | // sideeffecting pattern. |
5308 | auto [ID, Offset, Width] = |
5309 | AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm()); |
5310 | if (ID != AMDGPU::Hwreg::ID_MODE) |
5311 | return BB; |
5312 | |
5313 | const unsigned WidthMask = maskTrailingOnes<unsigned>(Width); |
5314 | const unsigned SetMask = WidthMask << Offset; |
5315 | |
5316 | if (getSubtarget()->hasDenormModeInst()) { |
5317 | unsigned SetDenormOp = 0; |
5318 | unsigned SetRoundOp = 0; |
5319 | |
5320 | // The dedicated instructions can only set the whole denorm or round mode |
5321 | // at once, not a subset of bits in either. |
5322 | if (SetMask == |
5323 | (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) { |
5324 | // If this fully sets both the round and denorm mode, emit the two |
5325 | // dedicated instructions for these. |
5326 | SetRoundOp = AMDGPU::S_ROUND_MODE; |
5327 | SetDenormOp = AMDGPU::S_DENORM_MODE; |
5328 | } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) { |
5329 | SetRoundOp = AMDGPU::S_ROUND_MODE; |
5330 | } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) { |
5331 | SetDenormOp = AMDGPU::S_DENORM_MODE; |
5332 | } |
5333 | |
5334 | if (SetRoundOp || SetDenormOp) { |
5335 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
5336 | MachineInstr *Def = MRI.getVRegDef(Reg: MI.getOperand(i: 0).getReg()); |
5337 | if (Def && Def->isMoveImmediate() && Def->getOperand(i: 1).isImm()) { |
5338 | unsigned ImmVal = Def->getOperand(i: 1).getImm(); |
5339 | if (SetRoundOp) { |
5340 | BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp)) |
5341 | .addImm(ImmVal & 0xf); |
5342 | |
5343 | // If we also have the denorm mode, get just the denorm mode bits. |
5344 | ImmVal >>= 4; |
5345 | } |
5346 | |
5347 | if (SetDenormOp) { |
5348 | BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp)) |
5349 | .addImm(ImmVal & 0xf); |
5350 | } |
5351 | |
5352 | MI.eraseFromParent(); |
5353 | return BB; |
5354 | } |
5355 | } |
5356 | } |
5357 | |
5358 | // If only FP bits are touched, used the no side effects pseudo. |
5359 | if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK | |
5360 | AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) |
5361 | MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode)); |
5362 | |
5363 | return BB; |
5364 | } |
5365 | case AMDGPU::S_INVERSE_BALLOT_U32: |
5366 | case AMDGPU::S_INVERSE_BALLOT_U64: { |
5367 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
5368 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
5369 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
5370 | const DebugLoc &DL = MI.getDebugLoc(); |
5371 | const Register DstReg = MI.getOperand(i: 0).getReg(); |
5372 | Register MaskReg = MI.getOperand(i: 1).getReg(); |
5373 | |
5374 | const bool IsVALU = TRI->isVectorRegister(MRI, Reg: MaskReg); |
5375 | |
5376 | if (IsVALU) { |
5377 | MaskReg = TII->readlaneVGPRToSGPR(SrcReg: MaskReg, UseMI&: MI, MRI); |
5378 | } |
5379 | |
5380 | BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg); |
5381 | MI.eraseFromParent(); |
5382 | return BB; |
5383 | } |
5384 | case AMDGPU::ENDPGM_TRAP: { |
5385 | const DebugLoc &DL = MI.getDebugLoc(); |
5386 | if (BB->succ_empty() && std::next(x: MI.getIterator()) == BB->end()) { |
5387 | MI.setDesc(TII->get(AMDGPU::S_ENDPGM)); |
5388 | MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); |
5389 | return BB; |
5390 | } |
5391 | |
5392 | // We need a block split to make the real endpgm a terminator. We also don't |
5393 | // want to break phis in successor blocks, so we can't just delete to the |
5394 | // end of the block. |
5395 | |
5396 | MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/); |
5397 | MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); |
5398 | MF->push_back(MBB: TrapBB); |
5399 | BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM)) |
5400 | .addImm(0); |
5401 | BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) |
5402 | .addMBB(TrapBB); |
5403 | |
5404 | BB->addSuccessor(Succ: TrapBB); |
5405 | MI.eraseFromParent(); |
5406 | return SplitBB; |
5407 | } |
5408 | case AMDGPU::SIMULATED_TRAP: { |
5409 | assert(Subtarget->hasPrivEnabledTrap2NopBug()); |
5410 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
5411 | MachineBasicBlock *SplitBB = |
5412 | TII->insertSimulatedTrap(MRI, MBB&: *BB, MI, DL: MI.getDebugLoc()); |
5413 | MI.eraseFromParent(); |
5414 | return SplitBB; |
5415 | } |
5416 | default: |
5417 | if (TII->isImage(MI) || TII->isMUBUF(MI)) { |
5418 | if (!MI.mayStore()) |
5419 | AddMemOpInit(MI); |
5420 | return BB; |
5421 | } |
5422 | return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, MBB: BB); |
5423 | } |
5424 | } |
5425 | |
5426 | bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { |
5427 | // This currently forces unfolding various combinations of fsub into fma with |
5428 | // free fneg'd operands. As long as we have fast FMA (controlled by |
5429 | // isFMAFasterThanFMulAndFAdd), we should perform these. |
5430 | |
5431 | // When fma is quarter rate, for f64 where add / sub are at best half rate, |
5432 | // most of these combines appear to be cycle neutral but save on instruction |
5433 | // count / code size. |
5434 | return true; |
5435 | } |
5436 | |
5437 | bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; } |
5438 | |
5439 | EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, |
5440 | EVT VT) const { |
5441 | if (!VT.isVector()) { |
5442 | return MVT::i1; |
5443 | } |
5444 | return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); |
5445 | } |
5446 | |
5447 | MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const { |
5448 | // TODO: Should i16 be used always if legal? For now it would force VALU |
5449 | // shifts. |
5450 | return (VT == MVT::i16) ? MVT::i16 : MVT::i32; |
5451 | } |
5452 | |
5453 | LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const { |
5454 | return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts()) |
5455 | ? Ty.changeElementSize(NewEltSize: 16) |
5456 | : Ty.changeElementSize(NewEltSize: 32); |
5457 | } |
5458 | |
5459 | // Answering this is somewhat tricky and depends on the specific device which |
5460 | // have different rates for fma or all f64 operations. |
5461 | // |
5462 | // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other |
5463 | // regardless of which device (although the number of cycles differs between |
5464 | // devices), so it is always profitable for f64. |
5465 | // |
5466 | // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable |
5467 | // only on full rate devices. Normally, we should prefer selecting v_mad_f32 |
5468 | // which we can always do even without fused FP ops since it returns the same |
5469 | // result as the separate operations and since it is always full |
5470 | // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 |
5471 | // however does not support denormals, so we do report fma as faster if we have |
5472 | // a fast fma device and require denormals. |
5473 | // |
5474 | bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, |
5475 | EVT VT) const { |
5476 | VT = VT.getScalarType(); |
5477 | |
5478 | switch (VT.getSimpleVT().SimpleTy) { |
5479 | case MVT::f32: { |
5480 | // If mad is not available this depends only on if f32 fma is full rate. |
5481 | if (!Subtarget->hasMadMacF32Insts()) |
5482 | return Subtarget->hasFastFMAF32(); |
5483 | |
5484 | // Otherwise f32 mad is always full rate and returns the same result as |
5485 | // the separate operations so should be preferred over fma. |
5486 | // However does not support denormals. |
5487 | if (!denormalModeIsFlushAllF32(MF)) |
5488 | return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); |
5489 | |
5490 | // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32. |
5491 | return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts(); |
5492 | } |
5493 | case MVT::f64: |
5494 | return true; |
5495 | case MVT::f16: |
5496 | return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF); |
5497 | default: |
5498 | break; |
5499 | } |
5500 | |
5501 | return false; |
5502 | } |
5503 | |
5504 | bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, |
5505 | LLT Ty) const { |
5506 | switch (Ty.getScalarSizeInBits()) { |
5507 | case 16: |
5508 | return isFMAFasterThanFMulAndFAdd(MF, MVT::f16); |
5509 | case 32: |
5510 | return isFMAFasterThanFMulAndFAdd(MF, MVT::f32); |
5511 | case 64: |
5512 | return isFMAFasterThanFMulAndFAdd(MF, MVT::f64); |
5513 | default: |
5514 | break; |
5515 | } |
5516 | |
5517 | return false; |
5518 | } |
5519 | |
5520 | bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const { |
5521 | if (!Ty.isScalar()) |
5522 | return false; |
5523 | |
5524 | if (Ty.getScalarSizeInBits() == 16) |
5525 | return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(MF: *MI.getMF()); |
5526 | if (Ty.getScalarSizeInBits() == 32) |
5527 | return Subtarget->hasMadMacF32Insts() && |
5528 | denormalModeIsFlushAllF32(MF: *MI.getMF()); |
5529 | |
5530 | return false; |
5531 | } |
5532 | |
5533 | bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG, |
5534 | const SDNode *N) const { |
5535 | // TODO: Check future ftz flag |
5536 | // v_mad_f32/v_mac_f32 do not support denormals. |
5537 | EVT VT = N->getValueType(ResNo: 0); |
5538 | if (VT == MVT::f32) |
5539 | return Subtarget->hasMadMacF32Insts() && |
5540 | denormalModeIsFlushAllF32(MF: DAG.getMachineFunction()); |
5541 | if (VT == MVT::f16) { |
5542 | return Subtarget->hasMadF16() && |
5543 | denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction()); |
5544 | } |
5545 | |
5546 | return false; |
5547 | } |
5548 | |
5549 | //===----------------------------------------------------------------------===// |
5550 | // Custom DAG Lowering Operations |
5551 | //===----------------------------------------------------------------------===// |
5552 | |
5553 | // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the |
5554 | // wider vector type is legal. |
5555 | SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, |
5556 | SelectionDAG &DAG) const { |
5557 | unsigned Opc = Op.getOpcode(); |
5558 | EVT VT = Op.getValueType(); |
5559 | assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || |
5560 | VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || |
5561 | VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || |
5562 | VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); |
5563 | |
5564 | SDValue Lo, Hi; |
5565 | std::tie(args&: Lo, args&: Hi) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0); |
5566 | |
5567 | SDLoc SL(Op); |
5568 | SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: Lo.getValueType(), Operand: Lo, |
5569 | Flags: Op->getFlags()); |
5570 | SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: Hi.getValueType(), Operand: Hi, |
5571 | Flags: Op->getFlags()); |
5572 | |
5573 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi); |
5574 | } |
5575 | |
5576 | // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the |
5577 | // wider vector type is legal. |
5578 | SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, |
5579 | SelectionDAG &DAG) const { |
5580 | unsigned Opc = Op.getOpcode(); |
5581 | EVT VT = Op.getValueType(); |
5582 | assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || |
5583 | VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || |
5584 | VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || |
5585 | VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); |
5586 | |
5587 | SDValue Lo0, Hi0; |
5588 | std::tie(args&: Lo0, args&: Hi0) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0); |
5589 | SDValue Lo1, Hi1; |
5590 | std::tie(args&: Lo1, args&: Hi1) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1); |
5591 | |
5592 | SDLoc SL(Op); |
5593 | |
5594 | SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: Lo0.getValueType(), N1: Lo0, N2: Lo1, |
5595 | Flags: Op->getFlags()); |
5596 | SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: Hi0.getValueType(), N1: Hi0, N2: Hi1, |
5597 | Flags: Op->getFlags()); |
5598 | |
5599 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi); |
5600 | } |
5601 | |
5602 | SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, |
5603 | SelectionDAG &DAG) const { |
5604 | unsigned Opc = Op.getOpcode(); |
5605 | EVT VT = Op.getValueType(); |
5606 | assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || |
5607 | VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 || |
5608 | VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || |
5609 | VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 || |
5610 | VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 || |
5611 | VT == MVT::v32bf16); |
5612 | |
5613 | SDValue Lo0, Hi0; |
5614 | SDValue Op0 = Op.getOperand(i: 0); |
5615 | std::tie(args&: Lo0, args&: Hi0) = Op0.getValueType().isVector() |
5616 | ? DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0) |
5617 | : std::pair(Op0, Op0); |
5618 | SDValue Lo1, Hi1; |
5619 | std::tie(args&: Lo1, args&: Hi1) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1); |
5620 | SDValue Lo2, Hi2; |
5621 | std::tie(args&: Lo2, args&: Hi2) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 2); |
5622 | |
5623 | SDLoc SL(Op); |
5624 | auto ResVT = DAG.GetSplitDestVTs(VT); |
5625 | |
5626 | SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.first, N1: Lo0, N2: Lo1, N3: Lo2, |
5627 | Flags: Op->getFlags()); |
5628 | SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.second, N1: Hi0, N2: Hi1, N3: Hi2, |
5629 | Flags: Op->getFlags()); |
5630 | |
5631 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi); |
5632 | } |
5633 | |
5634 | |
/// Central dispatch for custom DAG lowering of operations marked Custom in
/// the SI target's operation actions. Returns the lowered node, SDValue()
/// when the generic legalizer should expand the node instead, or forwards to
/// the AMDGPU base class for anything not handled here.
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain" );
    return Result;
  }
  case ISD::FSQRT: {
    // Only f32/f64 sqrt get a custom expansion; other types fall back to the
    // generic legalizer.
    EVT VT = Op.getValueType();
    if (VT == MVT::f32)
      return lowerFSQRTF32(Op, DAG);
    if (VT == MVT::f64)
      return lowerFSQRTF64(Op, DAG);
    return SDValue();
  }
  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::FDIV: return LowerFDIV(Op, DAG);
  case ISD::FFREXP: return LowerFFREXP(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: {
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    return LowerGlobalAddress(MFI, Op, DAG);
  }
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
  case ISD::INSERT_SUBVECTOR:
    return lowerINSERT_SUBVECTOR(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return lowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
    return lowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:
    return lowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::BUILD_VECTOR:
    return lowerBUILD_VECTOR(Op, DAG);
  case ISD::FP_ROUND:
  case ISD::STRICT_FP_ROUND:
    return lowerFP_ROUND(Op, DAG);
  case ISD::FPTRUNC_ROUND: {
    // Lower llvm.fptrunc.round to a direction-specific target node. Only
    // f32 sources and upward/downward rounding are supported; everything
    // else is left for default handling.
    unsigned Opc;
    SDLoc DL(Op);

    if (Op.getOperand(0)->getValueType(0) != MVT::f32)
      return SDValue();

    // Get the rounding mode from the last operand
    int RoundMode = Op.getConstantOperandVal(i: 1);
    if (RoundMode == (int)RoundingMode::TowardPositive)
      Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
    else if (RoundMode == (int)RoundingMode::TowardNegative)
      Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
    else
      return SDValue();

    return DAG.getNode(Opcode: Opc, DL, VTList: Op.getNode()->getVTList(), N: Op->getOperand(Num: 0));
  }
  case ISD::TRAP:
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
  // Wide-vector unary ops: split in half to avoid full scalarization.
  case ISD::FABS:
  case ISD::FNEG:
  case ISD::FCANONICALIZE:
  case ISD::BSWAP:
    return splitUnaryVectorOp(Op, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
    return lowerFMINNUM_FMAXNUM(Op, DAG);
  case ISD::FLDEXP:
  case ISD::STRICT_FLDEXP:
    return lowerFLDEXP(Op, DAG);
  case ISD::FMA:
    return splitTernaryVectorOp(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  // Wide-vector binary ops: split in half to avoid full scalarization.
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
  case ISD::FADD:
  case ISD::FMUL:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::UADDSAT:
  case ISD::USUBSAT:
  case ISD::SADDSAT:
  case ISD::SSUBSAT:
    return splitBinaryVectorOp(Op, DAG);
  case ISD::MUL:
    return lowerMUL(Op, DAG);
  case ISD::SMULO:
  case ISD::UMULO:
    return lowerXMULO(Op, DAG);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return lowerXMUL_LOHI(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::STACKSAVE:
    return LowerSTACKSAVE(Op, DAG);
  case ISD::GET_ROUNDING:
    return lowerGET_ROUNDING(Op, DAG);
  case ISD::PREFETCH:
    return lowerPREFETCH(Op, DAG);
  case ISD::FP_EXTEND:
  case ISD::STRICT_FP_EXTEND:
    return lowerFP_EXTEND(Op, DAG);
  case ISD::GET_FPENV:
    return lowerGET_FPENV(Op, DAG);
  case ISD::SET_FPENV:
    return lowerSET_FPENV(Op, DAG);
  }
  // Unreachable: every case above returns; kept to silence fallthrough
  // warnings from compilers that cannot prove switch exhaustiveness here.
  return SDValue();
}
5769 | |
// Used for D16: Casts the result of an instruction into the right vector,
// packs values if loads return unpacked values.
//
// \p Result is the raw node produced by the memory operation, \p LoadVT the
// type the caller originally requested (e.g. v2f16/v3f16/v4f16). When
// \p Unpacked is set, each 16-bit element arrives in its own 32-bit lane and
// must be truncated and re-packed.
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
                                       const SDLoc &DL,
                                       SelectionDAG &DAG, bool Unpacked) {
  // Scalar results need no repacking.
  if (!LoadVT.isVector())
    return Result;

  // Cast back to the original packed type or to a larger type that is a
  // multiple of 32 bit for D16. Widening the return type is required for
  // legalization.
  EVT FittingLoadVT = LoadVT;
  if ((LoadVT.getVectorNumElements() % 2) == 1) {
    FittingLoadVT =
        EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
                         NumElements: LoadVT.getVectorNumElements() + 1);
  }

  if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
    // Truncate to v2i16/v4i16.
    EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();

    // Workaround legalizer not scalarizing truncate after vector op
    // legalization but not creating intermediate vector trunc.
    SmallVector<SDValue, 4> Elts;
    DAG.ExtractVectorElements(Op: Result, Args&: Elts);
    for (SDValue &Elt : Elts)
      Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);

    // Pad illegal v1i16/v3f16 to v4i16
    if ((LoadVT.getVectorNumElements() % 2) == 1)
      Elts.push_back(DAG.getUNDEF(MVT::i16));

    Result = DAG.getBuildVector(VT: IntLoadVT, DL, Ops: Elts);

    // Bitcast to original type (v2f16/v4f16).
    return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
  }

  // Cast back to the original packed type.
  return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
}
5812 | |
/// Emit a D16 memory intrinsic with a widened, legal result type and adjust
/// the returned value back to the type the caller requested.
///
/// On subtargets with unpacked D16 VMem, 16-bit elements each occupy a full
/// 32-bit lane, so the node is built with an i32 vector result; otherwise an
/// odd element count (e.g. v3f16) is widened by one element for legality.
/// Returns merged {adjusted value, chain}.
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
                                              MemSDNode *M,
                                              SelectionDAG &DAG,
                                              ArrayRef<SDValue> Ops,
                                              bool IsIntrinsic) const {
  SDLoc DL(M);

  bool Unpacked = Subtarget->hasUnpackedD16VMem();
  EVT LoadVT = M->getValueType(ResNo: 0);

  EVT EquivLoadVT = LoadVT;
  if (LoadVT.isVector()) {
    if (Unpacked) {
      // One i32 lane per 16-bit element.
      EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                     LoadVT.getVectorNumElements());
    } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
      // Widen v3f16 to legal type
      EquivLoadVT =
          EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
                           NumElements: LoadVT.getVectorNumElements() + 1);
    }
  }

  // Change from v4f16/v2f16 to EquivLoadVT.
  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);

  SDValue Load
    = DAG.getMemIntrinsicNode(
      Opcode: IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, dl: DL,
      VTList, Ops, MemVT: M->getMemoryVT(),
      MMO: M->getMemOperand());

  // Repack/truncate the widened result back to the requested type.
  SDValue Adjusted = adjustLoadValueTypeImpl(Result: Load, LoadVT, DL, DAG, Unpacked);

  return DAG.getMergeValues(Ops: { Adjusted, Load.getValue(R: 1) }, dl: DL);
}
5849 | |
/// Lower a buffer-load intrinsic to the appropriate AMDGPUISD buffer-load
/// node, handling D16 (16-bit format loads), TFE (extra status result),
/// sub-dword scalars, and illegal result types.
///
/// \p IsFormat selects the format-load variants; \p Ops are the operands for
/// the target node. Returns SDValue() for combinations not yet supported
/// (non-format TFE loads).
SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
                                             SelectionDAG &DAG,
                                             ArrayRef<SDValue> Ops) const {
  SDLoc DL(M);
  EVT LoadVT = M->getValueType(ResNo: 0);
  EVT EltType = LoadVT.getScalarType();
  EVT IntVT = LoadVT.changeTypeToInteger();

  // D16 only applies to 16-bit elements of format loads.
  bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);

  // A third result value indicates TFE (texture-fail-enable) status.
  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;

  unsigned Opc;
  if (IsFormat) {
    Opc = IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
                : AMDGPUISD::BUFFER_LOAD_FORMAT;
  } else {
    // TODO: Support non-format TFE loads.
    if (IsTFE)
      return SDValue();
    Opc = AMDGPUISD::BUFFER_LOAD;
  }

  if (IsD16) {
    return adjustLoadValueType(Opcode: AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
  }

  // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
  if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, MMO: M->getMemOperand());

  if (isTypeLegal(VT: LoadVT)) {
    return getMemIntrinsicNode(Opcode: Opc, DL, VTList: M->getVTList(), Ops, MemVT: IntVT,
                               MMO: M->getMemOperand(), DAG);
  }

  // Illegal result type: load as the equivalent legal integer type and
  // bitcast back.
  EVT CastVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: LoadVT);
  SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
  SDValue MemNode = getMemIntrinsicNode(Opcode: Opc, DL, VTList, Ops, MemVT: CastVT,
                                        MMO: M->getMemOperand(), DAG);
  return DAG.getMergeValues(
      Ops: {DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: MemNode), MemNode.getValue(R: 1)},
      dl: DL);
}
5895 | |
5896 | static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, |
5897 | SDNode *N, SelectionDAG &DAG) { |
5898 | EVT VT = N->getValueType(ResNo: 0); |
5899 | unsigned CondCode = N->getConstantOperandVal(Num: 3); |
5900 | if (!ICmpInst::isIntPredicate(P: static_cast<ICmpInst::Predicate>(CondCode))) |
5901 | return DAG.getUNDEF(VT); |
5902 | |
5903 | ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); |
5904 | |
5905 | SDValue LHS = N->getOperand(Num: 1); |
5906 | SDValue RHS = N->getOperand(Num: 2); |
5907 | |
5908 | SDLoc DL(N); |
5909 | |
5910 | EVT CmpVT = LHS.getValueType(); |
5911 | if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) { |
5912 | unsigned PromoteOp = ICmpInst::isSigned(predicate: IcInput) ? |
5913 | ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
5914 | LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS); |
5915 | RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS); |
5916 | } |
5917 | |
5918 | ISD::CondCode CCOpcode = getICmpCondCode(Pred: IcInput); |
5919 | |
5920 | unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); |
5921 | EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize); |
5922 | |
5923 | SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS, |
5924 | N3: DAG.getCondCode(Cond: CCOpcode)); |
5925 | if (VT.bitsEq(VT: CCVT)) |
5926 | return SetCC; |
5927 | return DAG.getZExtOrTrunc(Op: SetCC, DL, VT); |
5928 | } |
5929 | |
5930 | static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, |
5931 | SDNode *N, SelectionDAG &DAG) { |
5932 | EVT VT = N->getValueType(ResNo: 0); |
5933 | |
5934 | unsigned CondCode = N->getConstantOperandVal(Num: 3); |
5935 | if (!FCmpInst::isFPPredicate(P: static_cast<FCmpInst::Predicate>(CondCode))) |
5936 | return DAG.getUNDEF(VT); |
5937 | |
5938 | SDValue Src0 = N->getOperand(Num: 1); |
5939 | SDValue Src1 = N->getOperand(Num: 2); |
5940 | EVT CmpVT = Src0.getValueType(); |
5941 | SDLoc SL(N); |
5942 | |
5943 | if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) { |
5944 | Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0); |
5945 | Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1); |
5946 | } |
5947 | |
5948 | FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode); |
5949 | ISD::CondCode CCOpcode = getFCmpCondCode(Pred: IcInput); |
5950 | unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); |
5951 | EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize); |
5952 | SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT: CCVT, N1: Src0, |
5953 | N2: Src1, N3: DAG.getCondCode(Cond: CCOpcode)); |
5954 | if (VT.bitsEq(VT: CCVT)) |
5955 | return SetCC; |
5956 | return DAG.getZExtOrTrunc(Op: SetCC, DL: SL, VT); |
5957 | } |
5958 | |
5959 | static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, |
5960 | SelectionDAG &DAG) { |
5961 | EVT VT = N->getValueType(ResNo: 0); |
5962 | SDValue Src = N->getOperand(Num: 1); |
5963 | SDLoc SL(N); |
5964 | |
5965 | if (Src.getOpcode() == ISD::SETCC) { |
5966 | // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...) |
5967 | return DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: Src.getOperand(i: 0), |
5968 | N2: Src.getOperand(i: 1), N3: Src.getOperand(i: 2)); |
5969 | } |
5970 | if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Val&: Src)) { |
5971 | // (ballot 0) -> 0 |
5972 | if (Arg->isZero()) |
5973 | return DAG.getConstant(Val: 0, DL: SL, VT); |
5974 | |
5975 | // (ballot 1) -> EXEC/EXEC_LO |
5976 | if (Arg->isOne()) { |
5977 | Register Exec; |
5978 | if (VT.getScalarSizeInBits() == 32) |
5979 | Exec = AMDGPU::EXEC_LO; |
5980 | else if (VT.getScalarSizeInBits() == 64) |
5981 | Exec = AMDGPU::EXEC; |
5982 | else |
5983 | return SDValue(); |
5984 | |
5985 | return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: Exec, VT); |
5986 | } |
5987 | } |
5988 | |
5989 | // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0) |
5990 | // ISD::SETNE) |
5991 | return DAG.getNode( |
5992 | AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32), |
5993 | DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE)); |
5994 | } |
5995 | |
// Replace the results of node \p N, whose result type is illegal, with legal
// values, appending one SDValue per original result to \p Results. Leaving
// \p Results empty tells the common legalizer to fall back to its default
// expansion for that case.
void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                          SmallVectorImpl<SDValue> &Results,
                                          SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::INSERT_VECTOR_ELT: {
    if (SDValue Res = lowerINSERT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Res);
    return;
  }
  case ISD::EXTRACT_VECTOR_ELT: {
    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Res);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = N->getConstantOperandVal(Num: 0);
    switch (IID) {
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(Elt: lowerPointerAsRsrcIntrin(Op: N, DAG));
      return;
    case Intrinsic::amdgcn_cvt_pkrtz: {
      SDValue Src0 = N->getOperand(Num: 1);
      SDValue Src1 = N->getOperand(Num: 2);
      SDLoc SL(N);
      // The node produces the two packed halves in an i32; bitcast back to
      // the expected v2f16 result type.
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
                                Src0, Src1);
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
      return;
    }
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      SDValue Src0 = N->getOperand(Num: 1);
      SDValue Src1 = N->getOperand(Num: 2);
      SDLoc SL(N);
      unsigned Opcode;

      // Map each pack intrinsic to its target node.
      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
        Opcode = AMDGPUISD::CVT_PK_I16_I32;
      else
        Opcode = AMDGPUISD::CVT_PK_U16_U32;

      // If v2i16 is illegal, produce the i32 form and bitcast.
      EVT VT = N->getValueType(ResNo: 0);
      if (isTypeLegal(VT))
        Results.push_back(Elt: DAG.getNode(Opcode, DL: SL, VT, N1: Src0, N2: Src1));
      else {
        SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
        Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
      }
      return;
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
      // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
      // combiner tries to merge the s_buffer_load_u8 with a sext instruction
      // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
      // s_buffer_load_i8.
      if (!Subtarget->hasScalarSubwordLoads())
        return;
      SDValue Op = SDValue(N, 0);
      SDValue Rsrc = Op.getOperand(i: 1);
      SDValue Offset = Op.getOperand(i: 2);
      SDValue CachePolicy = Op.getOperand(i: 3);
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n" );
      SDLoc DL(Op);
      MachineFunction &MF = DAG.getMachineFunction();
      const DataLayout &DataLayout = DAG.getDataLayout();
      Align Alignment =
          DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
      // The buffer contents are read-only for the purposes of this load.
      MachineMemOperand *MMO = MF.getMachineMemOperand(
          PtrInfo: MachinePointerInfo(),
          F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
              MachineMemOperand::MOInvariant,
          Size: VT.getStoreSize(), BaseAlignment: Alignment);
      SDValue LoadVal;
      if (!Offset->isDivergent()) {
        // Uniform offset: use the scalar buffer load and truncate the i32
        // result back to i8.
        SDValue Ops[] = {Rsrc, // source register
                         Offset, CachePolicy};
        SDValue BufferLoad =
            DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
                                    DAG.getVTList(MVT::i32), Ops, VT, MMO);
        LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
      } else {
        // Divergent offset: fall back to a VMEM buffer load. The three empty
        // operands (voffset/soffset/offset) are filled in by
        // setBufferOffsets() below.
        SDValue Ops[] = {
            DAG.getEntryNode(),                    // Chain
            Rsrc,                                  // rsrc
            DAG.getConstant(0, DL, MVT::i32),      // vindex
            {},                                    // voffset
            {},                                    // soffset
            {},                                    // offset
            CachePolicy,                           // cachepolicy
            DAG.getTargetConstant(0, DL, MVT::i1), // idxen
        };
        setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
        LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
      }
      Results.push_back(Elt: LoadVal);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    if (SDValue Res = LowerINTRINSIC_W_CHAIN(Op: SDValue(N, 0), DAG)) {
      if (Res.getOpcode() == ISD::MERGE_VALUES) {
        // FIXME: Hacky
        for (unsigned I = 0; I < Res.getNumOperands(); I++) {
          Results.push_back(Elt: Res.getOperand(i: I));
        }
      } else {
        // Push the value and its chain separately.
        Results.push_back(Elt: Res);
        Results.push_back(Elt: Res.getValue(R: 1));
      }
      return;
    }

    break;
  }
  case ISD::SELECT: {
    // Select on an illegal type: bitcast the operands to an equivalent
    // integer type, select there, and bitcast back.
    SDLoc SL(N);
    EVT VT = N->getValueType(ResNo: 0);
    EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT);
    SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 1));
    SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 2));

    // Sub-32-bit selects are widened to i32 and truncated afterwards.
    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
      RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
      SelectVT = MVT::i32;
    }

    SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: SelectVT,
                                    N1: N->getOperand(Num: 0), N2: LHS, N3: RHS);

    if (NewVT != SelectVT)
      NewSelect = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: NewVT, Operand: NewSelect);
    Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewSelect));
    return;
  }
  case ISD::FNEG: {
    if (N->getValueType(0) != MVT::v2f16)
      break;

    // fneg (v2f16 x) -> xor both sign bits in the i32 view.
    SDLoc SL(N);
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

    SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
                             BC,
                             DAG.getConstant(0x80008000, SL, MVT::i32));
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    return;
  }
  case ISD::FABS: {
    if (N->getValueType(0) != MVT::v2f16)
      break;

    // fabs (v2f16 x) -> clear both sign bits in the i32 view.
    SDLoc SL(N);
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

    SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
                             BC,
                             DAG.getConstant(0x7fff7fff, SL, MVT::i32));
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    return;
  }
  case ISD::FSQRT: {
    if (N->getValueType(0) != MVT::f16)
      break;
    Results.push_back(Elt: lowerFSQRTF16(Op: SDValue(N, 0), DAG));
    break;
  }
  default:
    // Defer to the generic AMDGPU handling for anything not special-cased
    // above.
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    break;
  }
}
6179 | |
6180 | /// Helper function for LowerBRCOND |
6181 | static SDNode *findUser(SDValue Value, unsigned Opcode) { |
6182 | |
6183 | SDNode *Parent = Value.getNode(); |
6184 | for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); |
6185 | I != E; ++I) { |
6186 | |
6187 | if (I.getUse().get() != Value) |
6188 | continue; |
6189 | |
6190 | if (I->getOpcode() == Opcode) |
6191 | return *I; |
6192 | } |
6193 | return nullptr; |
6194 | } |
6195 | |
6196 | unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { |
6197 | if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) { |
6198 | switch (Intr->getConstantOperandVal(Num: 1)) { |
6199 | case Intrinsic::amdgcn_if: |
6200 | return AMDGPUISD::IF; |
6201 | case Intrinsic::amdgcn_else: |
6202 | return AMDGPUISD::ELSE; |
6203 | case Intrinsic::amdgcn_loop: |
6204 | return AMDGPUISD::LOOP; |
6205 | case Intrinsic::amdgcn_end_cf: |
6206 | llvm_unreachable("should not occur" ); |
6207 | default: |
6208 | return 0; |
6209 | } |
6210 | } |
6211 | |
6212 | // break, if_break, else_break are all only used as inputs to loop, not |
6213 | // directly as branch conditions. |
6214 | return 0; |
6215 | } |
6216 | |
6217 | bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { |
6218 | const Triple &TT = getTargetMachine().getTargetTriple(); |
6219 | return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || |
6220 | GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && |
6221 | AMDGPU::shouldEmitConstantsToTextSection(TT); |
6222 | } |
6223 | |
6224 | bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { |
6225 | if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) |
6226 | return false; |
6227 | |
6228 | // FIXME: Either avoid relying on address space here or change the default |
6229 | // address space for functions to avoid the explicit check. |
6230 | return (GV->getValueType()->isFunctionTy() || |
6231 | !isNonGlobalAddrSpace(AS: GV->getAddressSpace())) && |
6232 | !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV); |
6233 | } |
6234 | |
6235 | bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const { |
6236 | return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV); |
6237 | } |
6238 | |
6239 | bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const { |
6240 | if (!GV->hasExternalLinkage()) |
6241 | return true; |
6242 | |
6243 | const auto OS = getTargetMachine().getTargetTriple().getOS(); |
6244 | return OS == Triple::AMDHSA || OS == Triple::AMDPAL; |
6245 | } |
6246 | |
/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arise
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                      SelectionDAG &DAG) const {
  SDLoc DL(BRCOND);

  // Operand 1 is the condition (possibly wrapped in a SETCC against 1),
  // operand 2 the branch target.
  SDNode *Intr = BRCOND.getOperand(i: 1).getNode();
  SDValue Target = BRCOND.getOperand(i: 2);
  SDNode *BR = nullptr;
  SDNode *SetCC = nullptr;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    SetCC = Intr;
    Intr = SetCC->getOperand(Num: 0).getNode();

  } else {
    // Get the target from BR if we don't negate the condition
    BR = findUser(Value: BRCOND, Opcode: ISD::BR);
    assert(BR && "brcond missing unconditional branch user" );
    Target = BR->getOperand(Num: 1);
  }

  unsigned CFNode = isCFIntrinsic(Intr);
  if (CFNode == 0) {
    // This is a uniform branch so we don't need to legalize.
    return BRCOND;
  }

  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;

  // The only SETCC shape expected here is (setcc cond, 1, setne), i.e. the
  // condition is simply negated.
  assert(!SetCC ||
        (SetCC->getConstantOperandVal(1) == 1 &&
         cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
                           ISD::SETNE));

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  if (HaveChain)
    Ops.push_back(Elt: BRCOND.getOperand(i: 0));

  // Skip the chain (if present) and the intrinsic id; keep the remaining
  // intrinsic operands, then append the branch target last.
  Ops.append(in_start: Intr->op_begin() + (HaveChain ? 2 : 1), in_end: Intr->op_end());
  Ops.push_back(Elt: Target);

  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(Opcode: CFNode, DL, VTList: DAG.getVTList(VTs: Res), Ops).getNode();

  if (!HaveChain) {
    // Splice the BRCOND's chain onto the chain-less intrinsic result.
    SDValue Ops[] = {
      SDValue(Result, 0),
      BRCOND.getOperand(i: 0)
    };

    Result = DAG.getMergeValues(Ops, dl: DL).getNode();
  }

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {
      BR->getOperand(Num: 0),
      BRCOND.getOperand(i: 2)
    };
    SDValue NewBR = DAG.getNode(Opcode: ISD::BR, DL, VTList: BR->getVTList(), Ops);
    DAG.ReplaceAllUsesWith(From: BR, To: NewBR.getNode());
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(Value: SDValue(Intr, i), Opcode: ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(
      Chain, dl: DL,
      Reg: CopyToReg->getOperand(Num: 1),
      N: SDValue(Result, i - 1),
      Glue: SDValue());

    DAG.ReplaceAllUsesWith(From: SDValue(CopyToReg, 0), To: CopyToReg->getOperand(Num: 0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(
    From: SDValue(Intr, Intr->getNumValues() - 1),
    To: Intr->getOperand(Num: 0));

  return Chain;
}
6340 | |
6341 | SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, |
6342 | SelectionDAG &DAG) const { |
6343 | MVT VT = Op.getSimpleValueType(); |
6344 | SDLoc DL(Op); |
6345 | // Checking the depth |
6346 | if (Op.getConstantOperandVal(i: 0) != 0) |
6347 | return DAG.getConstant(Val: 0, DL, VT); |
6348 | |
6349 | MachineFunction &MF = DAG.getMachineFunction(); |
6350 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
6351 | // Check for kernel and shader functions |
6352 | if (Info->isEntryFunction()) |
6353 | return DAG.getConstant(Val: 0, DL, VT); |
6354 | |
6355 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
6356 | // There is a call to @llvm.returnaddress in this function |
6357 | MFI.setReturnAddressIsTaken(true); |
6358 | |
6359 | const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); |
6360 | // Get the return address reg and mark it as an implicit live-in |
6361 | Register Reg = MF.addLiveIn(PReg: TRI->getReturnAddressReg(MF), RC: getRegClassFor(VT, isDivergent: Op.getNode()->isDivergent())); |
6362 | |
6363 | return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT); |
6364 | } |
6365 | |
6366 | SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, |
6367 | SDValue Op, |
6368 | const SDLoc &DL, |
6369 | EVT VT) const { |
6370 | return Op.getValueType().bitsLE(VT) ? |
6371 | DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) : |
6372 | DAG.getNode(ISD::FP_ROUND, DL, VT, Op, |
6373 | DAG.getTargetConstant(0, DL, MVT::i32)); |
6374 | } |
6375 | |
6376 | SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { |
6377 | assert(Op.getValueType() == MVT::f16 && |
6378 | "Do not know how to custom lower FP_ROUND for non-f16 type" ); |
6379 | |
6380 | SDValue Src = Op.getOperand(i: 0); |
6381 | EVT SrcVT = Src.getValueType(); |
6382 | if (SrcVT != MVT::f64) |
6383 | return Op; |
6384 | |
6385 | // TODO: Handle strictfp |
6386 | if (Op.getOpcode() != ISD::FP_ROUND) |
6387 | return Op; |
6388 | |
6389 | SDLoc DL(Op); |
6390 | |
6391 | SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src); |
6392 | SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); |
6393 | return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); |
6394 | } |
6395 | |
6396 | SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, |
6397 | SelectionDAG &DAG) const { |
6398 | EVT VT = Op.getValueType(); |
6399 | const MachineFunction &MF = DAG.getMachineFunction(); |
6400 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
6401 | bool IsIEEEMode = Info->getMode().IEEE; |
6402 | |
6403 | // FIXME: Assert during selection that this is only selected for |
6404 | // ieee_mode. Currently a combine can produce the ieee version for non-ieee |
6405 | // mode functions, but this happens to be OK since it's only done in cases |
6406 | // where there is known no sNaN. |
6407 | if (IsIEEEMode) |
6408 | return expandFMINNUM_FMAXNUM(N: Op.getNode(), DAG); |
6409 | |
6410 | if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 || |
6411 | VT == MVT::v16bf16) |
6412 | return splitBinaryVectorOp(Op, DAG); |
6413 | return Op; |
6414 | } |
6415 | |
6416 | SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const { |
6417 | bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP; |
6418 | EVT VT = Op.getValueType(); |
6419 | assert(VT == MVT::f16); |
6420 | |
6421 | SDValue Exp = Op.getOperand(i: IsStrict ? 2 : 1); |
6422 | EVT ExpVT = Exp.getValueType(); |
6423 | if (ExpVT == MVT::i16) |
6424 | return Op; |
6425 | |
6426 | SDLoc DL(Op); |
6427 | |
6428 | // Correct the exponent type for f16 to i16. |
6429 | // Clamp the range of the exponent to the instruction's range. |
6430 | |
6431 | // TODO: This should be a generic narrowing legalization, and can easily be |
6432 | // for GlobalISel. |
6433 | |
6434 | SDValue MinExp = DAG.getConstant(Val: minIntN(N: 16), DL, VT: ExpVT); |
6435 | SDValue ClampMin = DAG.getNode(Opcode: ISD::SMAX, DL, VT: ExpVT, N1: Exp, N2: MinExp); |
6436 | |
6437 | SDValue MaxExp = DAG.getConstant(Val: maxIntN(N: 16), DL, VT: ExpVT); |
6438 | SDValue Clamp = DAG.getNode(Opcode: ISD::SMIN, DL, VT: ExpVT, N1: ClampMin, N2: MaxExp); |
6439 | |
6440 | SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp); |
6441 | |
6442 | if (IsStrict) { |
6443 | return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other}, |
6444 | {Op.getOperand(0), Op.getOperand(1), TruncExp}); |
6445 | } |
6446 | |
6447 | return DAG.getNode(Opcode: ISD::FLDEXP, DL, VT, N1: Op.getOperand(i: 0), N2: TruncExp); |
6448 | } |
6449 | |
6450 | // Custom lowering for vector multiplications and s_mul_u64. |
6451 | SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const { |
6452 | EVT VT = Op.getValueType(); |
6453 | |
6454 | // Split vector operands. |
6455 | if (VT.isVector()) |
6456 | return splitBinaryVectorOp(Op, DAG); |
6457 | |
6458 | assert(VT == MVT::i64 && "The following code is a special for s_mul_u64" ); |
6459 | |
6460 | // There are four ways to lower s_mul_u64: |
6461 | // |
6462 | // 1. If all the operands are uniform, then we lower it as it is. |
6463 | // |
6464 | // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit |
6465 | // multiplications because there is not a vector equivalent of s_mul_u64. |
6466 | // |
6467 | // 3. If the cost model decides that it is more efficient to use vector |
6468 | // registers, then we have to split s_mul_u64 in 32-bit multiplications. |
6469 | // This happens in splitScalarSMULU64() in SIInstrInfo.cpp . |
6470 | // |
6471 | // 4. If the cost model decides to use vector registers and both of the |
6472 | // operands are zero-extended/sign-extended from 32-bits, then we split the |
6473 | // s_mul_u64 in two 32-bit multiplications. The problem is that it is not |
6474 | // possible to check if the operands are zero-extended or sign-extended in |
6475 | // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with |
6476 | // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace |
6477 | // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended. |
6478 | // If the cost model decides that we have to use vector registers, then |
6479 | // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/ |
6480 | // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model |
6481 | // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/ |
6482 | // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in |
6483 | // SIInstrInfo.cpp . |
6484 | |
6485 | if (Op->isDivergent()) |
6486 | return SDValue(); |
6487 | |
6488 | SDValue Op0 = Op.getOperand(i: 0); |
6489 | SDValue Op1 = Op.getOperand(i: 1); |
6490 | // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64 |
6491 | // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to |
6492 | // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo. |
6493 | KnownBits Op0KnownBits = DAG.computeKnownBits(Op: Op0); |
6494 | unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros(); |
6495 | KnownBits Op1KnownBits = DAG.computeKnownBits(Op: Op1); |
6496 | unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros(); |
6497 | SDLoc SL(Op); |
6498 | if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32) |
6499 | return SDValue( |
6500 | DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0); |
6501 | unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op0); |
6502 | unsigned Op1SignBits = DAG.ComputeNumSignBits(Op: Op1); |
6503 | if (Op0SignBits >= 33 && Op1SignBits >= 33) |
6504 | return SDValue( |
6505 | DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0); |
6506 | // If all the operands are uniform, then we lower s_mul_u64 as it is. |
6507 | return Op; |
6508 | } |
6509 | |
6510 | SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const { |
6511 | EVT VT = Op.getValueType(); |
6512 | SDLoc SL(Op); |
6513 | SDValue LHS = Op.getOperand(i: 0); |
6514 | SDValue RHS = Op.getOperand(i: 1); |
6515 | bool isSigned = Op.getOpcode() == ISD::SMULO; |
6516 | |
6517 | if (ConstantSDNode *RHSC = isConstOrConstSplat(N: RHS)) { |
6518 | const APInt &C = RHSC->getAPIntValue(); |
6519 | // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X } |
6520 | if (C.isPowerOf2()) { |
6521 | // smulo(x, signed_min) is same as umulo(x, signed_min). |
6522 | bool UseArithShift = isSigned && !C.isMinSignedValue(); |
6523 | SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32); |
6524 | SDValue Result = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: LHS, N2: ShiftAmt); |
6525 | SDValue Overflow = DAG.getSetCC(SL, MVT::i1, |
6526 | DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, |
6527 | SL, VT, Result, ShiftAmt), |
6528 | LHS, ISD::SETNE); |
6529 | return DAG.getMergeValues(Ops: { Result, Overflow }, dl: SL); |
6530 | } |
6531 | } |
6532 | |
6533 | SDValue Result = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: LHS, N2: RHS); |
6534 | SDValue Top = DAG.getNode(Opcode: isSigned ? ISD::MULHS : ISD::MULHU, |
6535 | DL: SL, VT, N1: LHS, N2: RHS); |
6536 | |
6537 | SDValue Sign = isSigned |
6538 | ? DAG.getNode(ISD::SRA, SL, VT, Result, |
6539 | DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32)) |
6540 | : DAG.getConstant(0, SL, VT); |
6541 | SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE); |
6542 | |
6543 | return DAG.getMergeValues(Ops: { Result, Overflow }, dl: SL); |
6544 | } |
6545 | |
6546 | SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const { |
6547 | if (Op->isDivergent()) { |
6548 | // Select to V_MAD_[IU]64_[IU]32. |
6549 | return Op; |
6550 | } |
6551 | if (Subtarget->hasSMulHi()) { |
6552 | // Expand to S_MUL_I32 + S_MUL_HI_[IU]32. |
6553 | return SDValue(); |
6554 | } |
6555 | // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to |
6556 | // calculate the high part, so we might as well do the whole thing with |
6557 | // V_MAD_[IU]64_[IU]32. |
6558 | return Op; |
6559 | } |
6560 | |
6561 | SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { |
6562 | if (!Subtarget->isTrapHandlerEnabled() || |
6563 | Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) |
6564 | return lowerTrapEndpgm(Op, DAG); |
6565 | |
6566 | return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) : |
6567 | lowerTrapHsaQueuePtr(Op, DAG); |
6568 | } |
6569 | |
6570 | SDValue SITargetLowering::lowerTrapEndpgm( |
6571 | SDValue Op, SelectionDAG &DAG) const { |
6572 | SDLoc SL(Op); |
6573 | SDValue Chain = Op.getOperand(i: 0); |
6574 | return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain); |
6575 | } |
6576 | |
6577 | SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, |
6578 | const SDLoc &DL, Align Alignment, ImplicitParameter Param) const { |
6579 | MachineFunction &MF = DAG.getMachineFunction(); |
6580 | uint64_t Offset = getImplicitParameterOffset(MF, Param); |
6581 | SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain: DAG.getEntryNode(), Offset); |
6582 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
6583 | return DAG.getLoad(VT, dl: DL, Chain: DAG.getEntryNode(), Ptr, PtrInfo, Alignment, |
6584 | MMOFlags: MachineMemOperand::MODereferenceable | |
6585 | MachineMemOperand::MOInvariant); |
6586 | } |
6587 | |
// Lower a trap for the HSA ABI on targets where the trap handler reads the
// queue pointer from SGPR0_SGPR1: fetch the queue pointer, copy it into
// SGPR0_SGPR1, and emit the glued TRAP node.
SDValue SITargetLowering::lowerTrapHsaQueuePtr(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(i: 0);

  SDValue QueuePtr;
  // For code object version 5, QueuePtr is passed through implicit kernarg.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
    QueuePtr =
        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
  } else {
    // Pre-COV5: the queue pointer arrives in a user SGPR pair.
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    Register UserSGPR = Info->getQueuePtrUserSGPR();

    if (UserSGPR == AMDGPU::NoRegister) {
      // We probably are in a function incorrectly marked with
      // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
      // trap, so just use a null pointer.
      QueuePtr = DAG.getConstant(0, SL, MVT::i64);
    } else {
      QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
                                      MVT::i64);
    }
  }

  // Copy the queue pointer into SGPR0_SGPR1 and glue that copy to the TRAP
  // node via the register operand plus the copy's chain/glue value.
  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
  SDValue ToReg = DAG.getCopyToReg(Chain, dl: SL, Reg: SGPR01,
                                   N: QueuePtr, Glue: SDValue());

  uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
  SDValue Ops[] = {
    ToReg,
    DAG.getTargetConstant(TrapID, SL, MVT::i16),
    SGPR01,
    ToReg.getValue(1)
  };
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
6628 | |
6629 | SDValue SITargetLowering::lowerTrapHsa( |
6630 | SDValue Op, SelectionDAG &DAG) const { |
6631 | SDLoc SL(Op); |
6632 | SDValue Chain = Op.getOperand(i: 0); |
6633 | |
6634 | // We need to simulate the 's_trap 2' instruction on targets that run in |
6635 | // PRIV=1 (where it is treated as a nop). |
6636 | if (Subtarget->hasPrivEnabledTrap2NopBug()) |
6637 | return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain); |
6638 | |
6639 | uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); |
6640 | SDValue Ops[] = { |
6641 | Chain, |
6642 | DAG.getTargetConstant(TrapID, SL, MVT::i16) |
6643 | }; |
6644 | return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); |
6645 | } |
6646 | |
6647 | SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { |
6648 | SDLoc SL(Op); |
6649 | SDValue Chain = Op.getOperand(i: 0); |
6650 | MachineFunction &MF = DAG.getMachineFunction(); |
6651 | |
6652 | if (!Subtarget->isTrapHandlerEnabled() || |
6653 | Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { |
6654 | DiagnosticInfoUnsupported NoTrap(MF.getFunction(), |
6655 | "debugtrap handler not supported" , |
6656 | Op.getDebugLoc(), |
6657 | DS_Warning); |
6658 | LLVMContext &Ctx = MF.getFunction().getContext(); |
6659 | Ctx.diagnose(DI: NoTrap); |
6660 | return Chain; |
6661 | } |
6662 | |
6663 | uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap); |
6664 | SDValue Ops[] = { |
6665 | Chain, |
6666 | DAG.getTargetConstant(TrapID, SL, MVT::i16) |
6667 | }; |
6668 | return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); |
6669 | } |
6670 | |
/// Materialize the 32-bit "aperture" value (the high half of a flat address)
/// for the given segment address space (LOCAL or PRIVATE). Used when widening
/// a 32-bit segment pointer into a 64-bit flat pointer.
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {
  // Fast path: subtargets with aperture registers can read the base directly.
  if (Subtarget->hasApertureRegs()) {
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    // Note: this feature (register) is broken. When used as a 32-bit operand,
    // it returns a wrong value (all zeroes?). The real value is in the upper 32
    // bits.
    //
    // To work around the issue, directly emit a 64 bit mov from this register
    // then extract the high bits. Note that this shouldn't even result in a
    // shift being emitted and simply become a pair of registers (e.g.):
    //    s_mov_b64 s[6:7], src_shared_base
    //    v_mov_b32_e32 v1, s7
    //
    // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
    // coalescing would kick in and it would think it's okay to use the "HI"
    // subregister directly (instead of extracting the HI 32 bits) which is an
    // artificial (unusable) register.
    //  Register TableGen definitions would need an overhaul to get rid of the
    //  artificial "HI" aperture registers and prevent this kind of issue from
    //  happening.
    SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
                                     DAG.getRegister(ApertureRegNo, MVT::i64));
    return DAG.getNode(
        ISD::TRUNCATE, DL, MVT::i32,
        DAG.getNode(ISD::SRL, DL, MVT::i64,
                    {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
  }

  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
    ImplicitParameter Param =
        (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  }

  // Pre-COV5 fallback: load the aperture out of the queue descriptor, reached
  // through the queue-pointer user SGPR.
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register UserSGPR = Info->getQueuePtrUserSGPR();
  if (UserSGPR == AMDGPU::NoRegister) {
    // We probably are in a function incorrectly marked with
    // amdgpu-no-queue-ptr. This is undefined.
    return DAG.getUNDEF(MVT::i32);
  }

  SDValue QueuePtr = CreateLiveInRegister(
    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr =
      DAG.getObjectPtrOffset(SL: DL, Ptr: QueuePtr, Offset: TypeSize::getFixed(ExactSize: StructOffset));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
                     commonAlignment(Align(64), StructOffset),
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
6739 | |
6740 | /// Return true if the value is a known valid address, such that a null check is |
6741 | /// not necessary. |
6742 | static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG, |
6743 | const AMDGPUTargetMachine &TM, unsigned AddrSpace) { |
6744 | if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) || |
6745 | isa<BasicBlockSDNode>(Val)) |
6746 | return true; |
6747 | |
6748 | if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val)) |
6749 | return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace); |
6750 | |
6751 | // TODO: Search through arithmetic, handle arguments and loads |
6752 | // marked nonnull. |
6753 | return false; |
6754 | } |
6755 | |
/// Custom-lower addrspacecast (and the llvm.amdgcn.addrspacecast.nonnull
/// intrinsic form). flat -> local/private truncates to 32 bits; local/private
/// -> flat widens with the segment aperture; both wrap the result in a
/// null-pointer select unless the source is known non-null.
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);

  const AMDGPUTargetMachine &TM =
    static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

  unsigned DestAS, SrcAS;
  SDValue Src;
  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Val&: Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(Num: 0);
    DestAS = ASC->getDestAddressSpace();
  } else {
    // The intrinsic form carries the address spaces as constant operands and
    // additionally promises the pointer is never null, so the null-select
    // below can be skipped.
    assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(Num: 1);
    SrcAS = Op->getConstantOperandVal(Num: 2);
    DestAS = Op->getConstantOperandVal(Num: 3);
    IsNonNull = true;
  }

  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

  // flat -> local/private
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
      // The segment pointer is the low 32 bits of the flat pointer.
      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

      // NOTE(review): the non-null query is handed Op (the cast node itself)
      // rather than Src — confirm this is intentional.
      if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
        return Ptr;

      // A flat null must map onto the destination space's null value.
      unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);

      return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
                         SegmentNullPtr);
    }
  }

  // local/private -> flat
  if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
    if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {

      // Widen by pairing the 32-bit segment pointer with the aperture in the
      // high half: flat = (aperture << 32) | src.
      SDValue Aperture = getSegmentAperture(AS: SrcAS, DL: SL, DAG);
      SDValue CvtPtr =
          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
      CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);

      // NOTE(review): as above, this queries Op rather than Src.
      if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
        return CvtPtr;

      // A segment null must map onto the flat null (0).
      unsigned NullVal = TM.getNullPointerValue(AddrSpace: SrcAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);

      SDValue NonNull
        = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
                         FlatNullPtr);
    }
  }

  // Rebuild a full 64-bit constant address by pairing the 32-bit pointer with
  // the function's recorded high bits.
  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Op.getValueType() == MVT::i64) {
    const SIMachineFunctionInfo *Info =
        DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
    SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
    SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Src.getValueType() == MVT::i64)
    return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

  // global <-> flat are no-ops and never emitted.

  // Anything else is unsupported: diagnose and produce undef.
  const MachineFunction &MF = DAG.getMachineFunction();
  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
      MF.getFunction(), "invalid addrspacecast" , SL.getDebugLoc());
  DAG.getContext()->diagnose(DI: InvalidAddrSpaceCast);

  return DAG.getUNDEF(VT: Op->getValueType(ResNo: 0));
}
6846 | |
// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
// the small vector and inserting them into the big vector. That is better than
// the default expansion of doing it via a stack slot. Even though the use of
// the stack slot would be optimized away afterwards, the stack slot itself
// remains.
SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(i: 0);   // Destination (big) vector.
  SDValue Ins = Op.getOperand(i: 1);   // Subvector to insert.
  SDValue Idx = Op.getOperand(i: 2);   // Starting element index (constant).
  EVT VecVT = Vec.getValueType();
  EVT InsVT = Ins.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned InsNumElts = InsVT.getVectorNumElements();
  unsigned IdxVal = Idx->getAsZExtVal();
  SDLoc SL(Op);

  // For 16-bit elements starting at an even offset, move whole 32-bit
  // registers instead of individual half-words.
  if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
    // Insert 32-bit registers at a time.
    assert(InsNumElts % 2 == 0 && "expect legal vector types" );

    unsigned VecNumElts = VecVT.getVectorNumElements();
    EVT NewVecVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
    // A two-element insert source collapses to a single i32 scalar.
    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                   : EVT::getVectorVT(*DAG.getContext(),
                                                      MVT::i32, InsNumElts / 2);

    Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVecVT, Operand: Vec);
    Ins = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewInsVT, Operand: Ins);

    // Insert each packed 32-bit piece; indices are halved to match the
    // bitcast i32 vector.
    for (unsigned I = 0; I != InsNumElts / 2; ++I) {
      SDValue Elt;
      if (InsNumElts == 2) {
        Elt = Ins;
      } else {
        Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
                          DAG.getConstant(I, SL, MVT::i32));
      }
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
                        DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
    }

    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Vec);
  }

  // Generic path: move the inserted elements across one at a time.
  for (unsigned I = 0; I != InsNumElts; ++I) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
                              DAG.getConstant(I, SL, MVT::i32));
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
                      DAG.getConstant(IdxVal + I, SL, MVT::i32));
  }
  return Vec;
}
6901 | |
/// Custom-lower INSERT_VECTOR_ELT for small (<= 64-bit) vectors, primarily so
/// dynamic indexing does not expand into a stack access.
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(i: 0);
  SDValue InsVal = Op.getOperand(i: 1);
  SDValue Idx = Op.getOperand(i: 2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();
  SDLoc SL(Op);

  // Specially handle the case of v4i16 with static indexing.
  unsigned NumElts = VecVT.getVectorNumElements();
  auto KIdx = dyn_cast<ConstantSDNode>(Val&: Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {
    // Split into two 32-bit halves, insert into the affected v2i16 half, then
    // reassemble the pair.
    SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);

    SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(0, SL, MVT::i32));
    SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(1, SL, MVT::i32));

    SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
    SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);

    // Indices 0-1 land in the low half, 2-3 in the high half (rebased to 0-1).
    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
    SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
      InsertLo ? LoVec : HiVec,
      DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
      DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));

    InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);

    SDValue Concat = InsertLo ?
      DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
      DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });

    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Concat);
  }

  // Static indexing does not lower to stack access, and hence there is no need
  // for special custom lowering to avoid stack access.
  if (isa<ConstantSDNode>(Val: Idx))
    return SDValue();

  // Avoid stack access for dynamic indexing by custom lowering to
  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec

  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits" );

  MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);

  // Convert vector index to bit-index and get the required bit mask.
  assert(isPowerOf2_32(EltSize));
  const auto EltMask = maskTrailingOnes<uint64_t>(N: EltSize);
  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
  // BFM: element-wide mask of ones shifted to the target bit position.
  SDValue BFM = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: IntVT,
                            N1: DAG.getConstant(Val: EltMask, DL: SL, VT: IntVT), N2: ScaledIdx);

  // 1. Create a congruent vector with the target value in each element.
  SDValue ExtVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT,
                               Operand: DAG.getSplatBuildVector(VT: VecVT, DL: SL, Op: InsVal));

  // 2. Mask off all other indices except the required index within (1).
  SDValue LHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: BFM, N2: ExtVal);

  // 3. Mask off the required index within the target vector.
  SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
  SDValue RHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT,
                            N1: DAG.getNOT(DL: SL, Val: BFM, VT: IntVT), N2: BCVec);

  // 4. Get (2) and (3) ORed into the target vector.
  SDValue BFI = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: IntVT, N1: LHS, N2: RHS);

  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: BFI);
}
6980 | |
6981 | SDValue SITargetLowering::(SDValue Op, |
6982 | SelectionDAG &DAG) const { |
6983 | SDLoc SL(Op); |
6984 | |
6985 | EVT ResultVT = Op.getValueType(); |
6986 | SDValue Vec = Op.getOperand(i: 0); |
6987 | SDValue Idx = Op.getOperand(i: 1); |
6988 | EVT VecVT = Vec.getValueType(); |
6989 | unsigned VecSize = VecVT.getSizeInBits(); |
6990 | EVT EltVT = VecVT.getVectorElementType(); |
6991 | |
6992 | DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); |
6993 | |
6994 | // Make sure we do any optimizations that will make it easier to fold |
6995 | // source modifiers before obscuring it with bit operations. |
6996 | |
6997 | // XXX - Why doesn't this get called when vector_shuffle is expanded? |
6998 | if (SDValue Combined = performExtractVectorEltCombine(N: Op.getNode(), DCI)) |
6999 | return Combined; |
7000 | |
7001 | if (VecSize == 128 || VecSize == 256 || VecSize == 512) { |
7002 | SDValue Lo, Hi; |
7003 | EVT LoVT, HiVT; |
7004 | std::tie(args&: LoVT, args&: HiVT) = DAG.GetSplitDestVTs(VT: VecVT); |
7005 | |
7006 | if (VecSize == 128) { |
7007 | SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec); |
7008 | Lo = DAG.getBitcast(LoVT, |
7009 | DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, |
7010 | DAG.getConstant(0, SL, MVT::i32))); |
7011 | Hi = DAG.getBitcast(HiVT, |
7012 | DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, |
7013 | DAG.getConstant(1, SL, MVT::i32))); |
7014 | } else if (VecSize == 256) { |
7015 | SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec); |
7016 | SDValue Parts[4]; |
7017 | for (unsigned P = 0; P < 4; ++P) { |
7018 | Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, |
7019 | DAG.getConstant(P, SL, MVT::i32)); |
7020 | } |
7021 | |
7022 | Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, |
7023 | Parts[0], Parts[1])); |
7024 | Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, |
7025 | Parts[2], Parts[3])); |
7026 | } else { |
7027 | assert(VecSize == 512); |
7028 | |
7029 | SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec); |
7030 | SDValue Parts[8]; |
7031 | for (unsigned P = 0; P < 8; ++P) { |
7032 | Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, |
7033 | DAG.getConstant(P, SL, MVT::i32)); |
7034 | } |
7035 | |
7036 | Lo = DAG.getBitcast(LoVT, |
7037 | DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, |
7038 | Parts[0], Parts[1], Parts[2], Parts[3])); |
7039 | Hi = DAG.getBitcast(HiVT, |
7040 | DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, |
7041 | Parts[4], Parts[5],Parts[6], Parts[7])); |
7042 | } |
7043 | |
7044 | EVT IdxVT = Idx.getValueType(); |
7045 | unsigned NElem = VecVT.getVectorNumElements(); |
7046 | assert(isPowerOf2_32(NElem)); |
7047 | SDValue IdxMask = DAG.getConstant(Val: NElem / 2 - 1, DL: SL, VT: IdxVT); |
7048 | SDValue NewIdx = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IdxVT, N1: Idx, N2: IdxMask); |
7049 | SDValue Half = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IdxMask, True: Hi, False: Lo, Cond: ISD::SETUGT); |
7050 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Half, N2: NewIdx); |
7051 | } |
7052 | |
7053 | assert(VecSize <= 64); |
7054 | |
7055 | MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize); |
7056 | |
7057 | // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly. |
7058 | SDValue VecBC = peekThroughBitcasts(V: Vec); |
7059 | if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) { |
7060 | SDValue Src = VecBC.getOperand(i: 0); |
7061 | Src = DAG.getBitcast(VT: Src.getValueType().changeTypeToInteger(), V: Src); |
7062 | Vec = DAG.getAnyExtOrTrunc(Op: Src, DL: SL, VT: IntVT); |
7063 | } |
7064 | |
7065 | unsigned EltSize = EltVT.getSizeInBits(); |
7066 | assert(isPowerOf2_32(EltSize)); |
7067 | |
7068 | SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); |
7069 | |
7070 | // Convert vector index to bit-index (* EltSize) |
7071 | SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); |
7072 | |
7073 | SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec); |
7074 | SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: IntVT, N1: BC, N2: ScaledIdx); |
7075 | |
7076 | if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) { |
7077 | SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt); |
7078 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ResultVT, Operand: Result); |
7079 | } |
7080 | |
7081 | return DAG.getAnyExtOrTrunc(Op: Elt, DL: SL, VT: ResultVT); |
7082 | } |
7083 | |
7084 | static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) { |
7085 | assert(Elt % 2 == 0); |
7086 | return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0); |
7087 | } |
7088 | |
/// Custom-lower VECTOR_SHUFFLE of 16-bit element vectors by assembling the
/// result out of packed two-element pieces, using whole subvector extracts
/// whenever a mask pair reads consecutive, aligned source lanes.
SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT ResultVT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val&: Op);

  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
  EVT EltVT = PackVT.getVectorElementType();
  int SrcNumElts = Op.getOperand(i: 0).getValueType().getVectorNumElements();

  // vector_shuffle <0,1,6,7> lhs, rhs
  // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
  //
  // vector_shuffle <6,7,2,3> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
  //
  // vector_shuffle <6,7,0,1> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)

  // Avoid scalarizing when both halves are reading from consecutive elements.
  SmallVector<SDValue, 4> Pieces;
  for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
    if (elementPairIsContiguous(Mask: SVN->getMask(), Elt: I)) {
      // Mask indices >= SrcNumElts refer to the second shuffle operand.
      const int Idx = SVN->getMaskElt(Idx: I);
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
                                   PackVT, SVN->getOperand(VecIdx),
                                   DAG.getConstant(EltIdx, SL, MVT::i32));
      Pieces.push_back(Elt: SubVec);
    } else {
      // Scalarize this pair: extract both lanes individually and repack.
      const int Idx0 = SVN->getMaskElt(Idx: I);
      const int Idx1 = SVN->getMaskElt(Idx: I + 1);
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

      SDValue Vec0 = SVN->getOperand(Num: VecIdx0);
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));

      SDValue Vec1 = SVN->getOperand(Num: VecIdx1);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
      Pieces.push_back(Elt: DAG.getBuildVector(VT: PackVT, DL: SL, Ops: { Elt0, Elt1 }));
    }
  }

  return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT: ResultVT, Ops: Pieces);
}
7140 | |
7141 | SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op, |
7142 | SelectionDAG &DAG) const { |
7143 | SDValue SVal = Op.getOperand(i: 0); |
7144 | EVT ResultVT = Op.getValueType(); |
7145 | EVT SValVT = SVal.getValueType(); |
7146 | SDValue UndefVal = DAG.getUNDEF(VT: SValVT); |
7147 | SDLoc SL(Op); |
7148 | |
7149 | SmallVector<SDValue, 8> VElts; |
7150 | VElts.push_back(Elt: SVal); |
7151 | for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I) |
7152 | VElts.push_back(Elt: UndefVal); |
7153 | |
7154 | return DAG.getBuildVector(VT: ResultVT, DL: SL, Ops: VElts); |
7155 | } |
7156 | |
/// Custom-lower BUILD_VECTOR for wide 16-bit element vectors by building
/// packed pieces and bitcasting them together; a single packed pair (the only
/// remaining case, reached on subtargets without VOP3P) is assembled with
/// zero-extend/shift/or.
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();

  // 4- and 8-element vectors: split into two packed halves.
  if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
      VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
    EVT HalfVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(),
                                  NumElements: VT.getVectorNumElements() / 2);
    MVT HalfIntVT = MVT::getIntegerVT(BitWidth: HalfVT.getSizeInBits());

    // Turn into pair of packed build_vectors.
    // TODO: Special case for constants that can be materialized with s_mov_b64.
    SmallVector<SDValue, 4> LoOps, HiOps;
    for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
      LoOps.push_back(Elt: Op.getOperand(i: I));
      HiOps.push_back(Elt: Op.getOperand(i: I + E));
    }
    SDValue Lo = DAG.getBuildVector(VT: HalfVT, DL: SL, Ops: LoOps);
    SDValue Hi = DAG.getBuildVector(VT: HalfVT, DL: SL, Ops: HiOps);

    SDValue CastLo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HalfIntVT, Operand: Lo);
    SDValue CastHi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HalfIntVT, Operand: Hi);

    SDValue Blend = DAG.getBuildVector(VT: MVT::getVectorVT(VT: HalfIntVT, NumElements: 2), DL: SL,
                                       Ops: { CastLo, CastHi });
    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
  }

  // 16-element vectors: four packed quarters.
  if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
    EVT QuarterVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(),
                                     NumElements: VT.getVectorNumElements() / 4);
    MVT QuarterIntVT = MVT::getIntegerVT(BitWidth: QuarterVT.getSizeInBits());

    // Distribute the operands across the four quarters in order.
    SmallVector<SDValue, 4> Parts[4];
    for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
      for (unsigned P = 0; P < 4; ++P)
        Parts[P].push_back(Elt: Op.getOperand(i: I + P * E));
    }
    SDValue Casts[4];
    for (unsigned P = 0; P < 4; ++P) {
      SDValue Vec = DAG.getBuildVector(VT: QuarterVT, DL: SL, Ops: Parts[P]);
      Casts[P] = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: QuarterIntVT, Operand: Vec);
    }

    SDValue Blend =
        DAG.getBuildVector(VT: MVT::getVectorVT(VT: QuarterIntVT, NumElements: 4), DL: SL, Ops: Casts);
    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
  }

  // 32-element vectors: eight packed pieces, same scheme as above.
  if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
    EVT QuarterVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(),
                                     NumElements: VT.getVectorNumElements() / 8);
    MVT QuarterIntVT = MVT::getIntegerVT(BitWidth: QuarterVT.getSizeInBits());

    SmallVector<SDValue, 8> Parts[8];
    for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
      for (unsigned P = 0; P < 8; ++P)
        Parts[P].push_back(Elt: Op.getOperand(i: I + P * E));
    }
    SDValue Casts[8];
    for (unsigned P = 0; P < 8; ++P) {
      SDValue Vec = DAG.getBuildVector(VT: QuarterVT, DL: SL, Ops: Parts[P]);
      Casts[P] = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: QuarterIntVT, Operand: Vec);
    }

    SDValue Blend =
        DAG.getBuildVector(VT: MVT::getVectorVT(VT: QuarterIntVT, NumElements: 8), DL: SL, Ops: Casts);
    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
  }

  assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
  assert(!Subtarget->hasVOP3PInsts() && "this should be legal" );

  SDValue Lo = Op.getOperand(i: 0);
  SDValue Hi = Op.getOperand(i: 1);

  // Avoid adding defined bits with the zero_extend.
  if (Hi.isUndef()) {
    Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
    SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ExtLo);
  }

  // Place Hi in the upper 16 bits.
  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);

  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
                              DAG.getConstant(16, SL, MVT::i32));
  if (Lo.isUndef())
    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ShlHi);

  // OR the zero-extended Lo into the low 16 bits.
  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);

  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Or);
}
7255 | |
7256 | bool |
7257 | SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { |
7258 | // OSes that use ELF REL relocations (instead of RELA) can only store a |
7259 | // 32-bit addend in the instruction, so it is not safe to allow offset folding |
7260 | // which can create arbitrary 64-bit addends. (This is only a problem for |
7261 | // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by |
7262 | // the high 32 bits of the addend.) |
7263 | // |
7264 | // This should be kept in sync with how HasRelocationAddend is initialized in |
7265 | // the constructor of ELFAMDGPUAsmBackend. |
7266 | if (!Subtarget->isAmdHsaOS()) |
7267 | return false; |
7268 | |
7269 | // We can fold offsets for anything that doesn't require a GOT relocation. |
7270 | return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || |
7271 | GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || |
7272 | GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && |
7273 | !shouldEmitGOTReloc(GV: GA->getGlobal()); |
7274 | } |
7275 | |
7276 | static SDValue |
7277 | buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, |
7278 | const SDLoc &DL, int64_t Offset, EVT PtrVT, |
7279 | unsigned GAFlags = SIInstrInfo::MO_NONE) { |
7280 | assert(isInt<32>(Offset + 4) && "32-bit offset is expected!" ); |
7281 | // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is |
7282 | // lowered to the following code sequence: |
7283 | // |
7284 | // For constant address space: |
7285 | // s_getpc_b64 s[0:1] |
7286 | // s_add_u32 s0, s0, $symbol |
7287 | // s_addc_u32 s1, s1, 0 |
7288 | // |
7289 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then |
7290 | // a fixup or relocation is emitted to replace $symbol with a literal |
7291 | // constant, which is a pc-relative offset from the encoding of the $symbol |
7292 | // operand to the global variable. |
7293 | // |
7294 | // For global address space: |
7295 | // s_getpc_b64 s[0:1] |
7296 | // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo |
7297 | // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi |
7298 | // |
7299 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then |
7300 | // fixups or relocations are emitted to replace $symbol@*@lo and |
7301 | // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, |
7302 | // which is a 64-bit pc-relative offset from the encoding of the $symbol |
7303 | // operand to the global variable. |
7304 | SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags); |
7305 | SDValue PtrHi; |
7306 | if (GAFlags == SIInstrInfo::MO_NONE) |
7307 | PtrHi = DAG.getTargetConstant(0, DL, MVT::i32); |
7308 | else |
7309 | PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1); |
7310 | return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET, DL, VT: PtrVT, N1: PtrLo, N2: PtrHi); |
7311 | } |
7312 | |
/// Custom lowering for global addresses: LDS globals become a 32-bit absolute
/// offset, PAL/Mesa use absolute 32-bit relocations, and everything else uses
/// pc-relative addressing, possibly indirected through the GOT.
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Val&: Op);
  SDLoc DL(GSD);
  EVT PtrVT = Op.getValueType();

  const GlobalValue *GV = GSD->getGlobal();
  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
       shouldUseLDSConstAddress(GV)) ||
      GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
    if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
        GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or similar
      // zero-sized type in other languages to declare the dynamic shared
      // memory which size is not known at the compile time. They will be
      // allocated by the runtime and placed directly after the static
      // allocated ones. They all share the same offset.
      if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        assert(PtrVT == MVT::i32 && "32-bit pointer is expected." );
        // Adjust alignment for that dynamic shared memory array.
        Function &F = DAG.getMachineFunction().getFunction();
        MFI->setDynLDSAlign(F, GV: *cast<GlobalVariable>(Val: GV));
        MFI->setUsesDynamicLDS(true);
        // The dynamic array starts right after the static allocations, i.e.
        // at the group static size.
        return SDValue(
            DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
      }
    }
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
  }

  // LDS globals: absolute 32-bit offset from the LDS base.
  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
                                            SIInstrInfo::MO_ABS32_LO);
    return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
  }

  // PAL/Mesa: materialize the 64-bit address from a pair of absolute 32-bit
  // relocations.
  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
    SDValue AddrLo = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
    AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};

    SDValue AddrHi = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
    AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};

    return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
  }

  if (shouldEmitFixup(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT);

  if (shouldEmitPCReloc(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT,
                                   GAFlags: SIInstrInfo::MO_REL32);

  // Otherwise go through the GOT: compute the GOT slot's address
  // pc-relatively, then load the actual pointer from it.
  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, Offset: 0, PtrVT,
                                            GAFlags: SIInstrInfo::MO_GOTPCREL32);

  Type *Ty = PtrVT.getTypeForEVT(Context&: *DAG.getContext());
  PointerType *PtrTy = PointerType::get(ElementType: Ty, AddressSpace: AMDGPUAS::CONSTANT_ADDRESS);
  const DataLayout &DataLayout = DAG.getDataLayout();
  Align Alignment = DataLayout.getABITypeAlign(Ty: PtrTy);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction());

  return DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: GOTAddr, PtrInfo, Alignment,
                     MMOFlags: MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
7385 | |
7386 | SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, |
7387 | const SDLoc &DL, SDValue V) const { |
7388 | // We can't use S_MOV_B32 directly, because there is no way to specify m0 as |
7389 | // the destination register. |
7390 | // |
7391 | // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, |
7392 | // so we will end up with redundant moves to m0. |
7393 | // |
7394 | // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result. |
7395 | |
7396 | // A Null SDValue creates a glue result. |
7397 | SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue, |
7398 | V, Chain); |
7399 | return SDValue(M0, 0); |
7400 | } |
7401 | |
7402 | SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, |
7403 | SDValue Op, |
7404 | MVT VT, |
7405 | unsigned Offset) const { |
7406 | SDLoc SL(Op); |
7407 | SDValue Param = lowerKernargMemParameter( |
7408 | DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false); |
7409 | // The local size values will have the hi 16-bits as zero. |
7410 | return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, |
7411 | DAG.getValueType(VT)); |
7412 | } |
7413 | |
7414 | static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, |
7415 | EVT VT) { |
7416 | DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(), |
7417 | "non-hsa intrinsic with hsa target" , |
7418 | DL.getDebugLoc()); |
7419 | DAG.getContext()->diagnose(DI: BadIntrin); |
7420 | return DAG.getUNDEF(VT); |
7421 | } |
7422 | |
7423 | static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, |
7424 | EVT VT) { |
7425 | DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(), |
7426 | "intrinsic not supported on subtarget" , |
7427 | DL.getDebugLoc()); |
7428 | DAG.getContext()->diagnose(DI: BadIntrin); |
7429 | return DAG.getUNDEF(VT); |
7430 | } |
7431 | |
7432 | static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, |
7433 | ArrayRef<SDValue> Elts) { |
7434 | assert(!Elts.empty()); |
7435 | MVT Type; |
7436 | unsigned NumElts = Elts.size(); |
7437 | |
7438 | if (NumElts <= 12) { |
7439 | Type = MVT::getVectorVT(MVT::f32, NumElts); |
7440 | } else { |
7441 | assert(Elts.size() <= 16); |
7442 | Type = MVT::v16f32; |
7443 | NumElts = 16; |
7444 | } |
7445 | |
7446 | SmallVector<SDValue, 16> VecElts(NumElts); |
7447 | for (unsigned i = 0; i < Elts.size(); ++i) { |
7448 | SDValue Elt = Elts[i]; |
7449 | if (Elt.getValueType() != MVT::f32) |
7450 | Elt = DAG.getBitcast(MVT::f32, Elt); |
7451 | VecElts[i] = Elt; |
7452 | } |
7453 | for (unsigned i = Elts.size(); i < NumElts; ++i) |
7454 | VecElts[i] = DAG.getUNDEF(MVT::f32); |
7455 | |
7456 | if (NumElts == 1) |
7457 | return VecElts[0]; |
7458 | return DAG.getBuildVector(VT: Type, DL, Ops: VecElts); |
7459 | } |
7460 | |
7461 | static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, |
7462 | SDValue Src, int ) { |
7463 | EVT SrcVT = Src.getValueType(); |
7464 | |
7465 | SmallVector<SDValue, 8> Elts; |
7466 | |
7467 | if (SrcVT.isVector()) |
7468 | DAG.ExtractVectorElements(Op: Src, Args&: Elts); |
7469 | else |
7470 | Elts.push_back(Elt: Src); |
7471 | |
7472 | SDValue Undef = DAG.getUNDEF(VT: SrcVT.getScalarType()); |
7473 | while (ExtraElts--) |
7474 | Elts.push_back(Elt: Undef); |
7475 | |
7476 | return DAG.getBuildVector(VT: CastVT, DL, Ops: Elts); |
7477 | } |
7478 | |
// Re-construct the required return value for an image load intrinsic.
// This is more complicated due to the optional use of TexFailCtrl, which
// means the required return type is an aggregate: {data, texfail, chain}
// instead of {data, chain}.
//
// \p Result       the machine node whose raw dword result is repackaged.
// \p ResultTypes  the original (pre-adjustment) result types of the intrinsic.
// \p DMaskPop     number of dwords actually populated by the dmask.
// \p Unpacked     whether the target uses unpacked D16 (one 16-bit value per
//                 dword) rather than packed (two per dword).
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
                                 ArrayRef<EVT> ResultTypes, bool IsTexFail,
                                 bool Unpacked, bool IsD16, int DMaskPop,
                                 int NumVDataDwords, bool IsAtomicPacked16Bit,
                                 const SDLoc &DL) {
  // Determine the required return type. This is the same regardless of IsTexFail flag
  EVT ReqRetVT = ResultTypes[0];
  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
  // Packed D16 and packed 16-bit atomics carry two elements per dword.
  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2
                          : ReqRetNumElts;

  // Number of dwords the dmask actually populated, in the same packing.
  int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
    DMaskPop : (DMaskPop + 1) / 2;

  MVT DataDwordVT = NumDataDwords == 1 ?
    MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);

  MVT MaskPopVT = MaskPopDwords == 1 ?
    MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);

  SDValue Data(Result, 0);
  SDValue TexFail;

  // Trim the raw result down to just the dmask-populated dwords.
  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
    SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
    if (MaskPopVT.isVector()) {
      Data = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MaskPopVT,
                         N1: SDValue(Result, 0), N2: ZeroIdx);
    } else {
      Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MaskPopVT,
                         N1: SDValue(Result, 0), N2: ZeroIdx);
    }
  }

  // Re-widen with undef if the requested return needs more dwords than the
  // dmask populated.
  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
    Data = padEltsToUndef(DAG, DL, CastVT: DataDwordVT, Src: Data,
                          ExtraElts: NumDataDwords - MaskPopDwords);

  // Convert the dword payload back into the requested 16-bit element form.
  if (IsD16)
    Data = adjustLoadValueTypeImpl(Result: Data, LoadVT: ReqRetVT, DL, DAG, Unpacked);

  EVT LegalReqRetVT = ReqRetVT;
  if (!ReqRetVT.isVector()) {
    // Scalar return: go through an integer of the data's width, then
    // truncate to the requested scalar's integer type before bitcasting.
    if (!Data.getValueType().isInteger())
      Data = DAG.getNode(Opcode: ISD::BITCAST, DL,
                         VT: Data.getValueType().changeTypeToInteger(), Operand: Data);
    Data = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ReqRetVT.changeTypeToInteger(), Operand: Data);
  } else {
    // We need to widen the return vector to a legal type
    if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
        ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
      LegalReqRetVT =
          EVT::getVectorVT(Context&: *DAG.getContext(), VT: ReqRetVT.getVectorElementType(),
                           NumElements: ReqRetVT.getVectorNumElements() + 1);
    }
  }
  Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LegalReqRetVT, Operand: Data);

  // The texfail status word lives in the dword right after the data.
  if (IsTexFail) {
    TexFail =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
                    DAG.getConstant(MaskPopDwords, DL, MVT::i32));

    return DAG.getMergeValues(Ops: {Data, TexFail, SDValue(Result, 1)}, dl: DL);
  }

  if (Result->getNumValues() == 1)
    return Data;

  return DAG.getMergeValues(Ops: {Data, SDValue(Result, 1)}, dl: DL);
}
7554 | |
7555 | static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, |
7556 | SDValue *LWE, bool &IsTexFail) { |
7557 | auto TexFailCtrlConst = cast<ConstantSDNode>(Val: TexFailCtrl.getNode()); |
7558 | |
7559 | uint64_t Value = TexFailCtrlConst->getZExtValue(); |
7560 | if (Value) { |
7561 | IsTexFail = true; |
7562 | } |
7563 | |
7564 | SDLoc DL(TexFailCtrlConst); |
7565 | *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32); |
7566 | Value &= ~(uint64_t)0x1; |
7567 | *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32); |
7568 | Value &= ~(uint64_t)0x2; |
7569 | |
7570 | return Value == 0; |
7571 | } |
7572 | |
// Pack 16-bit image operands (addresses or gradients) from \p Op, operand
// indices [DimIdx, EndIdx), into dword-sized values appended to
// \p PackedAddrs. Consecutive operands are normally paired two-per-dword;
// odd gradient components and a trailing odd operand occupy a dword alone
// (any-extended). \p NumGradients is 0 when packing plain addresses.
static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
                                      MVT PackVectorVT,
                                      SmallVectorImpl<SDValue> &PackedAddrs,
                                      unsigned DimIdx, unsigned EndIdx,
                                      unsigned NumGradients) {
  SDLoc DL(Op);
  for (unsigned I = DimIdx; I < EndIdx; I++) {
    SDValue Addr = Op.getOperand(i: I);

    // Gradients are packed with undef for each coordinate.
    // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
    // 1D: undef,dx/dh; undef,dx/dv
    // 2D: dy/dh,dx/dh; dy/dv,dx/dv
    // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
    if (((I + 1) >= EndIdx) ||
        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
                                         I == DimIdx + NumGradients - 1))) {
      // Unpaired operand: extend it to fill a dword on its own.
      if (Addr.getValueType() != MVT::i16)
        Addr = DAG.getBitcast(MVT::i16, Addr);
      Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
    } else {
      // Pair this operand with the next one into a single dword and skip
      // the consumed operand.
      Addr = DAG.getBuildVector(VT: PackVectorVT, DL, Ops: {Addr, Op.getOperand(i: I + 1)});
      I++;
    }
    // MIMG address operands are f32-typed dwords.
    Addr = DAG.getBitcast(MVT::f32, Addr);
    PackedAddrs.push_back(Elt: Addr);
  }
}
7601 | |
// Lower an image (MIMG) intrinsic to a target machine node. This handles
// dmask/dword accounting, D16 packing of store data and load results,
// A16/G16 packing of 16-bit addresses and gradients, NSA vs. contiguous
// vaddr encodings, TFE/LWE (texfail) result-type adjustment, and
// per-generation opcode selection. Returns \p Op unchanged when the
// intrinsic cannot be lowered on this subtarget (leaving it to fail
// selection), so callers can treat that as "not handled".
SDValue SITargetLowering::lowerImage(SDValue Op,
                                     const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                     SelectionDAG &DAG, bool WithChain) const {
  SDLoc DL(Op);
  MachineFunction &MF = DAG.getMachineFunction();
  const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
  unsigned IntrOpcode = Intr->BaseOpcode;
  bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
  bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
  bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);

  // ResultTypes may be rewritten below (e.g. for TFE); keep the originals
  // around for re-constructing the aggregate return value at the end.
  SmallVector<EVT, 3> ResultTypes(Op->values());
  SmallVector<EVT, 3> OrigResultTypes(Op->values());
  bool IsD16 = false;
  bool IsG16 = false;
  bool IsA16 = false;
  SDValue VData;
  int NumVDataDwords;
  bool AdjustRetType = false;
  bool IsAtomicPacked16Bit = false;

  // Offset of intrinsic arguments
  const unsigned ArgOffset = WithChain ? 2 : 1;

  unsigned DMask;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VData = Op.getOperand(i: 2);

    IsAtomicPacked16Bit =
        (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
         Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

    bool Is64Bit = VData.getValueSizeInBits() == 64;
    if (BaseOpcode->AtomicX2) {
      // Cmpswap-style atomics take a second data operand; merge both into a
      // single vdata vector.
      SDValue VData2 = Op.getOperand(i: 3);
      VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
                                 {VData, VData2});
      if (Is64Bit)
        VData = DAG.getBitcast(MVT::v4i32, VData);

      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = Op->getConstantOperandVal(Num: ArgOffset + Intr->DMaskIndex);
    DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask);

    if (BaseOpcode->Store) {
      VData = Op.getOperand(i: 2);

      MVT StoreVT = VData.getSimpleValueType();
      if (StoreVT.getScalarType() == MVT::f16) {
        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction

        IsD16 = true;
        VData = handleD16VData(VData, DAG, ImageStore: true);
      }

      NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
    } else {
      // Work out the num dwords based on the dmask popcount and underlying type
      // and whether packing is supported.
      MVT LoadVT = ResultTypes[0].getSimpleVT();
      if (LoadVT.getScalarType() == MVT::f16) {
        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction

        IsD16 = true;
      }

      // Confirm that the return type is large enough for the dmask specified
      if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
          (!LoadVT.isVector() && DMaskLanes > 1))
          return Op;

      // The sq block of gfx8 and gfx9 do not estimate register use correctly
      // for d16 image_gather4, image_gather4_l, and image_gather4_lz
      // instructions.
      if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
          !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
        NumVDataDwords = (DMaskLanes + 1) / 2;
      else
        NumVDataDwords = DMaskLanes;

      AdjustRetType = true;
    }
  }

  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
  SmallVector<SDValue, 4> VAddrs;

  // Check for 16 bit addresses or derivatives and pack if true.
  MVT VAddrVT =
      Op.getOperand(i: ArgOffset + Intr->GradientStart).getSimpleValueType();
  MVT VAddrScalarVT = VAddrVT.getScalarType();
  MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

  VAddrVT = Op.getOperand(i: ArgOffset + Intr->CoordStart).getSimpleValueType();
  VAddrScalarVT = VAddrVT.getScalarType();
  MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

  // Push back extra arguments.
  for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
      assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument" );
      // Special handling of bias when A16 is on. Bias is of type half but
      // occupies full 32-bit.
      SDValue Bias = DAG.getBuildVector(
          MVT::v2f16, DL,
          {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
      VAddrs.push_back(Elt: Bias);
    } else {
      assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
             "Bias needs to be converted to 16 bit in A16 mode" );
      VAddrs.push_back(Elt: Op.getOperand(i: ArgOffset + I));
    }
  }

  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    LLVM_DEBUG(
        dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
                  "require 16 bit args for both gradients and addresses" );
    return Op;
  }

  if (IsA16) {
    if (!ST->hasA16()) {
      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                           "support 16 bit addresses\n" );
      return Op;
    }
  }

  // We've dealt with incorrect input so we know that if IsA16, IsG16
  // are set then we have to compress/pack operands (either address,
  // gradient or both)
  // In the case where a16 and gradients are tied (no G16 support) then we
  // have already verified that both IsA16 and IsG16 are true
  if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
    // Activate g16
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
        AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
    IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
  }

  // Add gradients (packed or unpacked)
  if (IsG16) {
    // Pack the gradients
    // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
    packImage16bitOpsToDwords(DAG, Op, PackVectorVT: GradPackVectorVT, PackedAddrs&: VAddrs,
                              DimIdx: ArgOffset + Intr->GradientStart,
                              EndIdx: ArgOffset + Intr->CoordStart, NumGradients: Intr->NumGradients);
  } else {
    for (unsigned I = ArgOffset + Intr->GradientStart;
         I < ArgOffset + Intr->CoordStart; I++)
      VAddrs.push_back(Elt: Op.getOperand(i: I));
  }

  // Add addresses (packed or unpacked)
  if (IsA16) {
    packImage16bitOpsToDwords(DAG, Op, PackVectorVT: AddrPackVectorVT, PackedAddrs&: VAddrs,
                              DimIdx: ArgOffset + Intr->CoordStart, EndIdx: VAddrEnd,
                              NumGradients: 0 /* No gradients */);
  } else {
    // Add uncompressed address
    for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
      VAddrs.push_back(Elt: Op.getOperand(i: I));
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator that
  // MIMG addresses should be placed contiguously when it is possible to do so,
  // so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  //
  // Partial NSA is allowed on GFX11+ where the final register is a contiguous
  // set of the remaining addresses.
  const unsigned NSAMaxSize = ST->getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
  const bool UseNSA = ST->hasNSAEncoding() &&
                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
  const bool UsePartialNSA =
      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;

  SDValue VAddr;
  if (UsePartialNSA) {
    // The trailing addresses beyond NSAMaxSize-1 are merged into one
    // contiguous register.
    VAddr = getBuildDwordsVector(DAG, DL,
                                 Elts: ArrayRef(VAddrs).drop_front(N: NSAMaxSize - 1));
  }
  else if (!UseNSA) {
    VAddr = getBuildDwordsVector(DAG, DL, Elts: VAddrs);
  }

  SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
  SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
  SDValue Unorm;
  if (!BaseOpcode->Sampler) {
    Unorm = True;
  } else {
    uint64_t UnormConst =
        Op.getConstantOperandVal(i: ArgOffset + Intr->UnormIndex);

    Unorm = UnormConst ? True : False;
  }

  SDValue TFE;
  SDValue LWE;
  SDValue TexFail = Op.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex);
  bool IsTexFail = false;
  if (!parseTexFail(TexFailCtrl: TexFail, DAG, TFE: &TFE, LWE: &LWE, IsTexFail))
    return Op;

  if (IsTexFail) {
    if (!DMaskLanes) {
      // Expecting to get an error flag since TFC is on - and dmask is 0
      // Force dmask to be at least 1 otherwise the instruction will fail
      DMask = 0x1;
      DMaskLanes = 1;
      NumVDataDwords = 1;
    }
    // Reserve an extra result dword for the texfail status word.
    NumVDataDwords += 1;
    AdjustRetType = true;
  }

  // Has something earlier tagged that the return type needs adjusting
  // This happens if the instruction is a load or has set TexFailCtrl flags
  if (AdjustRetType) {
    // NumVDataDwords reflects the true number of dwords required in the return type
    if (DMaskLanes == 0 && !BaseOpcode->Store) {
      // This is a no-op load. This can be eliminated
      SDValue Undef = DAG.getUNDEF(VT: Op.getValueType());
      if (isa<MemSDNode>(Val: Op))
        return DAG.getMergeValues(Ops: {Undef, Op.getOperand(i: 0)}, dl: DL);
      return Undef;
    }

    EVT NewVT = NumVDataDwords > 1 ?
                  EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
                : MVT::i32;

    ResultTypes[0] = NewVT;
    if (ResultTypes.size() == 3) {
      // Original result was aggregate type used for TexFailCtrl results
      // The actual instruction returns as a vector type which has now been
      // created. Remove the aggregate result.
      ResultTypes.erase(CI: &ResultTypes[1]);
    }
  }

  unsigned CPol = Op.getConstantOperandVal(i: ArgOffset + Intr->CachePolicyIndex);
  if (BaseOpcode->Atomic)
    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
               AMDGPU::CPol::VOLATILE))
    return Op;

  // Build the machine-instruction operand list in encoding order.
  SmallVector<SDValue, 26> Ops;
  if (BaseOpcode->Store || BaseOpcode->Atomic)
    Ops.push_back(Elt: VData); // vdata
  if (UsePartialNSA) {
    append_range(C&: Ops, R: ArrayRef(VAddrs).take_front(N: NSAMaxSize - 1));
    Ops.push_back(Elt: VAddr);
  }
  else if (UseNSA)
    append_range(C&: Ops, R&: VAddrs);
  else
    Ops.push_back(Elt: VAddr);
  Ops.push_back(Elt: Op.getOperand(i: ArgOffset + Intr->RsrcIndex));
  if (BaseOpcode->Sampler)
    Ops.push_back(Elt: Op.getOperand(i: ArgOffset + Intr->SampIndex));
  Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
  if (IsGFX10Plus)
    Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(Elt: Unorm);
  Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
  Ops.push_back(IsA16 && // r128, a16 for gfx9
                ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
  if (IsGFX10Plus)
    Ops.push_back(Elt: IsA16 ? True : False);
  if (!Subtarget->hasGFX90AInsts()) {
    Ops.push_back(Elt: TFE); //tfe
  } else if (TFE->getAsZExtVal()) {
    report_fatal_error(reason: "TFE is not supported on this GPU" );
  }
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(Elt: LWE); // lwe
  if (!IsGFX10Plus)
    Ops.push_back(Elt: DimInfo->DA ? True : False);
  if (BaseOpcode->HasD16)
    Ops.push_back(Elt: IsD16 ? True : False);
  if (isa<MemSDNode>(Val: Op))
    Ops.push_back(Elt: Op.getOperand(i: 0)); // chain

  // Select the concrete opcode for the newest encoding this subtarget has.
  int NumVAddrDwords =
      UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
  int Opcode = -1;

  if (IsGFX12Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (Subtarget->hasGFX90AInsts()) {
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
                                     NumVDataDwords, NumVAddrDwords);
      if (Opcode == -1)
        report_fatal_error(
            reason: "requested image instruction is not supported on this GPU" );
    }
    if (Opcode == -1 &&
        Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  if (Opcode == -1)
    return Op;

  MachineSDNode *NewNode = DAG.getMachineNode(Opcode, dl: DL, ResultTys: ResultTypes, Ops);
  if (auto MemOp = dyn_cast<MemSDNode>(Val&: Op)) {
    MachineMemOperand *MemRef = MemOp->getMemOperand();
    DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
  }

  if (BaseOpcode->AtomicX2) {
    // Callers of a cmpswap only use the first element of the pair.
    SmallVector<SDValue, 1> Elt;
    DAG.ExtractVectorElements(Op: SDValue(NewNode, 0), Args&: Elt, Start: 0, Count: 1);
    return DAG.getMergeValues(Ops: {Elt[0], SDValue(NewNode, 1)}, dl: DL);
  }
  if (BaseOpcode->Store)
    return SDValue(NewNode, 0);
  return constructRetValue(DAG, Result: NewNode, ResultTypes: OrigResultTypes, IsTexFail,
                           Unpacked: Subtarget->hasUnpackedD16VMem(), IsD16, DMaskPop: DMaskLanes,
                           NumVDataDwords, IsAtomicPacked16Bit, DL);
}
7970 | |
// Lower an s.buffer.load intrinsic. A uniform offset is lowered to a scalar
// SBUFFER_LOAD node (with subword and vec3-widening special cases); a
// divergent offset falls back to MUBUF buffer loads, split into 16-byte
// pieces for 8- and 16-element vectors.
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
                                       SDValue Offset, SDValue CachePolicy,
                                       SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  const DataLayout &DataLayout = DAG.getDataLayout();
  Align Alignment =
      DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));

  // The load is invariant and dereferenceable: buffer contents do not change
  // underneath us during the function.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo(),
      F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      Size: VT.getStoreSize(), BaseAlignment: Alignment);

  if (!Offset->isDivergent()) {
    SDValue Ops[] = {Rsrc, Offset, CachePolicy};

    // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
    // s_buffer_load_u16 instruction is emitted for both signed and unsigned
    // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
    // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
    if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
      SDValue BufferLoad =
          DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
                                  DAG.getVTList(MVT::i32), Ops, VT, MMO);
      return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
    }

    // Widen vec3 load to vec4.
    if (VT.isVector() && VT.getVectorNumElements() == 3 &&
        !Subtarget->hasScalarDwordx3Loads()) {
      EVT WidenedVT =
          EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
      auto WidenedOp = DAG.getMemIntrinsicNode(
          Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL, VTList: DAG.getVTList(VT: WidenedVT), Ops, MemVT: WidenedVT,
          MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: WidenedVT.getStoreSize()));
      auto Subvector = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: WidenedOp,
                                   N2: DAG.getVectorIdxConstant(Val: 0, DL));
      return Subvector;
    }

    return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL,
                                   VTList: DAG.getVTList(VT), Ops, MemVT: VT, MMO);
  }

  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.
  // Slots 3-5 (voffset/soffset/offset) are filled in by setBufferOffsets.
  SDValue Ops[] = {
      DAG.getEntryNode(),                    // Chain
      Rsrc,                                  // rsrc
      DAG.getConstant(0, DL, MVT::i32),      // vindex
      {},                                    // voffset
      {},                                    // soffset
      {},                                    // offset
      CachePolicy,                           // cachepolicy
      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
  };
  if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
    setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  }

  SmallVector<SDValue, 4> Loads;
  unsigned NumLoads = 1;
  MVT LoadVT = VT.getSimpleVT();
  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
  assert((LoadVT.getScalarType() == MVT::i32 ||
          LoadVT.getScalarType() == MVT::f32));

  // Wide vectors are split into 4-element (16-byte) loads.
  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
    LoadVT = MVT::getVectorVT(VT: LoadVT.getScalarType(), NumElements: 4);
  }

  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3],
                   Alignment: NumLoads > 1 ? Align(16 * NumLoads) : Align(4));

  // Emit each piece at a 16-byte stride from the base immediate offset.
  uint64_t InstOffset = Ops[5]->getAsZExtVal();
  for (unsigned i = 0; i < NumLoads; ++i) {
    Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
    Loads.push_back(Elt: getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
                                        LoadVT, MMO, DAG));
  }

  if (NumElts == 8 || NumElts == 16)
    return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops: Loads);

  return Loads[0];
}
8065 | |
8066 | SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const { |
8067 | // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. |
8068 | if (!Subtarget->hasArchitectedSGPRs()) |
8069 | return {}; |
8070 | SDLoc SL(Op); |
8071 | MVT VT = MVT::i32; |
8072 | SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT); |
8073 | return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT, N1: TTMP8, |
8074 | N2: DAG.getConstant(Val: 25, DL: SL, VT), N3: DAG.getConstant(Val: 5, DL: SL, VT)); |
8075 | } |
8076 | |
8077 | SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, |
8078 | unsigned Dim, |
8079 | const ArgDescriptor &Arg) const { |
8080 | SDLoc SL(Op); |
8081 | MachineFunction &MF = DAG.getMachineFunction(); |
8082 | unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim); |
8083 | if (MaxID == 0) |
8084 | return DAG.getConstant(0, SL, MVT::i32); |
8085 | |
8086 | SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, |
8087 | SDLoc(DAG.getEntryNode()), Arg); |
8088 | |
8089 | // Don't bother inserting AssertZext for packed IDs since we're emitting the |
8090 | // masking operations anyway. |
8091 | // |
8092 | // TODO: We could assert the top bit is 0 for the source copy. |
8093 | if (Arg.isMasked()) |
8094 | return Val; |
8095 | |
8096 | // Preserve the known bits after expansion to a copy. |
8097 | EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: llvm::bit_width(Value: MaxID)); |
8098 | return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val, |
8099 | DAG.getValueType(SmallVT)); |
8100 | } |
8101 | |
/// Lower a side-effect-free AMDGPU intrinsic (ISD::INTRINSIC_WO_CHAIN) to
/// target-specific DAG nodes. Dispatches on the intrinsic ID found in
/// operand 0. Unknown IDs fall through to image-dim intrinsic lowering, or
/// are returned unchanged for pattern-based instruction selection.
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto MFI = MF.getInfo<SIMachineFunctionInfo>();

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);

  // TODO: Should this propagate fast-math-flags?

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
    if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
      return emitNonHSAIntrinsicError(DAG, DL, VT);
    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  }
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
    // These pointers only exist on HSA/Mesa targets; diagnose and return
    // undef elsewhere.
    if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
      DiagnosticInfoUnsupported BadIntrin(
          MF.getFunction(), "unsupported hsa intrinsic without hsa target" ,
          DL.getDebugLoc());
      DAG.getContext()->diagnose(DI: BadIntrin);
      return DAG.getUNDEF(VT);
    }

    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
      AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
    return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: RegID);
  }
  case Intrinsic::amdgcn_implicitarg_ptr: {
    if (MFI->isEntryFunction())
      return getImplicitArgPtr(DAG, SL: DL);
    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
    if (!AMDGPU::isKernel(CC: MF.getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      return DAG.getConstant(Val: 0, DL, VT);
    }

    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  }
  case Intrinsic::amdgcn_dispatch_id: {
    return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: AMDGPUFunctionArgInfo::DISPATCH_ID);
  }
  case Intrinsic::amdgcn_rcp:
    return DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT, Operand: Op.getOperand(i: 1));
  case Intrinsic::amdgcn_rsq:
    return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
  case Intrinsic::amdgcn_rsq_legacy:
    // The legacy opcodes were removed on VOLCANIC_ISLANDS and later.
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);
    return SDValue();
  case Intrinsic::amdgcn_rcp_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);
    return DAG.getNode(Opcode: AMDGPUISD::RCP_LEGACY, DL, VT, Operand: Op.getOperand(i: 1));
  case Intrinsic::amdgcn_rsq_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return DAG.getNode(Opcode: AMDGPUISD::RSQ_CLAMP, DL, VT, Operand: Op.getOperand(i: 1));

    // No native clamped rsq on newer targets: emulate with a plain rsq
    // clamped to [-largest-finite, +largest-finite] via fminnum/fmaxnum.
    Type *Type = VT.getTypeForEVT(Context&: *DAG.getContext());
    APFloat Max = APFloat::getLargest(Sem: Type->getFltSemantics());
    APFloat Min = APFloat::getLargest(Sem: Type->getFltSemantics(), Negative: true);

    SDValue Rsq = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
    SDValue Tmp = DAG.getNode(Opcode: ISD::FMINNUM, DL, VT, N1: Rsq,
                              N2: DAG.getConstantFP(Val: Max, DL, VT));
    return DAG.getNode(Opcode: ISD::FMAXNUM, DL, VT, N1: Tmp,
                       N2: DAG.getConstantFP(Val: Min, DL, VT));
  }
  // The r600_read_* intrinsics below are loads from fixed offsets in the
  // kernel input segment; they are invalid on HSA.
  case Intrinsic::r600_read_ngroups_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
                                    Offset: SI::KernelInputOffsets::NGROUPS_X, Alignment: Align(4),
                                    Signed: false);
  case Intrinsic::r600_read_ngroups_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
                                    Offset: SI::KernelInputOffsets::NGROUPS_Y, Alignment: Align(4),
                                    Signed: false);
  case Intrinsic::r600_read_ngroups_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
                                    Offset: SI::KernelInputOffsets::NGROUPS_Z, Alignment: Align(4),
                                    Signed: false);
  case Intrinsic::r600_read_global_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
                                    Offset: SI::KernelInputOffsets::GLOBAL_SIZE_X,
                                    Alignment: Align(4), Signed: false);
  case Intrinsic::r600_read_global_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
                                    Offset: SI::KernelInputOffsets::GLOBAL_SIZE_Y,
                                    Alignment: Align(4), Signed: false);
  case Intrinsic::r600_read_global_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
                                    Offset: SI::KernelInputOffsets::GLOBAL_SIZE_Z,
                                    Alignment: Align(4), Signed: false);
  // Local sizes are loaded as zero-extended i16 kernel inputs.
  case Intrinsic::r600_read_local_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_wave_id:
    return lowerWaveID(DAG, Op);
  case Intrinsic::amdgcn_lds_kernel_id: {
    if (MFI->isEntryFunction())
      return getLDSKernelId(DAG, SL: DL);
    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  }
  case Intrinsic::amdgcn_workitem_id_x:
    return lowerWorkitemID(DAG, Op, Dim: 0, Arg: MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
    return lowerWorkitemID(DAG, Op, Dim: 1, Arg: MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
    return lowerWorkitemID(DAG, Op, Dim: 2, Arg: MFI->getArgInfo().WorkItemIDZ);
  case Intrinsic::amdgcn_wavefrontsize:
    return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
                           SDLoc(Op), MVT::i32);
  case Intrinsic::amdgcn_s_buffer_load: {
    unsigned CPol = Op.getConstantOperandVal(i: 3);
    // s_buffer_load, because of how it's optimized, can't be volatile
    // so reject ones with the volatile bit set.
    if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
                     ? AMDGPU::CPol::ALL
                     : AMDGPU::CPol::ALL_pregfx12))
      return Op;
    return lowerSBuffer(VT, DL, Rsrc: Op.getOperand(i: 1), Offset: Op.getOperand(i: 2), CachePolicy: Op.getOperand(i: 3),
                        DAG);
  }
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_sin:
    return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL, VT, Operand: Op.getOperand(i: 1));

  case Intrinsic::amdgcn_cos:
    return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL, VT, Operand: Op.getOperand(i: 1));

  case Intrinsic::amdgcn_mul_u24:
    return DAG.getNode(Opcode: AMDGPUISD::MUL_U24, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
  case Intrinsic::amdgcn_mul_i24:
    return DAG.getNode(Opcode: AMDGPUISD::MUL_I24, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));

  case Intrinsic::amdgcn_log_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return SDValue();

    return emitRemovedIntrinsicError(DAG, DL, VT);
  }
  case Intrinsic::amdgcn_fract:
    return DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: Op.getOperand(i: 1));

  case Intrinsic::amdgcn_class:
    return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT,
                       N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
  case Intrinsic::amdgcn_div_fmas:
    return DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL, VT,
                       N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3),
                       N4: Op.getOperand(i: 4));

  case Intrinsic::amdgcn_div_fixup:
    return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL, VT,
                       N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));

  case Intrinsic::amdgcn_div_scale: {
    const ConstantSDNode *Param = cast<ConstantSDNode>(Val: Op.getOperand(i: 3));

    // Translate to the operands expected by the machine instruction. The
    // first parameter must be the same as the first instruction.
    SDValue Numerator = Op.getOperand(i: 1);
    SDValue Denominator = Op.getOperand(i: 2);

    // Note this order is opposite of the machine instruction's operations,
    // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    // intrinsic has the numerator as the first operand to match a normal
    // division operation.

    SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;

    return DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL, VTList: Op->getVTList(), N1: Src0,
                       N2: Denominator, N3: Numerator);
  }
  case Intrinsic::amdgcn_icmp: {
    // There is a Pat that handles this variant, so return it as-is.
    if (Op.getOperand(1).getValueType() == MVT::i1 &&
        Op.getConstantOperandVal(2) == 0 &&
        Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
      return Op;
    return lowerICMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_fcmp: {
    return lowerFCMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_ballot:
    return lowerBALLOTIntrinsic(TLI: *this, N: Op.getNode(), DAG);
  case Intrinsic::amdgcn_fmed3:
    return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL, VT,
                       N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
  case Intrinsic::amdgcn_fdot2:
    return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL, VT,
                       N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3),
                       N4: Op.getOperand(i: 4));
  case Intrinsic::amdgcn_fmul_legacy:
    return DAG.getNode(Opcode: AMDGPUISD::FMUL_LEGACY, DL, VT,
                       N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
  case Intrinsic::amdgcn_sffbh:
    return DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL, VT, Operand: Op.getOperand(i: 1));
  case Intrinsic::amdgcn_sbfe:
    return DAG.getNode(Opcode: AMDGPUISD::BFE_I32, DL, VT,
                       N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
  case Intrinsic::amdgcn_ubfe:
    return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL, VT,
                       N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    // FIXME: Stop adding cast if v2f16/v2i16 are legal.
    EVT VT = Op.getValueType();
    unsigned Opcode;

    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
    else
      Opcode = AMDGPUISD::CVT_PK_U16_U32;

    if (isTypeLegal(VT))
      return DAG.getNode(Opcode, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));

    // Illegal result type: build the node as i32 and bitcast to the
    // requested type.
    SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
                               Op.getOperand(1), Op.getOperand(2));
    return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Node);
  }
  case Intrinsic::amdgcn_fmad_ftz:
    return DAG.getNode(Opcode: AMDGPUISD::FMAD_FTZ, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));

  case Intrinsic::amdgcn_if_break:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
                                      Op->getOperand(1), Op->getOperand(2)), 0);

  case Intrinsic::amdgcn_groupstaticsize: {
    Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return Op;

    // Otherwise lower to an absolute relocation on a symbol named after the
    // intrinsic itself.
    const Module *M = MF.getFunction().getParent();
    const GlobalValue *GV =
        M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
                                            SIInstrInfo::MO_ABS32_LO);
    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    // Compare the high 32 bits of the flat pointer against the segment
    // aperture for the queried address space.
    SDLoc SL(Op);
    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    SDValue Aperture = getSegmentAperture(AS, DL: SL, DAG);
    SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
                                 Op.getOperand(1));

    SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
                                DAG.getConstant(1, SL, MVT::i32));
    return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
  }
  case Intrinsic::amdgcn_perm:
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_reloc_constant: {
    // Materialize (or reuse) an i32 global named by the metadata operand and
    // emit an absolute lo-32 relocation to it.
    Module *M = const_cast<Module *>(MF.getFunction().getParent());
    const MDNode *Metadata = cast<MDNodeSDNode>(Val: Op.getOperand(i: 1))->getMD();
    auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString();
    auto RelocSymbol = cast<GlobalVariable>(
        Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
    SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
                                            SIInstrInfo::MO_ABS32_LO);
    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
  }
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    // Legalize a non-i32 index key (operand 4) by any-extending/truncating
    // it to i32 and rebuilding the intrinsic node.
    if (Op.getOperand(4).getValueType() == MVT::i32)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
                       Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
                       Op.getOperand(i: 3), IndexKeyi32);
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    // Same index-key legalization as above, but the key is operand 6 for
    // these integer variants.
    if (Op.getOperand(6).getValueType() == MVT::i32)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
                       {Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
                        Op.getOperand(i: 3), Op.getOperand(i: 4), Op.getOperand(i: 5),
                        IndexKeyi32, Op.getOperand(i: 7)});
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return lowerADDRSPACECAST(Op, DAG);
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
      return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: false);

    return Op;
  }
}
8471 | |
8472 | // On targets not supporting constant in soffset field, turn zero to |
8473 | // SGPR_NULL to avoid generating an extra s_mov with zero. |
8474 | static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, |
8475 | const GCNSubtarget *Subtarget) { |
8476 | if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset)) |
8477 | return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32); |
8478 | return SOffset; |
8479 | } |
8480 | |
8481 | SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, |
8482 | SelectionDAG &DAG, |
8483 | unsigned NewOpcode) const { |
8484 | SDLoc DL(Op); |
8485 | |
8486 | SDValue VData = Op.getOperand(i: 2); |
8487 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG); |
8488 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG); |
8489 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget); |
8490 | SDValue Ops[] = { |
8491 | Op.getOperand(0), // Chain |
8492 | VData, // vdata |
8493 | Rsrc, // rsrc |
8494 | DAG.getConstant(0, DL, MVT::i32), // vindex |
8495 | Offsets.first, // voffset |
8496 | SOffset, // soffset |
8497 | Offsets.second, // offset |
8498 | Op.getOperand(6), // cachepolicy |
8499 | DAG.getTargetConstant(0, DL, MVT::i1), // idxen |
8500 | }; |
8501 | |
8502 | auto *M = cast<MemSDNode>(Val&: Op); |
8503 | |
8504 | EVT MemVT = VData.getValueType(); |
8505 | return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, |
8506 | M->getMemOperand()); |
8507 | } |
8508 | |
8509 | // Return a value to use for the idxen operand by examining the vindex operand. |
8510 | static unsigned getIdxEn(SDValue VIndex) { |
8511 | // No need to set idxen if vindex is known to be zero. |
8512 | return isNullConstant(V: VIndex) ? 0 : 1; |
8513 | } |
8514 | |
8515 | SDValue |
8516 | SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, |
8517 | unsigned NewOpcode) const { |
8518 | SDLoc DL(Op); |
8519 | |
8520 | SDValue VData = Op.getOperand(i: 2); |
8521 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG); |
8522 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG); |
8523 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget); |
8524 | SDValue Ops[] = { |
8525 | Op.getOperand(0), // Chain |
8526 | VData, // vdata |
8527 | Rsrc, // rsrc |
8528 | Op.getOperand(4), // vindex |
8529 | Offsets.first, // voffset |
8530 | SOffset, // soffset |
8531 | Offsets.second, // offset |
8532 | Op.getOperand(7), // cachepolicy |
8533 | DAG.getTargetConstant(1, DL, MVT::i1), // idxen |
8534 | }; |
8535 | |
8536 | auto *M = cast<MemSDNode>(Val&: Op); |
8537 | |
8538 | EVT MemVT = VData.getValueType(); |
8539 | return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, |
8540 | M->getMemOperand()); |
8541 | } |
8542 | |
8543 | SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, |
8544 | SelectionDAG &DAG) const { |
8545 | unsigned IntrID = Op.getConstantOperandVal(i: 1); |
8546 | SDLoc DL(Op); |
8547 | |
8548 | switch (IntrID) { |
8549 | case Intrinsic::amdgcn_ds_ordered_add: |
8550 | case Intrinsic::amdgcn_ds_ordered_swap: { |
8551 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
8552 | SDValue Chain = M->getOperand(Num: 0); |
8553 | SDValue M0 = M->getOperand(Num: 2); |
8554 | SDValue Value = M->getOperand(Num: 3); |
8555 | unsigned IndexOperand = M->getConstantOperandVal(Num: 7); |
8556 | unsigned WaveRelease = M->getConstantOperandVal(Num: 8); |
8557 | unsigned WaveDone = M->getConstantOperandVal(Num: 9); |
8558 | |
8559 | unsigned OrderedCountIndex = IndexOperand & 0x3f; |
8560 | IndexOperand &= ~0x3f; |
8561 | unsigned CountDw = 0; |
8562 | |
8563 | if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) { |
8564 | CountDw = (IndexOperand >> 24) & 0xf; |
8565 | IndexOperand &= ~(0xf << 24); |
8566 | |
8567 | if (CountDw < 1 || CountDw > 4) { |
8568 | report_fatal_error( |
8569 | reason: "ds_ordered_count: dword count must be between 1 and 4" ); |
8570 | } |
8571 | } |
8572 | |
8573 | if (IndexOperand) |
8574 | report_fatal_error(reason: "ds_ordered_count: bad index operand" ); |
8575 | |
8576 | if (WaveDone && !WaveRelease) |
8577 | report_fatal_error(reason: "ds_ordered_count: wave_done requires wave_release" ); |
8578 | |
8579 | unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1; |
8580 | unsigned ShaderType = |
8581 | SIInstrInfo::getDSShaderTypeValue(MF: DAG.getMachineFunction()); |
8582 | unsigned Offset0 = OrderedCountIndex << 2; |
8583 | unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4); |
8584 | |
8585 | if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) |
8586 | Offset1 |= (CountDw - 1) << 6; |
8587 | |
8588 | if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11) |
8589 | Offset1 |= ShaderType << 2; |
8590 | |
8591 | unsigned Offset = Offset0 | (Offset1 << 8); |
8592 | |
8593 | SDValue Ops[] = { |
8594 | Chain, |
8595 | Value, |
8596 | DAG.getTargetConstant(Offset, DL, MVT::i16), |
8597 | copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue |
8598 | }; |
8599 | return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL, |
8600 | M->getVTList(), Ops, M->getMemoryVT(), |
8601 | M->getMemOperand()); |
8602 | } |
8603 | case Intrinsic::amdgcn_ds_fadd: { |
8604 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
8605 | unsigned Opc; |
8606 | switch (IntrID) { |
8607 | case Intrinsic::amdgcn_ds_fadd: |
8608 | Opc = ISD::ATOMIC_LOAD_FADD; |
8609 | break; |
8610 | } |
8611 | |
8612 | return DAG.getAtomic(Opcode: Opc, dl: SDLoc(Op), MemVT: M->getMemoryVT(), |
8613 | Chain: M->getOperand(Num: 0), Ptr: M->getOperand(Num: 2), Val: M->getOperand(Num: 3), |
8614 | MMO: M->getMemOperand()); |
8615 | } |
8616 | case Intrinsic::amdgcn_ds_fmin: |
8617 | case Intrinsic::amdgcn_ds_fmax: { |
8618 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
8619 | unsigned Opc; |
8620 | switch (IntrID) { |
8621 | case Intrinsic::amdgcn_ds_fmin: |
8622 | Opc = AMDGPUISD::ATOMIC_LOAD_FMIN; |
8623 | break; |
8624 | case Intrinsic::amdgcn_ds_fmax: |
8625 | Opc = AMDGPUISD::ATOMIC_LOAD_FMAX; |
8626 | break; |
8627 | default: |
8628 | llvm_unreachable("Unknown intrinsic!" ); |
8629 | } |
8630 | SDValue Ops[] = { |
8631 | M->getOperand(Num: 0), // Chain |
8632 | M->getOperand(Num: 2), // Ptr |
8633 | M->getOperand(Num: 3) // Value |
8634 | }; |
8635 | |
8636 | return DAG.getMemIntrinsicNode(Opcode: Opc, dl: SDLoc(Op), VTList: M->getVTList(), Ops, |
8637 | MemVT: M->getMemoryVT(), MMO: M->getMemOperand()); |
8638 | } |
8639 | case Intrinsic::amdgcn_buffer_load: |
8640 | case Intrinsic::amdgcn_buffer_load_format: { |
8641 | unsigned Glc = Op.getConstantOperandVal(i: 5); |
8642 | unsigned Slc = Op.getConstantOperandVal(i: 6); |
8643 | unsigned IdxEn = getIdxEn(VIndex: Op.getOperand(i: 3)); |
8644 | SDValue Ops[] = { |
8645 | Op.getOperand(0), // Chain |
8646 | Op.getOperand(2), // rsrc |
8647 | Op.getOperand(3), // vindex |
8648 | SDValue(), // voffset -- will be set by setBufferOffsets |
8649 | SDValue(), // soffset -- will be set by setBufferOffsets |
8650 | SDValue(), // offset -- will be set by setBufferOffsets |
8651 | DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy |
8652 | DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen |
8653 | }; |
8654 | setBufferOffsets(CombinedOffset: Op.getOperand(i: 4), DAG, Offsets: &Ops[3]); |
8655 | |
8656 | unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? |
8657 | AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; |
8658 | |
8659 | EVT VT = Op.getValueType(); |
8660 | EVT IntVT = VT.changeTypeToInteger(); |
8661 | auto *M = cast<MemSDNode>(Val&: Op); |
8662 | EVT LoadVT = Op.getValueType(); |
8663 | |
8664 | if (LoadVT.getScalarType() == MVT::f16) |
8665 | return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, |
8666 | M, DAG, Ops); |
8667 | |
8668 | // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics |
8669 | if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16) |
8670 | return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, |
8671 | M->getMemOperand()); |
8672 | |
8673 | return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, |
8674 | M->getMemOperand(), DAG); |
8675 | } |
8676 | case Intrinsic::amdgcn_raw_buffer_load: |
8677 | case Intrinsic::amdgcn_raw_ptr_buffer_load: |
8678 | case Intrinsic::amdgcn_raw_buffer_load_format: |
8679 | case Intrinsic::amdgcn_raw_ptr_buffer_load_format: { |
8680 | const bool IsFormat = |
8681 | IntrID == Intrinsic::amdgcn_raw_buffer_load_format || |
8682 | IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format; |
8683 | |
8684 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG); |
8685 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG); |
8686 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget); |
8687 | SDValue Ops[] = { |
8688 | Op.getOperand(0), // Chain |
8689 | Rsrc, // rsrc |
8690 | DAG.getConstant(0, DL, MVT::i32), // vindex |
8691 | Offsets.first, // voffset |
8692 | SOffset, // soffset |
8693 | Offsets.second, // offset |
8694 | Op.getOperand(5), // cachepolicy, swizzled buffer |
8695 | DAG.getTargetConstant(0, DL, MVT::i1), // idxen |
8696 | }; |
8697 | |
8698 | auto *M = cast<MemSDNode>(Val&: Op); |
8699 | return lowerIntrinsicLoad(M, IsFormat, DAG, Ops); |
8700 | } |
8701 | case Intrinsic::amdgcn_struct_buffer_load: |
8702 | case Intrinsic::amdgcn_struct_ptr_buffer_load: |
8703 | case Intrinsic::amdgcn_struct_buffer_load_format: |
8704 | case Intrinsic::amdgcn_struct_ptr_buffer_load_format: { |
8705 | const bool IsFormat = |
8706 | IntrID == Intrinsic::amdgcn_struct_buffer_load_format || |
8707 | IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format; |
8708 | |
8709 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG); |
8710 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG); |
8711 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget); |
8712 | SDValue Ops[] = { |
8713 | Op.getOperand(0), // Chain |
8714 | Rsrc, // rsrc |
8715 | Op.getOperand(3), // vindex |
8716 | Offsets.first, // voffset |
8717 | SOffset, // soffset |
8718 | Offsets.second, // offset |
8719 | Op.getOperand(6), // cachepolicy, swizzled buffer |
8720 | DAG.getTargetConstant(1, DL, MVT::i1), // idxen |
8721 | }; |
8722 | |
8723 | return lowerIntrinsicLoad(cast<MemSDNode>(Val&: Op), IsFormat, DAG, Ops); |
8724 | } |
8725 | case Intrinsic::amdgcn_tbuffer_load: { |
8726 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
8727 | EVT LoadVT = Op.getValueType(); |
8728 | |
8729 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget); |
8730 | unsigned Dfmt = Op.getConstantOperandVal(i: 7); |
8731 | unsigned Nfmt = Op.getConstantOperandVal(i: 8); |
8732 | unsigned Glc = Op.getConstantOperandVal(i: 9); |
8733 | unsigned Slc = Op.getConstantOperandVal(i: 10); |
8734 | unsigned IdxEn = getIdxEn(VIndex: Op.getOperand(i: 3)); |
8735 | SDValue Ops[] = { |
8736 | Op.getOperand(0), // Chain |
8737 | Op.getOperand(2), // rsrc |
8738 | Op.getOperand(3), // vindex |
8739 | Op.getOperand(4), // voffset |
8740 | SOffset, // soffset |
8741 | Op.getOperand(6), // offset |
8742 | DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format |
8743 | DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy |
8744 | DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen |
8745 | }; |
8746 | |
8747 | if (LoadVT.getScalarType() == MVT::f16) |
8748 | return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, |
8749 | M, DAG, Ops); |
8750 | return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, |
8751 | Op->getVTList(), Ops, LoadVT, M->getMemOperand(), |
8752 | DAG); |
8753 | } |
8754 | case Intrinsic::amdgcn_raw_tbuffer_load: |
8755 | case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { |
8756 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
8757 | EVT LoadVT = Op.getValueType(); |
8758 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG); |
8759 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG); |
8760 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget); |
8761 | |
8762 | SDValue Ops[] = { |
8763 | Op.getOperand(0), // Chain |
8764 | Rsrc, // rsrc |
8765 | DAG.getConstant(0, DL, MVT::i32), // vindex |
8766 | Offsets.first, // voffset |
8767 | SOffset, // soffset |
8768 | Offsets.second, // offset |
8769 | Op.getOperand(5), // format |
8770 | Op.getOperand(6), // cachepolicy, swizzled buffer |
8771 | DAG.getTargetConstant(0, DL, MVT::i1), // idxen |
8772 | }; |
8773 | |
8774 | if (LoadVT.getScalarType() == MVT::f16) |
8775 | return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, |
8776 | M, DAG, Ops); |
8777 | return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, |
8778 | Op->getVTList(), Ops, LoadVT, M->getMemOperand(), |
8779 | DAG); |
8780 | } |
8781 | case Intrinsic::amdgcn_struct_tbuffer_load: |
8782 | case Intrinsic::amdgcn_struct_ptr_tbuffer_load: { |
8783 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
8784 | EVT LoadVT = Op.getValueType(); |
8785 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG); |
8786 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG); |
8787 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget); |
8788 | |
8789 | SDValue Ops[] = { |
8790 | Op.getOperand(0), // Chain |
8791 | Rsrc, // rsrc |
8792 | Op.getOperand(3), // vindex |
8793 | Offsets.first, // voffset |
8794 | SOffset, // soffset |
8795 | Offsets.second, // offset |
8796 | Op.getOperand(6), // format |
8797 | Op.getOperand(7), // cachepolicy, swizzled buffer |
8798 | DAG.getTargetConstant(1, DL, MVT::i1), // idxen |
8799 | }; |
8800 | |
8801 | if (LoadVT.getScalarType() == MVT::f16) |
8802 | return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, |
8803 | M, DAG, Ops); |
8804 | return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, |
8805 | Op->getVTList(), Ops, LoadVT, M->getMemOperand(), |
8806 | DAG); |
8807 | } |
8808 | case Intrinsic::amdgcn_buffer_atomic_swap: |
8809 | case Intrinsic::amdgcn_buffer_atomic_add: |
8810 | case Intrinsic::amdgcn_buffer_atomic_sub: |
8811 | case Intrinsic::amdgcn_buffer_atomic_csub: |
8812 | case Intrinsic::amdgcn_buffer_atomic_smin: |
8813 | case Intrinsic::amdgcn_buffer_atomic_umin: |
8814 | case Intrinsic::amdgcn_buffer_atomic_smax: |
8815 | case Intrinsic::amdgcn_buffer_atomic_umax: |
8816 | case Intrinsic::amdgcn_buffer_atomic_and: |
8817 | case Intrinsic::amdgcn_buffer_atomic_or: |
8818 | case Intrinsic::amdgcn_buffer_atomic_xor: |
8819 | case Intrinsic::amdgcn_buffer_atomic_fadd: { |
8820 | unsigned Slc = Op.getConstantOperandVal(i: 6); |
8821 | unsigned IdxEn = getIdxEn(VIndex: Op.getOperand(i: 4)); |
8822 | SDValue Ops[] = { |
8823 | Op.getOperand(0), // Chain |
8824 | Op.getOperand(2), // vdata |
8825 | Op.getOperand(3), // rsrc |
8826 | Op.getOperand(4), // vindex |
8827 | SDValue(), // voffset -- will be set by setBufferOffsets |
8828 | SDValue(), // soffset -- will be set by setBufferOffsets |
8829 | SDValue(), // offset -- will be set by setBufferOffsets |
8830 | DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy |
8831 | DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen |
8832 | }; |
8833 | setBufferOffsets(CombinedOffset: Op.getOperand(i: 5), DAG, Offsets: &Ops[4]); |
8834 | |
8835 | EVT VT = Op.getValueType(); |
8836 | |
8837 | auto *M = cast<MemSDNode>(Val&: Op); |
8838 | unsigned Opcode = 0; |
8839 | |
8840 | switch (IntrID) { |
8841 | case Intrinsic::amdgcn_buffer_atomic_swap: |
8842 | Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP; |
8843 | break; |
8844 | case Intrinsic::amdgcn_buffer_atomic_add: |
8845 | Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD; |
8846 | break; |
8847 | case Intrinsic::amdgcn_buffer_atomic_sub: |
8848 | Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB; |
8849 | break; |
8850 | case Intrinsic::amdgcn_buffer_atomic_csub: |
8851 | Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB; |
8852 | break; |
8853 | case Intrinsic::amdgcn_buffer_atomic_smin: |
8854 | Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN; |
8855 | break; |
8856 | case Intrinsic::amdgcn_buffer_atomic_umin: |
8857 | Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN; |
8858 | break; |
8859 | case Intrinsic::amdgcn_buffer_atomic_smax: |
8860 | Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX; |
8861 | break; |
8862 | case Intrinsic::amdgcn_buffer_atomic_umax: |
8863 | Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX; |
8864 | break; |
8865 | case Intrinsic::amdgcn_buffer_atomic_and: |
8866 | Opcode = AMDGPUISD::BUFFER_ATOMIC_AND; |
8867 | break; |
8868 | case Intrinsic::amdgcn_buffer_atomic_or: |
8869 | Opcode = AMDGPUISD::BUFFER_ATOMIC_OR; |
8870 | break; |
8871 | case Intrinsic::amdgcn_buffer_atomic_xor: |
8872 | Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; |
8873 | break; |
8874 | case Intrinsic::amdgcn_buffer_atomic_fadd: |
8875 | Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD; |
8876 | break; |
8877 | default: |
8878 | llvm_unreachable("unhandled atomic opcode" ); |
8879 | } |
8880 | |
8881 | return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, |
8882 | M->getMemOperand()); |
8883 | } |
8884 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: |
8885 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: |
8886 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD); |
8887 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: |
8888 | return lowerRawBufferAtomicIntrin(Op, DAG, |
8889 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD_BF16); |
8890 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: |
8891 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: |
8892 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD); |
8893 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: |
8894 | return lowerStructBufferAtomicIntrin(Op, DAG, |
8895 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD_BF16); |
8896 | case Intrinsic::amdgcn_raw_buffer_atomic_fmin: |
8897 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: |
8898 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN); |
8899 | case Intrinsic::amdgcn_struct_buffer_atomic_fmin: |
8900 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: |
8901 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN); |
8902 | case Intrinsic::amdgcn_raw_buffer_atomic_fmax: |
8903 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: |
8904 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX); |
8905 | case Intrinsic::amdgcn_struct_buffer_atomic_fmax: |
8906 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: |
8907 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX); |
8908 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: |
8909 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: |
8910 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP); |
8911 | case Intrinsic::amdgcn_raw_buffer_atomic_add: |
8912 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: |
8913 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD); |
8914 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: |
8915 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: |
8916 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB); |
8917 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: |
8918 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: |
8919 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN); |
8920 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: |
8921 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: |
8922 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN); |
8923 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: |
8924 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: |
8925 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX); |
8926 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: |
8927 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: |
8928 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX); |
8929 | case Intrinsic::amdgcn_raw_buffer_atomic_and: |
8930 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: |
8931 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND); |
8932 | case Intrinsic::amdgcn_raw_buffer_atomic_or: |
8933 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: |
8934 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR); |
8935 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: |
8936 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: |
8937 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR); |
8938 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: |
8939 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: |
8940 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC); |
8941 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: |
8942 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: |
8943 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC); |
8944 | case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: |
8945 | return lowerRawBufferAtomicIntrin(Op, DAG, |
8946 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); |
8947 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: |
8948 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: |
8949 | return lowerStructBufferAtomicIntrin(Op, DAG, |
8950 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP); |
8951 | case Intrinsic::amdgcn_struct_buffer_atomic_add: |
8952 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: |
8953 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD); |
8954 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: |
8955 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: |
8956 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB); |
8957 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: |
8958 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: |
8959 | return lowerStructBufferAtomicIntrin(Op, DAG, |
8960 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN); |
8961 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: |
8962 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: |
8963 | return lowerStructBufferAtomicIntrin(Op, DAG, |
8964 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN); |
8965 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: |
8966 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: |
8967 | return lowerStructBufferAtomicIntrin(Op, DAG, |
8968 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX); |
8969 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: |
8970 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: |
8971 | return lowerStructBufferAtomicIntrin(Op, DAG, |
8972 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX); |
8973 | case Intrinsic::amdgcn_struct_buffer_atomic_and: |
8974 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: |
8975 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND); |
8976 | case Intrinsic::amdgcn_struct_buffer_atomic_or: |
8977 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: |
8978 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR); |
8979 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: |
8980 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: |
8981 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR); |
8982 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: |
8983 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: |
8984 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC); |
8985 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: |
8986 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: |
8987 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC); |
8988 | case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: |
8989 | return lowerStructBufferAtomicIntrin(Op, DAG, |
8990 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); |
8991 | |
8992 | case Intrinsic::amdgcn_buffer_atomic_cmpswap: { |
8993 | unsigned Slc = Op.getConstantOperandVal(i: 7); |
8994 | unsigned IdxEn = getIdxEn(VIndex: Op.getOperand(i: 5)); |
8995 | SDValue Ops[] = { |
8996 | Op.getOperand(0), // Chain |
8997 | Op.getOperand(2), // src |
8998 | Op.getOperand(3), // cmp |
8999 | Op.getOperand(4), // rsrc |
9000 | Op.getOperand(5), // vindex |
9001 | SDValue(), // voffset -- will be set by setBufferOffsets |
9002 | SDValue(), // soffset -- will be set by setBufferOffsets |
9003 | SDValue(), // offset -- will be set by setBufferOffsets |
9004 | DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy |
9005 | DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen |
9006 | }; |
9007 | setBufferOffsets(CombinedOffset: Op.getOperand(i: 6), DAG, Offsets: &Ops[5]); |
9008 | |
9009 | EVT VT = Op.getValueType(); |
9010 | auto *M = cast<MemSDNode>(Val&: Op); |
9011 | |
9012 | return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, |
9013 | Op->getVTList(), Ops, VT, M->getMemOperand()); |
9014 | } |
9015 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: |
9016 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: { |
9017 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 4), DAG); |
9018 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG); |
9019 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget); |
9020 | SDValue Ops[] = { |
9021 | Op.getOperand(0), // Chain |
9022 | Op.getOperand(2), // src |
9023 | Op.getOperand(3), // cmp |
9024 | Rsrc, // rsrc |
9025 | DAG.getConstant(0, DL, MVT::i32), // vindex |
9026 | Offsets.first, // voffset |
9027 | SOffset, // soffset |
9028 | Offsets.second, // offset |
9029 | Op.getOperand(7), // cachepolicy |
9030 | DAG.getTargetConstant(0, DL, MVT::i1), // idxen |
9031 | }; |
9032 | EVT VT = Op.getValueType(); |
9033 | auto *M = cast<MemSDNode>(Val&: Op); |
9034 | |
9035 | return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, |
9036 | Op->getVTList(), Ops, VT, M->getMemOperand()); |
9037 | } |
9038 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: |
9039 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: { |
9040 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op->getOperand(Num: 4), DAG); |
9041 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 6), DAG); |
9042 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 7), DAG, Subtarget); |
9043 | SDValue Ops[] = { |
9044 | Op.getOperand(0), // Chain |
9045 | Op.getOperand(2), // src |
9046 | Op.getOperand(3), // cmp |
9047 | Rsrc, // rsrc |
9048 | Op.getOperand(5), // vindex |
9049 | Offsets.first, // voffset |
9050 | SOffset, // soffset |
9051 | Offsets.second, // offset |
9052 | Op.getOperand(8), // cachepolicy |
9053 | DAG.getTargetConstant(1, DL, MVT::i1), // idxen |
9054 | }; |
9055 | EVT VT = Op.getValueType(); |
9056 | auto *M = cast<MemSDNode>(Val&: Op); |
9057 | |
9058 | return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, |
9059 | Op->getVTList(), Ops, VT, M->getMemOperand()); |
9060 | } |
9061 | case Intrinsic::amdgcn_image_bvh_intersect_ray: { |
9062 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
9063 | SDValue NodePtr = M->getOperand(Num: 2); |
9064 | SDValue RayExtent = M->getOperand(Num: 3); |
9065 | SDValue RayOrigin = M->getOperand(Num: 4); |
9066 | SDValue RayDir = M->getOperand(Num: 5); |
9067 | SDValue RayInvDir = M->getOperand(Num: 6); |
9068 | SDValue TDescr = M->getOperand(Num: 7); |
9069 | |
9070 | assert(NodePtr.getValueType() == MVT::i32 || |
9071 | NodePtr.getValueType() == MVT::i64); |
9072 | assert(RayDir.getValueType() == MVT::v3f16 || |
9073 | RayDir.getValueType() == MVT::v3f32); |
9074 | |
9075 | if (!Subtarget->hasGFX10_AEncoding()) { |
9076 | emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType()); |
9077 | return SDValue(); |
9078 | } |
9079 | |
9080 | const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget); |
9081 | const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); |
9082 | const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); |
9083 | const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; |
9084 | const bool Is64 = NodePtr.getValueType() == MVT::i64; |
9085 | const unsigned NumVDataDwords = 4; |
9086 | const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); |
9087 | const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; |
9088 | const bool UseNSA = (Subtarget->hasNSAEncoding() && |
9089 | NumVAddrs <= Subtarget->getNSAMaxSize()) || |
9090 | IsGFX12Plus; |
9091 | const unsigned BaseOpcodes[2][2] = { |
9092 | {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, |
9093 | {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, |
9094 | AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; |
9095 | int Opcode; |
9096 | if (UseNSA) { |
9097 | Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], |
9098 | IsGFX12Plus ? AMDGPU::MIMGEncGfx12 |
9099 | : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA |
9100 | : AMDGPU::MIMGEncGfx10NSA, |
9101 | NumVDataDwords, NumVAddrDwords); |
9102 | } else { |
9103 | assert(!IsGFX12Plus); |
9104 | Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], |
9105 | IsGFX11 ? AMDGPU::MIMGEncGfx11Default |
9106 | : AMDGPU::MIMGEncGfx10Default, |
9107 | NumVDataDwords, NumVAddrDwords); |
9108 | } |
9109 | assert(Opcode != -1); |
9110 | |
9111 | SmallVector<SDValue, 16> Ops; |
9112 | |
9113 | auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) { |
9114 | SmallVector<SDValue, 3> Lanes; |
9115 | DAG.ExtractVectorElements(Op, Args&: Lanes, Start: 0, Count: 3); |
9116 | if (Lanes[0].getValueSizeInBits() == 32) { |
9117 | for (unsigned I = 0; I < 3; ++I) |
9118 | Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I])); |
9119 | } else { |
9120 | if (IsAligned) { |
9121 | Ops.push_back( |
9122 | DAG.getBitcast(MVT::i32, |
9123 | DAG.getBuildVector(MVT::v2f16, DL, |
9124 | { Lanes[0], Lanes[1] }))); |
9125 | Ops.push_back(Elt: Lanes[2]); |
9126 | } else { |
9127 | SDValue Elt0 = Ops.pop_back_val(); |
9128 | Ops.push_back( |
9129 | DAG.getBitcast(MVT::i32, |
9130 | DAG.getBuildVector(MVT::v2f16, DL, |
9131 | { Elt0, Lanes[0] }))); |
9132 | Ops.push_back( |
9133 | DAG.getBitcast(MVT::i32, |
9134 | DAG.getBuildVector(MVT::v2f16, DL, |
9135 | { Lanes[1], Lanes[2] }))); |
9136 | } |
9137 | } |
9138 | }; |
9139 | |
9140 | if (UseNSA && IsGFX11Plus) { |
9141 | Ops.push_back(Elt: NodePtr); |
9142 | Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); |
9143 | Ops.push_back(Elt: RayOrigin); |
9144 | if (IsA16) { |
9145 | SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes; |
9146 | DAG.ExtractVectorElements(Op: RayDir, Args&: DirLanes, Start: 0, Count: 3); |
9147 | DAG.ExtractVectorElements(Op: RayInvDir, Args&: InvDirLanes, Start: 0, Count: 3); |
9148 | for (unsigned I = 0; I < 3; ++I) { |
9149 | MergedLanes.push_back(DAG.getBitcast( |
9150 | MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, |
9151 | {DirLanes[I], InvDirLanes[I]}))); |
9152 | } |
9153 | Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes)); |
9154 | } else { |
9155 | Ops.push_back(Elt: RayDir); |
9156 | Ops.push_back(Elt: RayInvDir); |
9157 | } |
9158 | } else { |
9159 | if (Is64) |
9160 | DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, |
9161 | 2); |
9162 | else |
9163 | Ops.push_back(Elt: NodePtr); |
9164 | |
9165 | Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); |
9166 | packLanes(RayOrigin, true); |
9167 | packLanes(RayDir, true); |
9168 | packLanes(RayInvDir, false); |
9169 | } |
9170 | |
9171 | if (!UseNSA) { |
9172 | // Build a single vector containing all the operands so far prepared. |
9173 | if (NumVAddrDwords > 12) { |
9174 | SDValue Undef = DAG.getUNDEF(MVT::i32); |
9175 | Ops.append(NumInputs: 16 - Ops.size(), Elt: Undef); |
9176 | } |
9177 | assert(Ops.size() >= 8 && Ops.size() <= 12); |
9178 | SDValue MergedOps = DAG.getBuildVector( |
9179 | MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops); |
9180 | Ops.clear(); |
9181 | Ops.push_back(Elt: MergedOps); |
9182 | } |
9183 | |
9184 | Ops.push_back(Elt: TDescr); |
9185 | Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1)); |
9186 | Ops.push_back(Elt: M->getChain()); |
9187 | |
9188 | auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops); |
9189 | MachineMemOperand *MemRef = M->getMemOperand(); |
9190 | DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef}); |
9191 | return SDValue(NewNode, 0); |
9192 | } |
9193 | case Intrinsic::amdgcn_global_atomic_fmin: |
9194 | case Intrinsic::amdgcn_global_atomic_fmax: |
9195 | case Intrinsic::amdgcn_global_atomic_fmin_num: |
9196 | case Intrinsic::amdgcn_global_atomic_fmax_num: |
9197 | case Intrinsic::amdgcn_flat_atomic_fmin: |
9198 | case Intrinsic::amdgcn_flat_atomic_fmax: |
9199 | case Intrinsic::amdgcn_flat_atomic_fmin_num: |
9200 | case Intrinsic::amdgcn_flat_atomic_fmax_num: { |
9201 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
9202 | SDValue Ops[] = { |
9203 | M->getOperand(Num: 0), // Chain |
9204 | M->getOperand(Num: 2), // Ptr |
9205 | M->getOperand(Num: 3) // Value |
9206 | }; |
9207 | unsigned Opcode = 0; |
9208 | switch (IntrID) { |
9209 | case Intrinsic::amdgcn_global_atomic_fmin: |
9210 | case Intrinsic::amdgcn_global_atomic_fmin_num: |
9211 | case Intrinsic::amdgcn_flat_atomic_fmin: |
9212 | case Intrinsic::amdgcn_flat_atomic_fmin_num: { |
9213 | Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN; |
9214 | break; |
9215 | } |
9216 | case Intrinsic::amdgcn_global_atomic_fmax: |
9217 | case Intrinsic::amdgcn_global_atomic_fmax_num: |
9218 | case Intrinsic::amdgcn_flat_atomic_fmax: |
9219 | case Intrinsic::amdgcn_flat_atomic_fmax_num: { |
9220 | Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX; |
9221 | break; |
9222 | } |
9223 | default: |
9224 | llvm_unreachable("unhandled atomic opcode" ); |
9225 | } |
9226 | return DAG.getMemIntrinsicNode(Opcode, dl: SDLoc(Op), |
9227 | VTList: M->getVTList(), Ops, MemVT: M->getMemoryVT(), |
9228 | MMO: M->getMemOperand()); |
9229 | } |
9230 | case Intrinsic::amdgcn_s_get_barrier_state: { |
9231 | SDValue Chain = Op->getOperand(Num: 0); |
9232 | SmallVector<SDValue, 2> Ops; |
9233 | unsigned Opc; |
9234 | bool IsInlinableBarID = false; |
9235 | int64_t BarID; |
9236 | |
9237 | if (isa<ConstantSDNode>(Val: Op->getOperand(Num: 2))) { |
9238 | BarID = cast<ConstantSDNode>(Val: Op->getOperand(Num: 2))->getSExtValue(); |
9239 | IsInlinableBarID = AMDGPU::isInlinableIntLiteral(Literal: BarID); |
9240 | } |
9241 | |
9242 | if (IsInlinableBarID) { |
9243 | Opc = AMDGPU::S_GET_BARRIER_STATE_IMM; |
9244 | SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); |
9245 | Ops.push_back(Elt: K); |
9246 | } else { |
9247 | Opc = AMDGPU::S_GET_BARRIER_STATE_M0; |
9248 | SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 2)); |
9249 | Ops.push_back(Elt: M0Val.getValue(R: 0)); |
9250 | } |
9251 | |
9252 | auto NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops); |
9253 | return SDValue(NewMI, 0); |
9254 | } |
9255 | default: |
9256 | |
9257 | if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = |
9258 | AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID)) |
9259 | return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true); |
9260 | |
9261 | return SDValue(); |
9262 | } |
9263 | } |
9264 | |
// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI and handle TFE loads.
//
// \p VTList has either two VTs ({value, chain}) or three for a TFE load
// ({value, status, chain}), where the extra i32 "status" result comes from
// the texture-fail-enable dword appended to the loaded data.
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {
  LLVMContext &C = *DAG.getContext();
  MachineFunction &MF = DAG.getMachineFunction();
  EVT VT = VTList.VTs[0];

  assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
  bool IsTFE = VTList.NumVTs == 3;
  if (IsTFE) {
    // A TFE load writes the value dwords followed by one extra status dword.
    // Lower it as a single load of NumValueDWords + 1 i32s, then split that
    // result back into the value (bitcast to VT) and the status dword.
    unsigned NumValueDWords = divideCeil(Numerator: VT.getSizeInBits(), Denominator: 32);
    unsigned NumOpDWords = NumValueDWords + 1;
    EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
    SDVTList OpDWordsVTList = DAG.getVTList(VT1: OpDWordsVT, VT2: VTList.VTs[2]);
    // Widen the memory operand to cover the extra status dword as well.
    MachineMemOperand *OpDWordsMMO =
        MF.getMachineMemOperand(MMO, Offset: 0, Size: NumOpDWords * 4);
    // Recurse with the two-VT form so the dwordx3 widening below can still
    // apply to the combined load.
    SDValue Op = getMemIntrinsicNode(Opcode, DL, VTList: OpDWordsVTList, Ops,
                                     MemVT: OpDWordsVT, MMO: OpDWordsMMO, DAG);
    // The status dword is the last element of the loaded vector.
    SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                                 DAG.getVectorIdxConstant(NumValueDWords, DL));
    SDValue ZeroIdx = DAG.getVectorIdxConstant(Val: 0, DL);
    // Extract the value portion: a single dword as a scalar, otherwise as a
    // leading subvector.
    SDValue ValueDWords =
        NumValueDWords == 1
            ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
            : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
                          EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
                          ZeroIdx);
    SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: ValueDWords);
    return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
  }

  // Subtargets without dwordx3 load/store instructions must widen a 3-element
  // load to 4 elements and then drop the extra lane.
  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    EVT WidenedVT = EVT::getVectorVT(Context&: C, VT: VT.getVectorElementType(), NumElements: 4);
    EVT WidenedMemVT = EVT::getVectorVT(Context&: C, VT: MemVT.getVectorElementType(), NumElements: 4);
    // 4 dwords = 16 bytes for the widened access.
    MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 16);
    SDVTList WidenedVTList = DAG.getVTList(VT1: WidenedVT, VT2: VTList.VTs[1]);
    SDValue Op = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: WidenedVTList, Ops,
                                         MemVT: WidenedMemVT, MMO: WidenedMMO);
    // Keep only the first three elements of the widened result.
    SDValue Value = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Op,
                                N2: DAG.getVectorIdxConstant(Val: 0, DL));
    return DAG.getMergeValues(Ops: {Value, SDValue(Op.getNode(), 1)}, dl: DL);
  }

  // No fixup needed; emit the memory intrinsic node directly.
  return DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList, Ops, MemVT, MMO);
}
9315 | |
// Adjust a d16 (f16/i16) store data operand \p VData into the layout the
// subtarget's buffer/image store instructions expect. \p ImageStore selects
// the image-store-specific workaround path. Returns the (possibly rewritten)
// data value.
SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
                                         bool ImageStore) const {
  EVT StoreVT = VData.getValueType();

  // No change for f16 and legal vector D16 types.
  if (!StoreVT.isVector())
    return VData;

  SDLoc DL(VData);
  unsigned NumElements = StoreVT.getVectorNumElements();

  if (Subtarget->hasUnpackedD16VMem()) {
    // We need to unpack the packed data to store.
    // Each 16-bit element is zero-extended into its own 32-bit lane.
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);

    EVT EquivStoreVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
    SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: EquivStoreVT, Operand: IntVData);
    // Unroll so each lane is extended individually rather than as one vector
    // op.
    return DAG.UnrollVectorOp(N: ZExt.getNode());
  }

  // The sq block of gfx8.1 does not estimate register use correctly for d16
  // image store instructions. The data operand is computed as if it were not a
  // d16 image instruction.
  if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
    // Bitcast to i16
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);

    // Decompose into scalars
    SmallVector<SDValue, 4> Elts;
    DAG.ExtractVectorElements(Op: IntVData, Args&: Elts);

    // Group pairs of i16 into v2i16 and bitcast to i32
    SmallVector<SDValue, 4> PackedElts;
    for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
      SDValue Pair =
          DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
      SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
      PackedElts.push_back(Elt: IntPair);
    }
    if ((NumElements % 2) == 1) {
      // Handle v3i16
      // The odd trailing element is paired with an undef upper half.
      unsigned I = Elts.size() / 2;
      SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
                                        {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
      SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
      PackedElts.push_back(Elt: IntPair);
    }

    // Pad using UNDEF
    // NOTE(review): resizing to Elts.size() i32 lanes doubles the register
    // footprint, which is the point of this workaround — the hardware counts
    // registers as if the store were not d16.
    PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));

    // Build final vector
    EVT VecVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
    return DAG.getBuildVector(VT: VecVT, DL, Ops: PackedElts);
  }

  if (NumElements == 3) {
    // Widen an illegal 3-element vector to 4 elements by zero-extending the
    // whole value as one integer, then bitcasting to the wider vector type.
    EVT IntStoreVT =
        EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: StoreVT.getStoreSizeInBits());
    SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);

    EVT WidenedStoreVT = EVT::getVectorVT(
        Context&: *DAG.getContext(), VT: StoreVT.getVectorElementType(), NumElements: NumElements + 1);
    EVT WidenedIntVT = EVT::getIntegerVT(Context&: *DAG.getContext(),
                                         BitWidth: WidenedStoreVT.getStoreSizeInBits());
    SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: WidenedIntVT, Operand: IntVData);
    return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WidenedStoreVT, Operand: ZExt);
  }

  // Anything else must already be a legal packed D16 type; pass it through.
  assert(isTypeLegal(StoreVT));
  return VData;
}
9392 | |
9393 | SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, |
9394 | SelectionDAG &DAG) const { |
9395 | SDLoc DL(Op); |
9396 | SDValue Chain = Op.getOperand(i: 0); |
9397 | unsigned IntrinsicID = Op.getConstantOperandVal(i: 1); |
9398 | MachineFunction &MF = DAG.getMachineFunction(); |
9399 | |
9400 | switch (IntrinsicID) { |
9401 | case Intrinsic::amdgcn_exp_compr: { |
9402 | if (!Subtarget->hasCompressedExport()) { |
9403 | DiagnosticInfoUnsupported BadIntrin( |
9404 | DAG.getMachineFunction().getFunction(), |
9405 | "intrinsic not supported on subtarget" , DL.getDebugLoc()); |
9406 | DAG.getContext()->diagnose(DI: BadIntrin); |
9407 | } |
9408 | SDValue Src0 = Op.getOperand(i: 4); |
9409 | SDValue Src1 = Op.getOperand(i: 5); |
9410 | // Hack around illegal type on SI by directly selecting it. |
9411 | if (isTypeLegal(VT: Src0.getValueType())) |
9412 | return SDValue(); |
9413 | |
9414 | const ConstantSDNode *Done = cast<ConstantSDNode>(Val: Op.getOperand(i: 6)); |
9415 | SDValue Undef = DAG.getUNDEF(MVT::f32); |
9416 | const SDValue Ops[] = { |
9417 | Op.getOperand(2), // tgt |
9418 | DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0 |
9419 | DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1 |
9420 | Undef, // src2 |
9421 | Undef, // src3 |
9422 | Op.getOperand(7), // vm |
9423 | DAG.getTargetConstant(1, DL, MVT::i1), // compr |
9424 | Op.getOperand(3), // en |
9425 | Op.getOperand(0) // Chain |
9426 | }; |
9427 | |
9428 | unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE; |
9429 | return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0); |
9430 | } |
9431 | case Intrinsic::amdgcn_s_barrier: { |
9432 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
9433 | if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) { |
9434 | unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; |
9435 | if (WGSize <= ST.getWavefrontSize()) |
9436 | return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other, |
9437 | Op.getOperand(0)), 0); |
9438 | } |
9439 | |
9440 | // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait |
9441 | if (ST.hasSplitBarriers()) { |
9442 | SDValue K = |
9443 | DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32); |
9444 | SDValue BarSignal = |
9445 | SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL, |
9446 | MVT::Other, K, Op.getOperand(0)), |
9447 | 0); |
9448 | SDValue BarWait = |
9449 | SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K, |
9450 | BarSignal.getValue(0)), |
9451 | 0); |
9452 | return BarWait; |
9453 | } |
9454 | |
9455 | return SDValue(); |
9456 | }; |
9457 | case Intrinsic::amdgcn_tbuffer_store: { |
9458 | SDValue VData = Op.getOperand(i: 2); |
9459 | bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); |
9460 | if (IsD16) |
9461 | VData = handleD16VData(VData, DAG); |
9462 | unsigned Dfmt = Op.getConstantOperandVal(i: 8); |
9463 | unsigned Nfmt = Op.getConstantOperandVal(i: 9); |
9464 | unsigned Glc = Op.getConstantOperandVal(i: 10); |
9465 | unsigned Slc = Op.getConstantOperandVal(i: 11); |
9466 | unsigned IdxEn = getIdxEn(VIndex: Op.getOperand(i: 4)); |
9467 | SDValue Ops[] = { |
9468 | Chain, |
9469 | VData, // vdata |
9470 | Op.getOperand(3), // rsrc |
9471 | Op.getOperand(4), // vindex |
9472 | Op.getOperand(5), // voffset |
9473 | Op.getOperand(6), // soffset |
9474 | Op.getOperand(7), // offset |
9475 | DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format |
9476 | DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy |
9477 | DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen |
9478 | }; |
9479 | unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : |
9480 | AMDGPUISD::TBUFFER_STORE_FORMAT; |
9481 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
9482 | return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, |
9483 | M->getMemoryVT(), M->getMemOperand()); |
9484 | } |
9485 | |
9486 | case Intrinsic::amdgcn_struct_tbuffer_store: |
9487 | case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { |
9488 | SDValue VData = Op.getOperand(i: 2); |
9489 | bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); |
9490 | if (IsD16) |
9491 | VData = handleD16VData(VData, DAG); |
9492 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG); |
9493 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG); |
9494 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget); |
9495 | SDValue Ops[] = { |
9496 | Chain, |
9497 | VData, // vdata |
9498 | Rsrc, // rsrc |
9499 | Op.getOperand(4), // vindex |
9500 | Offsets.first, // voffset |
9501 | SOffset, // soffset |
9502 | Offsets.second, // offset |
9503 | Op.getOperand(7), // format |
9504 | Op.getOperand(8), // cachepolicy, swizzled buffer |
9505 | DAG.getTargetConstant(1, DL, MVT::i1), // idxen |
9506 | }; |
9507 | unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : |
9508 | AMDGPUISD::TBUFFER_STORE_FORMAT; |
9509 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
9510 | return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, |
9511 | M->getMemoryVT(), M->getMemOperand()); |
9512 | } |
9513 | |
9514 | case Intrinsic::amdgcn_raw_tbuffer_store: |
9515 | case Intrinsic::amdgcn_raw_ptr_tbuffer_store: { |
9516 | SDValue VData = Op.getOperand(i: 2); |
9517 | bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); |
9518 | if (IsD16) |
9519 | VData = handleD16VData(VData, DAG); |
9520 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG); |
9521 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG); |
9522 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget); |
9523 | SDValue Ops[] = { |
9524 | Chain, |
9525 | VData, // vdata |
9526 | Rsrc, // rsrc |
9527 | DAG.getConstant(0, DL, MVT::i32), // vindex |
9528 | Offsets.first, // voffset |
9529 | SOffset, // soffset |
9530 | Offsets.second, // offset |
9531 | Op.getOperand(6), // format |
9532 | Op.getOperand(7), // cachepolicy, swizzled buffer |
9533 | DAG.getTargetConstant(0, DL, MVT::i1), // idxen |
9534 | }; |
9535 | unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : |
9536 | AMDGPUISD::TBUFFER_STORE_FORMAT; |
9537 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
9538 | return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, |
9539 | M->getMemoryVT(), M->getMemOperand()); |
9540 | } |
9541 | |
9542 | case Intrinsic::amdgcn_buffer_store: |
9543 | case Intrinsic::amdgcn_buffer_store_format: { |
9544 | SDValue VData = Op.getOperand(i: 2); |
9545 | bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); |
9546 | if (IsD16) |
9547 | VData = handleD16VData(VData, DAG); |
9548 | unsigned Glc = Op.getConstantOperandVal(i: 6); |
9549 | unsigned Slc = Op.getConstantOperandVal(i: 7); |
9550 | unsigned IdxEn = getIdxEn(VIndex: Op.getOperand(i: 4)); |
9551 | SDValue Ops[] = { |
9552 | Chain, |
9553 | VData, |
9554 | Op.getOperand(3), // rsrc |
9555 | Op.getOperand(4), // vindex |
9556 | SDValue(), // voffset -- will be set by setBufferOffsets |
9557 | SDValue(), // soffset -- will be set by setBufferOffsets |
9558 | SDValue(), // offset -- will be set by setBufferOffsets |
9559 | DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy |
9560 | DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen |
9561 | }; |
9562 | setBufferOffsets(CombinedOffset: Op.getOperand(i: 5), DAG, Offsets: &Ops[4]); |
9563 | |
9564 | unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? |
9565 | AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; |
9566 | Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; |
9567 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
9568 | |
9569 | // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics |
9570 | EVT VDataType = VData.getValueType().getScalarType(); |
9571 | if (VDataType == MVT::i8 || VDataType == MVT::i16) |
9572 | return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); |
9573 | |
9574 | return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, |
9575 | M->getMemoryVT(), M->getMemOperand()); |
9576 | } |
9577 | |
9578 | case Intrinsic::amdgcn_raw_buffer_store: |
9579 | case Intrinsic::amdgcn_raw_ptr_buffer_store: |
9580 | case Intrinsic::amdgcn_raw_buffer_store_format: |
9581 | case Intrinsic::amdgcn_raw_ptr_buffer_store_format: { |
9582 | const bool IsFormat = |
9583 | IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format || |
9584 | IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format; |
9585 | |
9586 | SDValue VData = Op.getOperand(i: 2); |
9587 | EVT VDataVT = VData.getValueType(); |
9588 | EVT EltType = VDataVT.getScalarType(); |
9589 | bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); |
9590 | if (IsD16) { |
9591 | VData = handleD16VData(VData, DAG); |
9592 | VDataVT = VData.getValueType(); |
9593 | } |
9594 | |
9595 | if (!isTypeLegal(VT: VDataVT)) { |
9596 | VData = |
9597 | DAG.getNode(Opcode: ISD::BITCAST, DL, |
9598 | VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData); |
9599 | } |
9600 | |
9601 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG); |
9602 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG); |
9603 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget); |
9604 | SDValue Ops[] = { |
9605 | Chain, |
9606 | VData, |
9607 | Rsrc, |
9608 | DAG.getConstant(0, DL, MVT::i32), // vindex |
9609 | Offsets.first, // voffset |
9610 | SOffset, // soffset |
9611 | Offsets.second, // offset |
9612 | Op.getOperand(6), // cachepolicy, swizzled buffer |
9613 | DAG.getTargetConstant(0, DL, MVT::i1), // idxen |
9614 | }; |
9615 | unsigned Opc = |
9616 | IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE; |
9617 | Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; |
9618 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
9619 | |
9620 | // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics |
9621 | if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) |
9622 | return handleByteShortBufferStores(DAG, VDataType: VDataVT, DL, Ops, M); |
9623 | |
9624 | return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, |
9625 | M->getMemoryVT(), M->getMemOperand()); |
9626 | } |
9627 | |
9628 | case Intrinsic::amdgcn_struct_buffer_store: |
9629 | case Intrinsic::amdgcn_struct_ptr_buffer_store: |
9630 | case Intrinsic::amdgcn_struct_buffer_store_format: |
9631 | case Intrinsic::amdgcn_struct_ptr_buffer_store_format: { |
9632 | const bool IsFormat = |
9633 | IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format || |
9634 | IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format; |
9635 | |
9636 | SDValue VData = Op.getOperand(i: 2); |
9637 | EVT VDataVT = VData.getValueType(); |
9638 | EVT EltType = VDataVT.getScalarType(); |
9639 | bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); |
9640 | |
9641 | if (IsD16) { |
9642 | VData = handleD16VData(VData, DAG); |
9643 | VDataVT = VData.getValueType(); |
9644 | } |
9645 | |
9646 | if (!isTypeLegal(VT: VDataVT)) { |
9647 | VData = |
9648 | DAG.getNode(Opcode: ISD::BITCAST, DL, |
9649 | VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData); |
9650 | } |
9651 | |
9652 | auto Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG); |
9653 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG); |
9654 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget); |
9655 | SDValue Ops[] = { |
9656 | Chain, |
9657 | VData, |
9658 | Rsrc, |
9659 | Op.getOperand(4), // vindex |
9660 | Offsets.first, // voffset |
9661 | SOffset, // soffset |
9662 | Offsets.second, // offset |
9663 | Op.getOperand(7), // cachepolicy, swizzled buffer |
9664 | DAG.getTargetConstant(1, DL, MVT::i1), // idxen |
9665 | }; |
9666 | unsigned Opc = |
9667 | !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; |
9668 | Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; |
9669 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
9670 | |
9671 | // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics |
9672 | EVT VDataType = VData.getValueType().getScalarType(); |
9673 | if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) |
9674 | return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); |
9675 | |
9676 | return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, |
9677 | M->getMemoryVT(), M->getMemOperand()); |
9678 | } |
9679 | case Intrinsic::amdgcn_raw_buffer_load_lds: |
9680 | case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: |
9681 | case Intrinsic::amdgcn_struct_buffer_load_lds: |
9682 | case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { |
9683 | assert(!AMDGPU::isGFX12Plus(*Subtarget)); |
9684 | unsigned Opc; |
9685 | bool HasVIndex = |
9686 | IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds || |
9687 | IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds; |
9688 | unsigned OpOffset = HasVIndex ? 1 : 0; |
9689 | SDValue VOffset = Op.getOperand(i: 5 + OpOffset); |
9690 | bool HasVOffset = !isNullConstant(V: VOffset); |
9691 | unsigned Size = Op->getConstantOperandVal(Num: 4); |
9692 | |
9693 | switch (Size) { |
9694 | default: |
9695 | return SDValue(); |
9696 | case 1: |
9697 | Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN |
9698 | : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN |
9699 | : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN |
9700 | : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET; |
9701 | break; |
9702 | case 2: |
9703 | Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN |
9704 | : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN |
9705 | : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN |
9706 | : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET; |
9707 | break; |
9708 | case 4: |
9709 | Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN |
9710 | : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN |
9711 | : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN |
9712 | : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; |
9713 | break; |
9714 | } |
9715 | |
9716 | SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3)); |
9717 | |
9718 | SmallVector<SDValue, 8> Ops; |
9719 | |
9720 | if (HasVIndex && HasVOffset) |
9721 | Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL, |
9722 | { Op.getOperand(5), // VIndex |
9723 | VOffset })); |
9724 | else if (HasVIndex) |
9725 | Ops.push_back(Elt: Op.getOperand(i: 5)); |
9726 | else if (HasVOffset) |
9727 | Ops.push_back(Elt: VOffset); |
9728 | |
9729 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG); |
9730 | Ops.push_back(Elt: Rsrc); |
9731 | Ops.push_back(Elt: Op.getOperand(i: 6 + OpOffset)); // soffset |
9732 | Ops.push_back(Elt: Op.getOperand(i: 7 + OpOffset)); // imm offset |
9733 | unsigned Aux = Op.getConstantOperandVal(i: 8 + OpOffset); |
9734 | Ops.push_back( |
9735 | DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol |
9736 | Ops.push_back(DAG.getTargetConstant( |
9737 | Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz |
9738 | Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain |
9739 | Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue |
9740 | |
9741 | auto *M = cast<MemSDNode>(Val&: Op); |
9742 | MachineMemOperand *LoadMMO = M->getMemOperand(); |
9743 | // Don't set the offset value here because the pointer points to the base of |
9744 | // the buffer. |
9745 | MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); |
9746 | |
9747 | MachinePointerInfo StorePtrI = LoadPtrI; |
9748 | LoadPtrI.V = PoisonValue::get( |
9749 | T: PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::GLOBAL_ADDRESS)); |
9750 | LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; |
9751 | StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; |
9752 | |
9753 | auto F = LoadMMO->getFlags() & |
9754 | ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); |
9755 | LoadMMO = |
9756 | MF.getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, Size, |
9757 | BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo()); |
9758 | |
9759 | MachineMemOperand *StoreMMO = MF.getMachineMemOperand( |
9760 | PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, Size: sizeof(int32_t), |
9761 | BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo()); |
9762 | |
9763 | auto Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: M->getVTList(), Ops); |
9764 | DAG.setNodeMemRefs(N: Load, NewMemRefs: {LoadMMO, StoreMMO}); |
9765 | |
9766 | return SDValue(Load, 0); |
9767 | } |
9768 | case Intrinsic::amdgcn_global_load_lds: { |
9769 | unsigned Opc; |
9770 | unsigned Size = Op->getConstantOperandVal(Num: 4); |
9771 | switch (Size) { |
9772 | default: |
9773 | return SDValue(); |
9774 | case 1: |
9775 | Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE; |
9776 | break; |
9777 | case 2: |
9778 | Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT; |
9779 | break; |
9780 | case 4: |
9781 | Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; |
9782 | break; |
9783 | } |
9784 | |
9785 | auto *M = cast<MemSDNode>(Val&: Op); |
9786 | SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3)); |
9787 | |
9788 | SmallVector<SDValue, 6> Ops; |
9789 | |
9790 | SDValue Addr = Op.getOperand(i: 2); // Global ptr |
9791 | SDValue VOffset; |
9792 | // Try to split SAddr and VOffset. Global and LDS pointers share the same |
9793 | // immediate offset, so we cannot use a regular SelectGlobalSAddr(). |
9794 | if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) { |
9795 | SDValue LHS = Addr.getOperand(i: 0); |
9796 | SDValue RHS = Addr.getOperand(i: 1); |
9797 | |
9798 | if (LHS->isDivergent()) |
9799 | std::swap(a&: LHS, b&: RHS); |
9800 | |
9801 | if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND && |
9802 | RHS.getOperand(0).getValueType() == MVT::i32) { |
9803 | // add (i64 sgpr), (zero_extend (i32 vgpr)) |
9804 | Addr = LHS; |
9805 | VOffset = RHS.getOperand(i: 0); |
9806 | } |
9807 | } |
9808 | |
9809 | Ops.push_back(Elt: Addr); |
9810 | if (!Addr->isDivergent()) { |
9811 | Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc); |
9812 | if (!VOffset) |
9813 | VOffset = SDValue( |
9814 | DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, |
9815 | DAG.getTargetConstant(0, DL, MVT::i32)), 0); |
9816 | Ops.push_back(Elt: VOffset); |
9817 | } |
9818 | |
9819 | Ops.push_back(Elt: Op.getOperand(i: 5)); // Offset |
9820 | Ops.push_back(Elt: Op.getOperand(i: 6)); // CPol |
9821 | Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain |
9822 | Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue |
9823 | |
9824 | MachineMemOperand *LoadMMO = M->getMemOperand(); |
9825 | MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); |
9826 | LoadPtrI.Offset = Op->getConstantOperandVal(Num: 5); |
9827 | MachinePointerInfo StorePtrI = LoadPtrI; |
9828 | LoadPtrI.V = PoisonValue::get( |
9829 | T: PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::GLOBAL_ADDRESS)); |
9830 | LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; |
9831 | StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; |
9832 | auto F = LoadMMO->getFlags() & |
9833 | ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); |
9834 | LoadMMO = |
9835 | MF.getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, Size, |
9836 | BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo()); |
9837 | MachineMemOperand *StoreMMO = MF.getMachineMemOperand( |
9838 | PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, Size: sizeof(int32_t), BaseAlignment: Align(4), |
9839 | AAInfo: LoadMMO->getAAInfo()); |
9840 | |
9841 | auto Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops); |
9842 | DAG.setNodeMemRefs(N: Load, NewMemRefs: {LoadMMO, StoreMMO}); |
9843 | |
9844 | return SDValue(Load, 0); |
9845 | } |
9846 | case Intrinsic::amdgcn_end_cf: |
9847 | return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, |
9848 | Op->getOperand(2), Chain), 0); |
9849 | case Intrinsic::amdgcn_s_barrier_init: |
9850 | case Intrinsic::amdgcn_s_barrier_join: |
9851 | case Intrinsic::amdgcn_s_wakeup_barrier: { |
9852 | SDValue Chain = Op->getOperand(Num: 0); |
9853 | SmallVector<SDValue, 2> Ops; |
9854 | SDValue BarOp = Op->getOperand(Num: 2); |
9855 | unsigned Opc; |
9856 | bool IsInlinableBarID = false; |
9857 | int64_t BarVal; |
9858 | |
9859 | if (isa<ConstantSDNode>(Val: BarOp)) { |
9860 | BarVal = cast<ConstantSDNode>(Val&: BarOp)->getSExtValue(); |
9861 | IsInlinableBarID = AMDGPU::isInlinableIntLiteral(Literal: BarVal); |
9862 | } |
9863 | |
9864 | if (IsInlinableBarID) { |
9865 | switch (IntrinsicID) { |
9866 | default: |
9867 | return SDValue(); |
9868 | case Intrinsic::amdgcn_s_barrier_init: |
9869 | Opc = AMDGPU::S_BARRIER_INIT_IMM; |
9870 | break; |
9871 | case Intrinsic::amdgcn_s_barrier_join: |
9872 | Opc = AMDGPU::S_BARRIER_JOIN_IMM; |
9873 | break; |
9874 | case Intrinsic::amdgcn_s_wakeup_barrier: |
9875 | Opc = AMDGPU::S_WAKEUP_BARRIER_IMM; |
9876 | break; |
9877 | } |
9878 | |
9879 | SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32); |
9880 | Ops.push_back(Elt: K); |
9881 | } else { |
9882 | switch (IntrinsicID) { |
9883 | default: |
9884 | return SDValue(); |
9885 | case Intrinsic::amdgcn_s_barrier_init: |
9886 | Opc = AMDGPU::S_BARRIER_INIT_M0; |
9887 | break; |
9888 | case Intrinsic::amdgcn_s_barrier_join: |
9889 | Opc = AMDGPU::S_BARRIER_JOIN_M0; |
9890 | break; |
9891 | case Intrinsic::amdgcn_s_wakeup_barrier: |
9892 | Opc = AMDGPU::S_WAKEUP_BARRIER_M0; |
9893 | break; |
9894 | } |
9895 | } |
9896 | |
9897 | if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) { |
9898 | SDValue M0Val; |
9899 | // Member count will be read from M0[16:22] |
9900 | M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3), |
9901 | DAG.getShiftAmountConstant(16, MVT::i32, DL)); |
9902 | |
9903 | if (!IsInlinableBarID) { |
9904 | // If reference to barrier id is not an inline constant then it must be |
9905 | // referenced with M0[4:0]. Perform an OR with the member count to |
9906 | // include it in M0. |
9907 | M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, |
9908 | Op.getOperand(2), M0Val), |
9909 | 0); |
9910 | } |
9911 | Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0)); |
9912 | } else if (!IsInlinableBarID) { |
9913 | Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: BarOp).getValue(R: 0)); |
9914 | } |
9915 | |
9916 | auto NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops); |
9917 | return SDValue(NewMI, 0); |
9918 | } |
9919 | default: { |
9920 | if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = |
9921 | AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID)) |
9922 | return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true); |
9923 | |
9924 | return Op; |
9925 | } |
9926 | } |
9927 | } |
9928 | |
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
// Returns {voffset, immoffset}; the immoffset half is always a
// TargetConstant, as required by instruction selection.
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
    SDValue Offset, SelectionDAG &DAG) const {
  SDLoc DL(Offset);
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
  SDValue N0 = Offset;
  ConstantSDNode *C1 = nullptr;

  // Peel a constant part (C1) off the offset, leaving any variable part in
  // N0. After this, either part may be absent (null).
  if ((C1 = dyn_cast<ConstantSDNode>(Val&: N0)))
    N0 = SDValue();
  else if (DAG.isBaseWithConstantOffset(Op: N0)) {
    C1 = cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
    N0 = N0.getOperand(i: 0);
  }

  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, put only bits
    // that would normally fit in the immoffset field. The remaining value that
    // is copied/added for the voffset field is a large power of 2, and it
    // stands more chance of being CSEd with the copy/add for another similar
    // load/store.
    // However, do not do that rounding down if that is a negative
    // number, as it appears to be illegal to have a negative offset in the
    // vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
    if (Overflow) {
      // Fold the part that did not fit back into the variable half, adding an
      // ADD node only if a variable part already exists.
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      if (!N0)
        N0 = OverflowVal;
      else {
        SDValue Ops[] = { N0, OverflowVal };
        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
      }
    }
  }
  // Materialize an explicit zero for whichever half ended up absent.
  if (!N0)
    N0 = DAG.getConstant(0, DL, MVT::i32);
  if (!C1)
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
  return {N0, SDValue(C1, 0)};
}
9982 | |
9983 | // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the |
9984 | // three offsets (voffset, soffset and instoffset) into the SDValue[3] array |
9985 | // pointed to by Offsets. |
9986 | void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, |
9987 | SelectionDAG &DAG, SDValue *Offsets, |
9988 | Align Alignment) const { |
9989 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
9990 | SDLoc DL(CombinedOffset); |
9991 | if (auto *C = dyn_cast<ConstantSDNode>(Val&: CombinedOffset)) { |
9992 | uint32_t Imm = C->getZExtValue(); |
9993 | uint32_t SOffset, ImmOffset; |
9994 | if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) { |
9995 | Offsets[0] = DAG.getConstant(0, DL, MVT::i32); |
9996 | Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); |
9997 | Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); |
9998 | return; |
9999 | } |
10000 | } |
10001 | if (DAG.isBaseWithConstantOffset(Op: CombinedOffset)) { |
10002 | SDValue N0 = CombinedOffset.getOperand(i: 0); |
10003 | SDValue N1 = CombinedOffset.getOperand(i: 1); |
10004 | uint32_t SOffset, ImmOffset; |
10005 | int Offset = cast<ConstantSDNode>(Val&: N1)->getSExtValue(); |
10006 | if (Offset >= 0 && |
10007 | TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) { |
10008 | Offsets[0] = N0; |
10009 | Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); |
10010 | Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); |
10011 | return; |
10012 | } |
10013 | } |
10014 | |
10015 | SDValue SOffsetZero = Subtarget->hasRestrictedSOffset() |
10016 | ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32) |
10017 | : DAG.getConstant(0, DL, MVT::i32); |
10018 | |
10019 | Offsets[0] = CombinedOffset; |
10020 | Offsets[1] = SOffsetZero; |
10021 | Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32); |
10022 | } |
10023 | |
10024 | SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer, |
10025 | SelectionDAG &DAG) const { |
10026 | if (!MaybePointer.getValueType().isScalarInteger()) |
10027 | return MaybePointer; |
10028 | |
10029 | SDLoc DL(MaybePointer); |
10030 | |
10031 | SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer); |
10032 | return Rsrc; |
10033 | } |
10034 | |
10035 | // Wrap a global or flat pointer into a buffer intrinsic using the flags |
10036 | // specified in the intrinsic. |
10037 | SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op, |
10038 | SelectionDAG &DAG) const { |
10039 | SDLoc Loc(Op); |
10040 | |
10041 | SDValue Pointer = Op->getOperand(Num: 1); |
10042 | SDValue Stride = Op->getOperand(Num: 2); |
10043 | SDValue NumRecords = Op->getOperand(Num: 3); |
10044 | SDValue Flags = Op->getOperand(Num: 4); |
10045 | |
10046 | auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32); |
10047 | SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32); |
10048 | SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask); |
10049 | std::optional<uint32_t> ConstStride = std::nullopt; |
10050 | if (auto *ConstNode = dyn_cast<ConstantSDNode>(Val&: Stride)) |
10051 | ConstStride = ConstNode->getZExtValue(); |
10052 | |
10053 | SDValue NewHighHalf = Masked; |
10054 | if (!ConstStride || *ConstStride != 0) { |
10055 | SDValue ShiftedStride; |
10056 | if (ConstStride) { |
10057 | ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32); |
10058 | } else { |
10059 | SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32); |
10060 | ShiftedStride = |
10061 | DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride, |
10062 | DAG.getShiftAmountConstant(16, MVT::i32, Loc)); |
10063 | } |
10064 | NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride); |
10065 | } |
10066 | |
10067 | SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, |
10068 | NewHighHalf, NumRecords, Flags); |
10069 | SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc); |
10070 | return RsrcPtr; |
10071 | } |
10072 | |
10073 | // Handle 8 bit and 16 bit buffer loads |
10074 | SDValue |
10075 | SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, |
10076 | SDLoc DL, ArrayRef<SDValue> Ops, |
10077 | MachineMemOperand *MMO) const { |
10078 | EVT IntVT = LoadVT.changeTypeToInteger(); |
10079 | unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ? |
10080 | AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT; |
10081 | |
10082 | SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other); |
10083 | SDValue BufferLoad = |
10084 | DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: IntVT, MMO); |
10085 | SDValue LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: BufferLoad); |
10086 | LoadVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: LoadVal); |
10087 | |
10088 | return DAG.getMergeValues(Ops: {LoadVal, BufferLoad.getValue(R: 1)}, dl: DL); |
10089 | } |
10090 | |
10091 | // Handle 8 bit and 16 bit buffer stores |
10092 | SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG, |
10093 | EVT VDataType, SDLoc DL, |
10094 | SDValue Ops[], |
10095 | MemSDNode *M) const { |
10096 | if (VDataType == MVT::f16) |
10097 | Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]); |
10098 | |
10099 | SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]); |
10100 | Ops[1] = BufferStoreExt; |
10101 | unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE : |
10102 | AMDGPUISD::BUFFER_STORE_SHORT; |
10103 | ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9); |
10104 | return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: M->getVTList(), Ops: OpsRef, MemVT: VDataType, |
10105 | MMO: M->getMemOperand()); |
10106 | } |
10107 | |
10108 | static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, |
10109 | ISD::LoadExtType ExtType, SDValue Op, |
10110 | const SDLoc &SL, EVT VT) { |
10111 | if (VT.bitsLT(VT: Op.getValueType())) |
10112 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Op); |
10113 | |
10114 | switch (ExtType) { |
10115 | case ISD::SEXTLOAD: |
10116 | return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SL, VT, Operand: Op); |
10117 | case ISD::ZEXTLOAD: |
10118 | return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT, Operand: Op); |
10119 | case ISD::EXTLOAD: |
10120 | return DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT, Operand: Op); |
10121 | case ISD::NON_EXTLOAD: |
10122 | return Op; |
10123 | } |
10124 | |
10125 | llvm_unreachable("invalid ext type" ); |
10126 | } |
10127 | |
// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  // The widened dword load needs dword alignment, and only uniform addresses
  // can go to SMEM in the first place.
  if (Ld->getAlign() < Align(4) || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // pre-legalize.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload" );

  // TODO: Drop only high part of range.
  // Reissue the access as a plain i32 load at the same address; range
  // metadata no longer applies to the widened value, so it is dropped.
  SDValue Ptr = Ld->getBasePtr();
  SDValue NewLoad = DAG.getLoad(
      ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
      Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
      Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
      nullptr); // Drop ranges

  // TruncVT is the integer type holding exactly the originally-loaded bits;
  // fp memory types are handled through their integer equivalent.
  EVT TruncVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
           "unexpected fp extload" );
    TruncVT = MemVT.changeTypeToInteger();
  }

  // Recreate the original extension semantics on the widened i32 value:
  // sign-extend-in-reg for SEXTLOAD, mask the high bits for ZEXTLOAD and
  // non-extending loads; EXTLOAD may leave the high bits as loaded.
  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
                      DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
    Cvt = DAG.getZeroExtendInReg(Op: NewLoad, DL: SL, VT: TruncVT);
  } else {
    assert(Ld->getExtensionType() == ISD::EXTLOAD);
  }

  EVT VT = Ld->getValueType(ResNo: 0);
  EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VT.getSizeInBits());

  DCI.AddToWorklist(N: Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, ExtType: Ld->getExtensionType(), Op: Cvt, SL, VT: IntVT);
  DCI.AddToWorklist(N: Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Cvt);

  // Return the converted value plus the new load's chain so users of both
  // results of the original load are rewritten.
  return DAG.getMergeValues(Ops: { Cvt, NewLoad.getValue(R: 1) }, dl: SL);
}
10196 | |
10197 | static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, |
10198 | const SIMachineFunctionInfo &Info) { |
10199 | // TODO: Should check if the address can definitely not access stack. |
10200 | if (Info.isEntryFunction()) |
10201 | return Info.getUserSGPRInfo().hasFlatScratchInit(); |
10202 | return true; |
10203 | } |
10204 | |
// Custom lowering for loads the SI instruction set cannot select directly.
// Sub-dword non-extending loads are widened to 32-bit extloads, and vector
// loads are widened or split according to the (possibly remapped) address
// space and subtarget limits. Returns SDValue() when the load is already
// legal as-is.
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();

  // Widen sub-32-bit non-extending loads (i1/i8/i16 and small i1 vectors) to
  // a 32-bit extload, then narrow the result back down.
  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate down to the original memory
    // type.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();
    MachineMemOperand *MMO = Load->getMemOperand();

    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                   BasePtr, RealMemVT, MMO);

    if (!MemVT.isVector()) {
      SDValue Ops[] = {
        DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: NewLD),
        NewLD.getValue(R: 1)
      };

      return DAG.getMergeValues(Ops, dl: DL);
    }

    // Vector of i1: extract each bit of the widened 32-bit load with a shift
    // and truncate, then rebuild the vector.
    SmallVector<SDValue, 3> Elts;
    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
      SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
                                DAG.getConstant(I, DL, MVT::i32));

      Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
    }

    SDValue Ops[] = {
      DAG.getBuildVector(VT: MemVT, DL, Ops: Elts),
      NewLD.getValue(R: 1)
    };

    return DAG.getMergeValues(Ops, dl: DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented." );

  Align Alignment = Load->getAlign();
  unsigned AS = Load->getAddressSpace();
  // Split misaligned multi-dword flat loads on subtargets with the LDS
  // misaligned-access bug, since the flat address may resolve to LDS.
  if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
      Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
    return SplitVectorLoad(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(MMO: Load->getMemOperand(), Info: *MFI) ?
         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = MemVT.getVectorNumElements();

  // Uniform, sufficiently aligned constant-address loads can be selected as
  // scalar (SMEM) loads; widen to a power-of-2 element count if needed.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
      if (MemVT.isPow2VectorType() ||
          (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
        return SDValue();
      return WidenOrSplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
    //
  }

  // Uniform global loads may also be scalarized when the subtarget allows it
  // and the memory operand is provably not clobbered.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS) {
    if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
        Load->isSimple() && isMemOpHasNoClobberedMemOperand(N: Load) &&
        Alignment >= Align(4) && NumElements < 32) {
      if (MemVT.isPow2VectorType() ||
          (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
        return SDValue();
      return WidenOrSplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
    //
  }
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS ||
      AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v3 loads not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return WidenOrSplitVectorLoad(Op, DAG);

    // v3 and v4 loads are supported for private and global memory.
    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4: {
      // Only dword accesses allowed: scalarize the vector load entirely.
      SDValue Ops[2];
      std::tie(args&: Ops[0], args&: Ops[1]) = scalarizeVectorLoad(LD: Load, DAG);
      return DAG.getMergeValues(Ops, dl: DL);
    }
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      // v3 loads not supported on SI.
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        return WidenOrSplitVectorLoad(Op, DAG);

      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size" );
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS/GDS: keep the access whole only if a misaligned access of this size
    // is actually fast on this subtarget; otherwise split.
    unsigned Fast = 0;
    auto Flags = Load->getMemOperand()->getFlags();
    if (allowsMisalignedMemoryAccessesImpl(Size: MemVT.getSizeInBits(), AddrSpace: AS,
                                           Alignment: Load->getAlign(), Flags, IsFast: &Fast) &&
        Fast > 1)
      return SDValue();

    if (MemVT.isVector())
      return SplitVectorLoad(Op, DAG);
  }

  // Fall back to the generic unaligned-load expansion when the target cannot
  // perform this access at its alignment.
  if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
                                      VT: MemVT, MMO: *Load->getMemOperand())) {
    SDValue Ops[2];
    std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: Load, DAG);
    return DAG.getMergeValues(Ops, dl: DL);
  }

  return SDValue();
}
10366 | |
10367 | SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { |
10368 | EVT VT = Op.getValueType(); |
10369 | if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 || |
10370 | VT.getSizeInBits() == 512) |
10371 | return splitTernaryVectorOp(Op, DAG); |
10372 | |
10373 | assert(VT.getSizeInBits() == 64); |
10374 | |
10375 | SDLoc DL(Op); |
10376 | SDValue Cond = Op.getOperand(i: 0); |
10377 | |
10378 | SDValue Zero = DAG.getConstant(0, DL, MVT::i32); |
10379 | SDValue One = DAG.getConstant(1, DL, MVT::i32); |
10380 | |
10381 | SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); |
10382 | SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); |
10383 | |
10384 | SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); |
10385 | SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); |
10386 | |
10387 | SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); |
10388 | |
10389 | SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); |
10390 | SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); |
10391 | |
10392 | SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); |
10393 | |
10394 | SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); |
10395 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Res); |
10396 | } |
10397 | |
// Catch division cases where we can use shortcuts with rcp and rsq
// instructions instead of the full-precision division expansion. Returns
// SDValue() when no shortcut is permitted by the fast-math flags and type,
// so the caller falls through to the accurate lowering.
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

  // Approximate lowering is allowed with afn on the node or global
  // unsafe-fp-math.
  bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
                            DAG.getTarget().Options.UnsafeFPMath;

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
    // Without !fpmath accuracy information, we can't do more because we don't
    // know exactly whether rcp is accurate enough to meet !fpmath requirement.
    // f16 is always accurate enough
    if (!AllowInaccurateRcp && VT != MVT::f16)
      return SDValue();

    if (CLHS->isExactlyValue(V: 1.0)) {
      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
      // the CI documentation has a worst case error of 1 ulp.
      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
      // use it as long as we aren't trying to use denormals.
      //
      // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.

      // 1.0 / sqrt(x) -> rsq(x)

      // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
      // error seems really high at 2^29 ULP.
      // 1.0 / x -> rcp(x)
      return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
    }

    // Same as for 1.0, but expand the sign out of the constant.
    if (CLHS->isExactlyValue(V: -1.0)) {
      // -1.0 / x -> rcp (fneg x)
      SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
      return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: FNegRHS);
    }
  }

  // For f16 require afn or arcp.
  // For f32 require afn.
  if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
    return SDValue();

  // Turn into multiply by the reciprocal.
  // x / y -> x * (1.0 / y)
  SDValue Recip = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
  return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LHS, N2: Recip, Flags);
}
10452 | |
// Fast f64 division: approximate 1/Y with v_rcp and refine it with two
// Newton-Raphson iterations (fma-based), then one more refinement step on the
// final quotient X * R. Only used when approximate lowering is permitted;
// returns SDValue() otherwise so the accurate expansion is used.
SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue X = Op.getOperand(i: 0);
  SDValue Y = Op.getOperand(i: 1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

  bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
                            DAG.getTarget().Options.UnsafeFPMath;
  if (!AllowInaccurateDiv)
    return SDValue();

  SDValue NegY = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Y);
  SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);

  // R ~= 1/Y; Tmp0 = 1 - Y*R is the error of the approximation.
  SDValue R = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: Y);
  SDValue Tmp0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);

  // Two Newton-Raphson iterations: R = R + R*(1 - Y*R).
  R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp0, N2: R, N3: R);
  SDValue Tmp1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
  R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp1, N2: R, N3: R);
  // Quotient estimate plus one residual correction: Ret + R*(X - Y*Ret).
  SDValue Ret = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: R);
  SDValue Tmp2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: Ret, N3: X);
  return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp2, N2: R, N3: Ret);
}
10479 | |
10480 | static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, |
10481 | EVT VT, SDValue A, SDValue B, SDValue GlueChain, |
10482 | SDNodeFlags Flags) { |
10483 | if (GlueChain->getNumValues() <= 1) { |
10484 | return DAG.getNode(Opcode, DL: SL, VT, N1: A, N2: B, Flags); |
10485 | } |
10486 | |
10487 | assert(GlueChain->getNumValues() == 3); |
10488 | |
10489 | SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); |
10490 | switch (Opcode) { |
10491 | default: llvm_unreachable("no chain equivalent for opcode" ); |
10492 | case ISD::FMUL: |
10493 | Opcode = AMDGPUISD::FMUL_W_CHAIN; |
10494 | break; |
10495 | } |
10496 | |
10497 | return DAG.getNode(Opcode, DL: SL, VTList, |
10498 | Ops: {GlueChain.getValue(R: 1), A, B, GlueChain.getValue(R: 2)}, |
10499 | Flags); |
10500 | } |
10501 | |
10502 | static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, |
10503 | EVT VT, SDValue A, SDValue B, SDValue C, |
10504 | SDValue GlueChain, SDNodeFlags Flags) { |
10505 | if (GlueChain->getNumValues() <= 1) { |
10506 | return DAG.getNode(Opcode, DL: SL, VT, Ops: {A, B, C}, Flags); |
10507 | } |
10508 | |
10509 | assert(GlueChain->getNumValues() == 3); |
10510 | |
10511 | SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); |
10512 | switch (Opcode) { |
10513 | default: llvm_unreachable("no chain equivalent for opcode" ); |
10514 | case ISD::FMA: |
10515 | Opcode = AMDGPUISD::FMA_W_CHAIN; |
10516 | break; |
10517 | } |
10518 | |
10519 | return DAG.getNode(Opcode, DL: SL, VTList, |
10520 | Ops: {GlueChain.getValue(R: 1), A, B, C, GlueChain.getValue(R: 2)}, |
10521 | Flags); |
10522 | } |
10523 | |
// Lower f16 fdiv by promoting to f32: compute the quotient with v_rcp_f32 and
// a multiply, round back to f16, then apply div_fixup to patch special cases
// (infinities, NaNs, zero denominators). Fast approximate paths are tried
// first.
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue Src0 = Op.getOperand(i: 0);
  SDValue Src1 = Op.getOperand(i: 1);

  // Promote both operands to f32 and form Src0 * (1/Src1).
  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);

  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);

  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
  SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);

  // div_fixup takes (quotient, denominator, numerator).
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
}
10543 | |
// Faster 2.5 ULP division that does not support denormals. Implements the
// amdgcn.fdiv.fast pattern: when |RHS| is large (> 2^96) the denominator is
// pre-scaled by 2^-32 so that rcp does not overflow/flush, and the quotient
// is rescaled by the same factor afterwards. Note operands are at indices
// 1 and 2 because this lowers an intrinsic node (operand 0 is the ID).
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDNodeFlags Flags = Op->getFlags();
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(i: 1);
  SDValue RHS = Op.getOperand(i: 2);

  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);

  // Threshold above which the denominator must be scaled down.
  const APFloat K0Val(0x1p+96f);
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

  // Scale factor applied to the denominator (and compensated on the result).
  const APFloat K1Val(0x1p-32f);
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  EVT SetCCVT =
    getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  // r2 = |RHS| > 2^96; r3 = scale (2^-32 if scaling needed, else 1.0).
  SDValue r2 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: r1, RHS: K0, Cond: ISD::SETOGT);

  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);

  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);

  // Undo the scaling on the quotient: r3 * (LHS * rcp(RHS * r3)).
  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
}
10577 | |
10578 | // Returns immediate value for setting the F32 denorm mode when using the |
10579 | // S_DENORM_MODE instruction. |
10580 | static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, |
10581 | const SIMachineFunctionInfo *Info, |
10582 | const GCNSubtarget *ST) { |
10583 | assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE" ); |
10584 | uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue(); |
10585 | uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2); |
10586 | return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32); |
10587 | } |
10588 | |
// Full-precision f32 division: div_scale both operands, refine rcp of the
// scaled denominator with fma-based Newton-Raphson steps, then combine with
// div_fmas/div_fixup. When the function does not keep f32 denormals in IEEE
// mode, denormal flushing is temporarily disabled around the refinement
// sequence (restoring a dynamically-read mode if the function's mode is
// dynamic); the mode switches are ordered via glue through the chained
// FMA/FMUL variants.
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  // The selection matcher assumes anything with a chain selecting to a
  // mayRaiseFPException machine instruction. Since we're introducing a chain
  // here, we need to explicitly report nofpexcept for the regular fdiv
  // lowering.
  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue DenominatorScaled = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT,
                                          Ops: {RHS, RHS, LHS}, Flags);
  SDValue NumeratorScaled = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT,
                                        Ops: {LHS, RHS, LHS}, Flags);

  // Denominator is scaled to not be denormal, so using rcp is ok.
  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
                                  DenominatorScaled, Flags);
  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
                                     DenominatorScaled, Flags);

  // Hardware-register field for the FP32 denorm bits of MODE.
  using namespace AMDGPU::Hwreg;
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const DenormalMode DenormMode = Info->getMode().FP32Denormals;

  const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
  const bool HasDynamicDenormals =
      (DenormMode.Input == DenormalMode::Dynamic) ||
      (DenormMode.Output == DenormalMode::Dynamic);

  SDValue SavedDenormMode;

  if (!PreservesDenormals) {
    // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
    // lowering. The chain dependence is insufficient, and we need glue. We do
    // not need the glue variants in a strictfp function.

    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);

    SDValue Glue = DAG.getEntryNode();
    if (HasDynamicDenormals) {
      // Read the current mode so it can be restored afterwards.
      SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
                                          DAG.getVTList(MVT::i32, MVT::Glue),
                                          {BitField, Glue});
      SavedDenormMode = SDValue(GetReg, 0);

      Glue = DAG.getMergeValues(
          Ops: {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, dl: SL);
    }

    // Switch FP32 denorm flushing off, via S_DENORM_MODE when available,
    // otherwise via S_SETREG on the MODE register.
    SDNode *EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, ST: Subtarget);

      EnableDenorm = DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs, N1: Glue,
                                 N2: EnableDenormValue)
                         .getNode();
    } else {
      const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
                                                        SL, MVT::i32);
      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                        {EnableDenormValue, BitField, Glue});
    }

    // Bundle value + chain + glue so the refinement ops below are glued after
    // the mode switch.
    SDValue Ops[3] = {
      NegDivScale0,
      SDValue(EnableDenorm, 0),
      SDValue(EnableDenorm, 1)
    };

    NegDivScale0 = DAG.getMergeValues(Ops, dl: SL);
  }

  // Newton-Raphson refinement of rcp, then quotient estimate and two residual
  // corrections; the getFPTernOp/getFPBinOp helpers pick the glued variants
  // when NegDivScale0 carries chain/glue.
  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
                             ApproxRcp, One, NegDivScale0, Flags);

  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
                             ApproxRcp, Fma0, Flags);

  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
                           Fma1, Fma1, Flags);

  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                             NumeratorScaled, Mul, Flags);

  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
                             Fma2, Fma1, Mul, Fma2, Flags);

  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                             NumeratorScaled, Fma3, Flags);

  if (!PreservesDenormals) {
    // Restore the previous denorm mode (the saved dynamic value, or the
    // function's static flush-in/flush-out setting).
    SDNode *DisableDenorm;
    if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
      const SDValue DisableDenormValue = getSPDenormModeValue(
          FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, ST: Subtarget);

      DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
                                  Fma4.getValue(1), DisableDenormValue,
                                  Fma4.getValue(2)).getNode();
    } else {
      assert(HasDynamicDenormals == (bool)SavedDenormMode);
      const SDValue DisableDenormValue =
          HasDynamicDenormals
              ? SavedDenormMode
              : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);

      DisableDenorm = DAG.getMachineNode(
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
          {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
    }

    // Anchor the restore in the DAG root so it is not dead-code eliminated.
    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                      SDValue(DisableDenorm, 0), DAG.getRoot());
    DAG.setRoot(OutputChain);
  }

  SDValue Scale = NumeratorScaled.getValue(R: 1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
                             {Fma4, Fma1, Fma3, Scale}, Flags);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
}
10726 | |
// Full-precision f64 division: div_scale both operands, refine rcp of the
// scaled denominator with fma-based Newton-Raphson steps, then combine with
// div_fmas/div_fixup. Includes a workaround for SI-class parts where the
// div_scale condition output is unusable.
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue X = Op.getOperand(i: 0);
  SDValue Y = Op.getOperand(i: 1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue DivScale0 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: Y, N2: Y, N3: X);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

  // Newton-Raphson refinement of rcp(scaled Y).
  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);

  SDValue DivScale1 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: X, N2: Y, N3: X);

  // Quotient estimate and residual for the final div_fmas.
  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
                             NegDivScale0, Mul, DivScale1);

  SDValue Scale;

  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

    // Figure out if the scale to use for div_fmas.
    // Recompute the condition by comparing the high dwords: scaling happened
    // iff the scaled value's exponent half differs from the original's.
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getValue(R: 1);
  }

  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
                             Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}
10793 | |
10794 | SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { |
10795 | EVT VT = Op.getValueType(); |
10796 | |
10797 | if (VT == MVT::f32) |
10798 | return LowerFDIV32(Op, DAG); |
10799 | |
10800 | if (VT == MVT::f64) |
10801 | return LowerFDIV64(Op, DAG); |
10802 | |
10803 | if (VT == MVT::f16) |
10804 | return LowerFDIV16(Op, DAG); |
10805 | |
10806 | llvm_unreachable("Unexpected type for fdiv" ); |
10807 | } |
10808 | |
// Lower FFREXP (mantissa + exponent decomposition) to the frexp_mant and
// frexp_exp intrinsics. On subtargets with the fract bug these instructions
// misbehave for inf/nan inputs, so select the identity results (Val, 0) for
// non-finite inputs.
SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Val = Op.getOperand(i: 0);
  EVT VT = Val.getValueType();
  // Result 1 is the exponent; the instruction produces i16 for f16 inputs,
  // i32 otherwise, which may need extension to the node's exponent type.
  EVT ResultExpVT = Op->getValueType(ResNo: 1);
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;

  SDValue Mant = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, VT,
      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);

  SDValue Exp = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);

  if (Subtarget->hasFractBug()) {
    // |Val| < inf <=> Val is finite (SETOLT is also false for NaN).
    SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: dl, VT, Operand: Val);
    SDValue Inf = DAG.getConstantFP(
        Val: APFloat::getInf(Sem: SelectionDAG::EVTToAPFloatSemantics(VT)), DL: dl, VT);

    SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
    SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: InstrExpVT);
    Exp = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: InstrExpVT, N1: IsFinite, N2: Exp, N3: Zero);
    Mant = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: IsFinite, N2: Mant, N3: Val);
  }

  SDValue CastExp = DAG.getSExtOrTrunc(Op: Exp, DL: dl, VT: ResultExpVT);
  return DAG.getMergeValues(Ops: {Mant, CastExp}, dl);
}
10838 | |
// Custom lowering for stores SI cannot select directly. i1 stores become
// 32-bit truncating stores; vector stores are split or scalarized based on
// the (possibly remapped) address space and subtarget limits. Mirrors
// LowerLOAD's legalization rules.
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    // Store i1 as a truncating store of a sign-extended 32-bit value.
    return DAG.getTruncStore(Store->getChain(), DL,
                             DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
                             Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  // Split misaligned multi-dword flat stores on subtargets with the LDS
  // misaligned-access bug, since the flat address may resolve to LDS.
  if (Subtarget->hasLDSMisalignedBug() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
    return SplitVectorStore(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(MMO: Store->getMemOperand(), Info: *MFI) ?
         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = VT.getVectorNumElements();
  if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
      AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorStore(Op, DAG);
    // v3 stores not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return SplitVectorStore(Op, DAG);

    if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
                                        VT, MMO: *Store->getMemOperand()))
      return expandUnalignedStore(ST: Store, DAG);

    return SDValue();
  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Private accesses are limited by the private_element_size setting in the
    // scratch resource descriptor.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(ST: Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->enableFlatScratch()))
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size" );
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS/GDS: keep the access whole only if a misaligned access of this size
    // is actually fast on this subtarget; otherwise split or expand.
    unsigned Fast = 0;
    auto Flags = Store->getMemOperand()->getFlags();
    if (allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace: AS,
                                           Alignment: Store->getAlign(), Flags, IsFast: &Fast) &&
        Fast > 1)
      return SDValue();

    if (VT.isVector())
      return SplitVectorStore(Op, DAG);

    return expandUnalignedStore(ST: Store, DAG);
  }

  // Probably an invalid store. If so we'll end up emitting a selection error.
  return SDValue();
}
10916 | |
// Avoid the full correct expansion for f32 sqrt when promoting from f16.
// Since the f16 value was promoted, the f32 intermediate has plenty of
// precision, so the raw amdgcn.sqrt intrinsic on the extended value followed
// by a round back to f16 is sufficient.
SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  // Only used when f16 is promoted to f32 (no native 16-bit instructions).
  assert(!Subtarget->has16BitInsts());
  SDNodeFlags Flags = Op->getFlags();
  SDValue Ext =
      DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);

  SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
  SDValue Sqrt =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);

  return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
                     DAG.getTargetConstant(0, SL, MVT::i32), Flags);
}
10932 | |
// Lower f32 sqrt: either the 1ulp hardware instruction directly (fast math),
// or an input-scaled, ulp-corrected expansion that handles denormals.
SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  const SDValue X = Op.getOperand(i: 0);

  if (allowApproxFunc(DAG, Flags)) {
    // Instruction is 1ulp but ignores denormals.
    return DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, VT,
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
  }

  // Inputs below 2^-96 are scaled up by 2^32 so the computation stays out of
  // the denormal range; the result is compensated by 2^-16 afterwards
  // (sqrt halves the exponent).
  SDValue ScaleThreshold = DAG.getConstantFP(Val: 0x1.0p-96f, DL, VT);
  SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);

  SDValue ScaleUpFactor = DAG.getConstantFP(Val: 0x1.0p+32f, DL, VT);

  SDValue ScaledX = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: X, N2: ScaleUpFactor, Flags);

  // SqrtX is the (possibly scaled) input actually fed to the algorithm.
  SDValue SqrtX =
      DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledX, N3: X, Flags);

  SDValue SqrtS;
  if (needsDenormHandlingF32(DAG, Src: X, Flags)) {
    // Start from the hardware estimate, then nudge it by at most one ulp:
    // compute the residuals of the two neighboring representable values and
    // select whichever neighbor brackets the exact result correctly.
    SDValue SqrtID =
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
    SqrtS = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: SqrtID, N2: SqrtX, Flags);

    // Next representable value below the estimate: integer decrement of the
    // bit pattern.
    SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
    SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                                           DAG.getConstant(-1, DL, MVT::i32));
    SDValue SqrtSNextDown = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextDownInt);

    SDValue NegSqrtSNextDown =
        DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextDown, Flags);

    // Residual of the lower neighbor: x - nextdown(s) * s.
    SDValue SqrtVP =
        DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextDown, N2: SqrtS, N3: SqrtX, Flags);

    // Next representable value above the estimate: integer increment.
    SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                                         DAG.getConstant(1, DL, MVT::i32));
    SDValue SqrtSNextUp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextUpInt);

    SDValue NegSqrtSNextUp = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextUp, Flags);
    // Residual of the upper neighbor: x - nextup(s) * s.
    SDValue SqrtVS =
        DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextUp, N2: SqrtS, N3: SqrtX, Flags);

    SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
    // If the lower neighbor's residual is <= 0, the estimate was too high.
    SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);

    SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPLE0, N2: SqrtSNextDown, N3: SqrtS,
                        Flags);

    // If the upper neighbor's residual is > 0, the estimate was too low.
    SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
    SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPVSGT0, N2: SqrtSNextUp, N3: SqrtS,
                        Flags);
  } else {
    // Denormals are flushed: refine an rsq estimate with fused multiply-adds
    // (one Newton-Raphson style correction pass on both sqrt and half-rsq).
    SDValue SqrtR = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: SqrtX, Flags);

    // Initial sqrt estimate: x * rsq(x).
    SqrtS = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtX, N2: SqrtR, Flags);

    SDValue Half = DAG.getConstantFP(Val: 0.5f, DL, VT);
    SDValue SqrtH = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtR, N2: Half, Flags);
    SDValue NegSqrtH = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtH, Flags);

    // Error term e = 0.5 - h * s.
    SDValue SqrtE = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtH, N2: SqrtS, N3: Half, Flags);
    SqrtH = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtH, N2: SqrtE, N3: SqrtH, Flags);
    SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtS, N2: SqrtE, N3: SqrtS, Flags);

    // Final correction from the residual d = x - s * s.
    SDValue NegSqrtS = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtS, Flags);
    SDValue SqrtD =
        DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtS, N2: SqrtS, N3: SqrtX, Flags);
    SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtD, N2: SqrtH, N3: SqrtS, Flags);
  }

  // Undo the 2^32 input scaling: compensate the result by 2^-16.
  SDValue ScaleDownFactor = DAG.getConstantFP(Val: 0x1.0p-16f, DL, VT);

  SDValue ScaledDown =
      DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtS, N2: ScaleDownFactor, Flags);

  SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledDown, N3: SqrtS, Flags);
  // sqrt(+/-0) = +/-0 and sqrt(+inf) = +inf: pass the input through for
  // those classes instead of the computed value.
  SDValue IsZeroOrInf =
      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));

  return DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtS, Flags);
}
11021 | |
SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  //   y0 = rsq(x)
  //   g0 = x * y0
  //   h0 = 0.5 * y0
  //
  //   r0 = 0.5 - h0 * g0
  //   g1 = g0 * r0 + g0
  //   h1 = h0 * r0 + h0
  //
  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
  //   h2 = h1 * r1 + h1
  //
  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
  //
  //   sqrt(x) = g3

  SDNodeFlags Flags = Op->getFlags();

  SDLoc DL(Op);

  SDValue X = Op.getOperand(i: 0);
  // Inputs below 2^-767 are scaled up by 2^256 (via ldexp) so the iteration
  // does not lose precision; the result is compensated afterwards.
  SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);

  SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);

  SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);

  // Scale up input if it is too small.
  SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
  SDValue ScaleUp =
      DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
  SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);

  // y0 = rsq(x)
  SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);

  // g0 = x * y0
  SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);

  // h0 = 0.5 * y0
  SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
  SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);

  // r0 = 0.5 - h0 * g0
  SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
  SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);

  // h1 = h0 * r0 + h0
  SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);

  // g1 = g0 * r0 + g0
  SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);

  // d0 = x - g1 * g1
  SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
  SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);

  // g2 = d0 * h1 + g1
  SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);

  // d1 = x - g2 * g2
  SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
  SDValue SqrtD1 =
      DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);

  // g3 = d1 * h1 + g2
  SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);

  // sqrt(x * 2^256) = sqrt(x) * 2^128, so compensate with ldexp by -128.
  SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
  SDValue ScaleDown =
      DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);

  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
  // with finite only or nsz because rsq(+/-0) = +/-inf

  // TODO: Check for DAZ and expand to subnormals
  SDValue IsZeroOrInf =
      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));

  // If x is +INF, +0, or -0, use its original value
  return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
                     Flags);
}
11102 | |
11103 | SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { |
11104 | SDLoc DL(Op); |
11105 | EVT VT = Op.getValueType(); |
11106 | SDValue Arg = Op.getOperand(i: 0); |
11107 | SDValue TrigVal; |
11108 | |
11109 | // Propagate fast-math flags so that the multiply we introduce can be folded |
11110 | // if Arg is already the result of a multiply by constant. |
11111 | auto Flags = Op->getFlags(); |
11112 | |
11113 | SDValue OneOver2Pi = DAG.getConstantFP(Val: 0.5 * numbers::inv_pi, DL, VT); |
11114 | |
11115 | if (Subtarget->hasTrigReducedRange()) { |
11116 | SDValue MulVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags); |
11117 | TrigVal = DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: MulVal, Flags); |
11118 | } else { |
11119 | TrigVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags); |
11120 | } |
11121 | |
11122 | switch (Op.getOpcode()) { |
11123 | case ISD::FCOS: |
11124 | return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags); |
11125 | case ISD::FSIN: |
11126 | return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags); |
11127 | default: |
11128 | llvm_unreachable("Wrong trig opcode" ); |
11129 | } |
11130 | } |
11131 | |
11132 | SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const { |
11133 | AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Val&: Op); |
11134 | assert(AtomicNode->isCompareAndSwap()); |
11135 | unsigned AS = AtomicNode->getAddressSpace(); |
11136 | |
11137 | // No custom lowering required for local address space |
11138 | if (!AMDGPU::isFlatGlobalAddrSpace(AS)) |
11139 | return Op; |
11140 | |
11141 | // Non-local address space requires custom lowering for atomic compare |
11142 | // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2 |
11143 | SDLoc DL(Op); |
11144 | SDValue ChainIn = Op.getOperand(i: 0); |
11145 | SDValue Addr = Op.getOperand(i: 1); |
11146 | SDValue Old = Op.getOperand(i: 2); |
11147 | SDValue New = Op.getOperand(i: 3); |
11148 | EVT VT = Op.getValueType(); |
11149 | MVT SimpleVT = VT.getSimpleVT(); |
11150 | MVT VecType = MVT::getVectorVT(VT: SimpleVT, NumElements: 2); |
11151 | |
11152 | SDValue NewOld = DAG.getBuildVector(VT: VecType, DL, Ops: {New, Old}); |
11153 | SDValue Ops[] = { ChainIn, Addr, NewOld }; |
11154 | |
11155 | return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::ATOMIC_CMP_SWAP, dl: DL, VTList: Op->getVTList(), |
11156 | Ops, MemVT: VT, MMO: AtomicNode->getMemOperand()); |
11157 | } |
11158 | |
11159 | //===----------------------------------------------------------------------===// |
11160 | // Custom DAG optimizations |
11161 | //===----------------------------------------------------------------------===// |
11162 | |
11163 | SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, |
11164 | DAGCombinerInfo &DCI) const { |
11165 | EVT VT = N->getValueType(ResNo: 0); |
11166 | EVT ScalarVT = VT.getScalarType(); |
11167 | if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16) |
11168 | return SDValue(); |
11169 | |
11170 | SelectionDAG &DAG = DCI.DAG; |
11171 | SDLoc DL(N); |
11172 | |
11173 | SDValue Src = N->getOperand(Num: 0); |
11174 | EVT SrcVT = Src.getValueType(); |
11175 | |
11176 | // TODO: We could try to match extracting the higher bytes, which would be |
11177 | // easier if i8 vectors weren't promoted to i32 vectors, particularly after |
11178 | // types are legalized. v4i8 -> v4f32 is probably the only case to worry |
11179 | // about in practice. |
11180 | if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) { |
11181 | if (DAG.MaskedValueIsZero(Op: Src, Mask: APInt::getHighBitsSet(numBits: 32, hiBitsSet: 24))) { |
11182 | SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src); |
11183 | DCI.AddToWorklist(N: Cvt.getNode()); |
11184 | |
11185 | // For the f16 case, fold to a cast to f32 and then cast back to f16. |
11186 | if (ScalarVT != MVT::f32) { |
11187 | Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt, |
11188 | DAG.getTargetConstant(0, DL, MVT::i32)); |
11189 | } |
11190 | return Cvt; |
11191 | } |
11192 | } |
11193 | |
11194 | return SDValue(); |
11195 | } |
11196 | |
11197 | SDValue SITargetLowering::performFCopySignCombine(SDNode *N, |
11198 | DAGCombinerInfo &DCI) const { |
11199 | SDValue MagnitudeOp = N->getOperand(Num: 0); |
11200 | SDValue SignOp = N->getOperand(Num: 1); |
11201 | SelectionDAG &DAG = DCI.DAG; |
11202 | SDLoc DL(N); |
11203 | |
11204 | // f64 fcopysign is really an f32 copysign on the high bits, so replace the |
11205 | // lower half with a copy. |
11206 | // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) |
11207 | if (MagnitudeOp.getValueType() == MVT::f64) { |
11208 | SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp); |
11209 | SDValue MagLo = |
11210 | DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, |
11211 | DAG.getConstant(0, DL, MVT::i32)); |
11212 | SDValue MagHi = |
11213 | DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, |
11214 | DAG.getConstant(1, DL, MVT::i32)); |
11215 | |
11216 | SDValue HiOp = |
11217 | DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp); |
11218 | |
11219 | SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); |
11220 | |
11221 | return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); |
11222 | } |
11223 | |
11224 | if (SignOp.getValueType() != MVT::f64) |
11225 | return SDValue(); |
11226 | |
11227 | // Reduce width of sign operand, we only need the highest bit. |
11228 | // |
11229 | // fcopysign f64:x, f64:y -> |
11230 | // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1) |
11231 | // TODO: In some cases it might make sense to go all the way to f16. |
11232 | SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp); |
11233 | SDValue SignAsF32 = |
11234 | DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, |
11235 | DAG.getConstant(1, DL, MVT::i32)); |
11236 | |
11237 | return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0), |
11238 | N2: SignAsF32); |
11239 | } |
11240 | |
11241 | // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) |
11242 | // (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no |
11243 | // bits |
11244 | |
11245 | // This is a variant of |
11246 | // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), |
11247 | // |
11248 | // The normal DAG combiner will do this, but only if the add has one use since |
11249 | // that would increase the number of instructions. |
11250 | // |
11251 | // This prevents us from seeing a constant offset that can be folded into a |
11252 | // memory instruction's addressing mode. If we know the resulting add offset of |
11253 | // a pointer can be folded into an addressing offset, we can replace the pointer |
11254 | // operand with the add of new constant offset. This eliminates one of the uses, |
11255 | // and may allow the remaining use to also be simplified. |
11256 | // |
// Distribute a shift over an add/or of a constant so the constant part
// becomes a foldable addressing-mode offset (see comment block above).
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
                                               unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(Num: 0);
  SDValue N1 = N->getOperand(Num: 1);

  // We only do this to handle cases where it's profitable when there are
  // multiple uses of the add, so defer to the standard combine.
  if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
      N0->hasOneUse())
    return SDValue();

  // The shift amount must be constant.
  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val&: N1);
  if (!CN1)
    return SDValue();

  // As must the added (or or'd) offset.
  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
  if (!CAdd)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;

  // An OR is only equivalent to an ADD when the operands share no set bits.
  if (N0->getOpcode() == ISD::OR &&
      !DAG.haveNoCommonBitsSet(A: N0.getOperand(i: 0), B: N0.getOperand(i: 1)))
    return SDValue();

  // If the resulting offset is too large, we can't fold it into the
  // addressing mode offset.
  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
  Type *Ty = MemVT.getTypeForEVT(Context&: *DCI.DAG.getContext());

  AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();
  if (!isLegalAddressingMode(DL: DCI.DAG.getDataLayout(), AM, Ty, AS: AddrSpace))
    return SDValue();

  SDLoc SL(N);
  EVT VT = N->getValueType(ResNo: 0);

  // Build (add (shl x, c2), c1 << c2).
  SDValue ShlX = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: N0.getOperand(i: 0), N2: N1);
  SDValue COffset = DAG.getConstant(Val: Offset, DL: SL, VT);

  // NUW survives only if both the shift and the inner op could not wrap; an
  // OR of disjoint bits can never wrap.
  SDNodeFlags Flags;
  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
                          (N0.getOpcode() == ISD::OR ||
                           N0->getFlags().hasNoUnsignedWrap()));

  return DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ShlX, N2: COffset, Flags);
}
11308 | |
11309 | /// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset |
11310 | /// by the chain and intrinsic ID. Theoretically we would also need to check the |
11311 | /// specific intrinsic, but they all place the pointer operand first. |
11312 | static unsigned getBasePtrIndex(const MemSDNode *N) { |
11313 | switch (N->getOpcode()) { |
11314 | case ISD::STORE: |
11315 | case ISD::INTRINSIC_W_CHAIN: |
11316 | case ISD::INTRINSIC_VOID: |
11317 | return 2; |
11318 | default: |
11319 | return 1; |
11320 | } |
11321 | } |
11322 | |
11323 | SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N, |
11324 | DAGCombinerInfo &DCI) const { |
11325 | SelectionDAG &DAG = DCI.DAG; |
11326 | SDLoc SL(N); |
11327 | |
11328 | unsigned PtrIdx = getBasePtrIndex(N); |
11329 | SDValue Ptr = N->getOperand(Num: PtrIdx); |
11330 | |
11331 | // TODO: We could also do this for multiplies. |
11332 | if (Ptr.getOpcode() == ISD::SHL) { |
11333 | SDValue NewPtr = performSHLPtrCombine(N: Ptr.getNode(), AddrSpace: N->getAddressSpace(), |
11334 | MemVT: N->getMemoryVT(), DCI); |
11335 | if (NewPtr) { |
11336 | SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end()); |
11337 | |
11338 | NewOps[PtrIdx] = NewPtr; |
11339 | return SDValue(DAG.UpdateNodeOperands(N, Ops: NewOps), 0); |
11340 | } |
11341 | } |
11342 | |
11343 | return SDValue(); |
11344 | } |
11345 | |
11346 | static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) { |
11347 | return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) || |
11348 | (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) || |
11349 | (Opc == ISD::XOR && Val == 0); |
11350 | } |
11351 | |
11352 | // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This |
11353 | // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit |
11354 | // integer combine opportunities since most 64-bit operations are decomposed |
11355 | // this way. TODO: We won't want this for SALU especially if it is an inline |
11356 | // immediate. |
SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI,
    const SDLoc &SL,
    unsigned Opc, SDValue LHS,
    const ConstantSDNode *CRHS) const {
  // Split the 64-bit constant into its 32-bit halves.
  uint64_t Val = CRHS->getZExtValue();
  uint32_t ValLo = Lo_32(Value: Val);
  uint32_t ValHi = Hi_32(Value: Val);
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Split when either half folds away trivially, or when the constant would
  // have to be materialized anyway (single use and not an inline immediate).
  if ((bitOpWithConstantIsReducible(Opc, Val: ValLo) ||
       bitOpWithConstantIsReducible(Opc, Val: ValHi)) ||
      (CRHS->hasOneUse() && !TII->isInlineConstant(Imm: CRHS->getAPIntValue()))) {
    // If we need to materialize a 64-bit immediate, it will be split up later
    // anyway. Avoid creating the harder to understand 64-bit immediate
    // materialization.
    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
  }

  return SDValue();
}
11378 | |
11379 | bool llvm::isBoolSGPR(SDValue V) { |
11380 | if (V.getValueType() != MVT::i1) |
11381 | return false; |
11382 | switch (V.getOpcode()) { |
11383 | default: |
11384 | break; |
11385 | case ISD::SETCC: |
11386 | case AMDGPUISD::FP_CLASS: |
11387 | return true; |
11388 | case ISD::AND: |
11389 | case ISD::OR: |
11390 | case ISD::XOR: |
11391 | return isBoolSGPR(V: V.getOperand(i: 0)) && isBoolSGPR(V: V.getOperand(i: 1)); |
11392 | } |
11393 | return false; |
11394 | } |
11395 | |
11396 | // If a constant has all zeroes or all ones within each byte return it. |
11397 | // Otherwise return 0. |
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  // Collect 0xff for every byte of C that is entirely zero.
  uint32_t ZeroByteMask = 0;
  for (unsigned I = 0; I < 32; I += 8)
    if ((C & (0xffu << I)) == 0)
      ZeroByteMask |= 0xffu << I;

  // Every remaining (non-zero) byte must be all-ones, otherwise the constant
  // selects partial bytes and cannot be expressed as a byte permute.
  uint32_t NonZeroByteMask = ~ZeroByteMask;
  return (C & NonZeroByteMask) == NonZeroByteMask ? C : 0;
}
11410 | |
11411 | // Check if a node selects whole bytes from its operand 0 starting at a byte |
11412 | // boundary while masking the rest. Returns select mask as in the v_perm_b32 |
11413 | // or -1 if not succeeded. |
11414 | // Note byte select encoding: |
11415 | // value 0-3 selects corresponding source byte; |
11416 | // value 0xc selects zero; |
11417 | // value 0xff selects 0xff. |
static uint32_t getPermuteMask(SDValue V) {
  // Only 32-bit values can feed a v_perm_b32.
  assert(V.getValueSizeInBits() == 32);

  // Only binary nodes with a constant second operand are handled.
  if (V.getNumOperands() != 2)
    return ~0;

  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1));
  if (!N1)
    return ~0;

  uint32_t C = N1->getZExtValue();

  switch (V.getOpcode()) {
  default:
    break;
  case ISD::AND:
    // Bytes kept by the AND select themselves (identity 0x03020100); cleared
    // bytes select zero (0x0c).
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    break;

  case ISD::OR:
    // Bytes forced to all-ones select the 0xff constant; others pass through.
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ~ConstMask) | ConstMask;
    break;

  case ISD::SHL:
    // Only whole-byte shifts are representable.
    if (C % 8)
      return ~0;

    // Shift the identity selectors up; the vacated low bytes come from the
    // zero (0x0c) half of the 64-bit pattern.
    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

  case ISD::SRL:
    // Only whole-byte shifts are representable.
    if (C % 8)
      return ~0;

    // Shift the identity selectors down; the vacated high bytes come from
    // the zero (0x0c) half of the 64-bit pattern.
    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }

  return ~0;
}
11458 | |
// Combine an AND into cheaper forms: 64-bit constant splits, BFE extracts,
// fp_class folds, i1-select folds, and v_perm_b32 byte permutes.
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // Try splitting a 64-bit AND-with-constant into two 32-bit ops.
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  if (VT == MVT::i64 && CRHS) {
    if (SDValue Split
        = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::AND, LHS, CRHS))
      return Split;
  }

  if (CRHS && VT == MVT::i32) {
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    // nb = number of trailing zeroes in mask
    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
    uint64_t Mask = CRHS->getZExtValue();
    unsigned Bits = llvm::popcount(Value: Mask);
    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
        (Bits == 8 || Bits == 16) && isShiftedMask_64(Value: Mask) && !(Mask & 1)) {
      if (auto *CShift = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1))) {
        unsigned Shift = CShift->getZExtValue();
        unsigned NB = CRHS->getAPIntValue().countr_zero();
        unsigned Offset = NB + Shift;
        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
          SDLoc SL(N);
          SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
                                    LHS->getOperand(0),
                                    DAG.getConstant(Offset, SL, MVT::i32),
                                    DAG.getConstant(Bits, SL, MVT::i32));
          EVT NarrowVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: Bits);
          // The BFE result is known to fit in Bits bits.
          SDValue Ext = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT, N1: BFE,
                                    N2: DAG.getValueType(NarrowVT));
          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
          return Shl;
        }
      }
    }

    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
        isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
      uint32_t Sel = getConstantPermuteMask(C: Mask);
      if (!Sel)
        return SDValue();

      // Select 0xc for all zero bytes
      Sel = (LHS.getConstantOperandVal(i: 2) & Sel) | (~Sel & 0x0c0c0c0c);
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                         LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(Val: RHS.getOperand(i: 2))->get();

    SDValue X = LHS.getOperand(i: 0);
    SDValue Y = RHS.getOperand(i: 0);
    // The RHS compare must test fabs of the same value.
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(i: 0) != X ||
        !isTypeLegal(VT: X.getValueType()))
      return SDValue();

    if (LCC == ISD::SETO) {
      if (X != LHS.getOperand(i: 1))
        return SDValue();

      if (RCC == ISD::SETUNE) {
        const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(Val: RHS.getOperand(i: 1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        // Every class except NaNs and infinities.
        const uint32_t Mask = SIInstrFlags::N_NORMAL |
                              SIInstrFlags::N_SUBNORMAL |
                              SIInstrFlags::N_ZERO |
                              SIInstrFlags::P_ZERO |
                              SIInstrFlags::P_SUBNORMAL |
                              SIInstrFlags::P_NORMAL;

        static_assert(((~(SIInstrFlags::S_NAN |
                          SIInstrFlags::Q_NAN |
                          SIInstrFlags::N_INFINITY |
                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
                      "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                           X, DAG.getConstant(Mask, DL, MVT::i32));
      }
    }
  }

  // Canonicalize the fp_class operand onto RHS.
  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
    std::swap(a&: LHS, b&: RHS);

  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      RHS.hasOneUse()) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
    // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
    const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
    if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
        (RHS.getOperand(i: 0) == LHS.getOperand(i: 0) &&
         LHS.getOperand(i: 0) == LHS.getOperand(i: 1))) {
      const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
      unsigned NewMask = LCC == ISD::SETO ?
        Mask->getZExtValue() & ~OrdMask :
        Mask->getZExtValue() & OrdMask;

      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
                         DAG.getConstant(NewMask, DL, MVT::i32));
    }
  }

  if (VT == MVT::i32 &&
      (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    // and x, (sext cc from i1) => select cc, x, 0
    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
      std::swap(a&: LHS, b&: RHS);
    if (isBoolSGPR(RHS.getOperand(0)))
      return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
                           LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
  }

  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    uint32_t LHSMask = getPermuteMask(V: LHS);
    uint32_t RHSMask = getPermuteMask(V: RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(a&: LHSMask, b&: RHSMask);
        std::swap(a&: LHS, b&: RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check of we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte in each mask is either selector mask 0-3, or has higher
        // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
        // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
        // mask which is not 0xff wins. By anding both masks we have a correct
        // result except that 0x0c shall be corrected to give 0x0c only.
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }

        // Add 4 to each active LHS lane. It will not affect any existing 0xff
        // or 0x0c.
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
                           LHS.getOperand(0), RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  return SDValue();
}
11645 | |
11646 | // A key component of v_perm is a mapping between byte position of the src |
11647 | // operands, and the byte position of the dest. To provide such, we need: 1. the |
11648 | // node that provides x byte of the dest of the OR, and 2. the byte of the node |
11649 | // used to provide that x byte. calculateByteProvider finds which node provides |
11650 | // a certain byte of the dest of the OR, and calculateSrcByte takes that node, |
11651 | // and finds an ultimate src and byte position For example: The supported |
11652 | // LoadCombine pattern for vector loads is as follows |
11653 | // t1 |
11654 | // or |
11655 | // / \ |
11656 | // t2 t3 |
11657 | // zext shl |
11658 | // | | \ |
11659 | // t4 t5 16 |
11660 | // or anyext |
11661 | // / \ | |
11662 | // t6 t7 t8 |
11663 | // srl shl or |
11664 | // / | / \ / \ |
11665 | // t9 t10 t11 t12 t13 t14 |
11666 | // trunc* 8 trunc* 8 and and |
11667 | // | | / | | \ |
11668 | // t15 t16 t17 t18 t19 t20 |
11669 | // trunc* 255 srl -256 |
11670 | // | / \ |
11671 | // t15 t15 16 |
11672 | // |
11673 | // *In this example, the truncs are from i32->i16 |
11674 | // |
11675 | // calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3 |
11676 | // respectively. calculateSrcByte would find (given node) -> ultimate src & |
11677 | // byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3. |
11678 | // After finding the mapping, we can combine the tree into vperm t15, t16, |
11679 | // 0x05000407 |
11680 | |
11681 | // Find the source and byte position from a node. |
11682 | // \p DestByte is the byte position of the dest of the or that the src |
11683 | // ultimately provides. \p SrcIndex is the byte of the src that maps to this |
11684 | // dest of the or byte. \p Depth tracks how many recursive iterations we have |
11685 | // performed. |
11686 | static const std::optional<ByteProvider<SDValue>> |
11687 | calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0, |
11688 | unsigned Depth = 0) { |
11689 | // We may need to recursively traverse a series of SRLs |
11690 | if (Depth >= 6) |
11691 | return std::nullopt; |
11692 | |
11693 | if (Op.getValueSizeInBits() < 8) |
11694 | return std::nullopt; |
11695 | |
11696 | if (Op.getValueType().isVector()) |
11697 | return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex); |
11698 | |
11699 | switch (Op->getOpcode()) { |
11700 | case ISD::TRUNCATE: { |
11701 | return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1); |
11702 | } |
11703 | |
11704 | case ISD::SIGN_EXTEND: |
11705 | case ISD::ZERO_EXTEND: |
11706 | case ISD::SIGN_EXTEND_INREG: { |
11707 | SDValue NarrowOp = Op->getOperand(Num: 0); |
11708 | auto NarrowVT = NarrowOp.getValueType(); |
11709 | if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) { |
11710 | auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1)); |
11711 | NarrowVT = VTSign->getVT(); |
11712 | } |
11713 | if (!NarrowVT.isByteSized()) |
11714 | return std::nullopt; |
11715 | uint64_t NarrowByteWidth = NarrowVT.getStoreSize(); |
11716 | |
11717 | if (SrcIndex >= NarrowByteWidth) |
11718 | return std::nullopt; |
11719 | return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1); |
11720 | } |
11721 | |
11722 | case ISD::SRA: |
11723 | case ISD::SRL: { |
11724 | auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1)); |
11725 | if (!ShiftOp) |
11726 | return std::nullopt; |
11727 | |
11728 | uint64_t BitShift = ShiftOp->getZExtValue(); |
11729 | |
11730 | if (BitShift % 8 != 0) |
11731 | return std::nullopt; |
11732 | |
11733 | SrcIndex += BitShift / 8; |
11734 | |
11735 | return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1); |
11736 | } |
11737 | |
11738 | default: { |
11739 | return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex); |
11740 | } |
11741 | } |
11742 | llvm_unreachable("fully handled switch" ); |
11743 | } |
11744 | |
11745 | // For a byte position in the result of an Or, traverse the tree and find the |
11746 | // node (and the byte of the node) which ultimately provides this {Or, |
11747 | // BytePosition}. \p Op is the operand we are currently examining. \p Index is |
11748 | // the byte position of the Op that corresponds with the originally requested |
11749 | // byte of the Or \p Depth tracks how many recursive iterations we have |
11750 | // performed. \p StartingIndex is the originally requested byte of the Or |
static const std::optional<ByteProvider<SDValue>>
calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
                      unsigned StartingIndex = 0) {
  // Finding Src tree of RHS of or typically requires at least 1 additional
  // depth
  if (Depth > 6)
    return std::nullopt;

  // Only byte-sized scalars can be decomposed byte by byte, and the requested
  // byte must exist within the value.
  unsigned BitWidth = Op.getScalarValueSizeInBits();
  if (BitWidth % 8 != 0)
    return std::nullopt;
  if (Index > BitWidth / 8 - 1)
    return std::nullopt;

  bool IsVec = Op.getValueType().isVector();
  switch (Op.getOpcode()) {
  case ISD::OR: {
    if (IsVec)
      return std::nullopt;

    // For an OR, each byte must be wholly provided by exactly one side; the
    // other side must provide constant zero for that byte.
    auto RHS = calculateByteProvider(Op: Op.getOperand(i: 1), Index, Depth: Depth + 1,
                                     StartingIndex);
    if (!RHS)
      return std::nullopt;
    auto LHS = calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
                                     StartingIndex);
    if (!LHS)
      return std::nullopt;
    // A well formed Or will have two ByteProviders for each byte, one of which
    // is constant zero
    if (!LHS->isConstantZero() && !RHS->isConstantZero())
      return std::nullopt;
    if (!LHS || LHS->isConstantZero())
      return RHS;
    if (!RHS || RHS->isConstantZero())
      return LHS;
    return std::nullopt;
  }

  case ISD::AND: {
    if (IsVec)
      return std::nullopt;

    auto BitMaskOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
    if (!BitMaskOp)
      return std::nullopt;

    uint32_t BitMask = BitMaskOp->getZExtValue();
    // Bits we expect for our StartingIndex
    uint32_t IndexMask = 0xFF << (Index * 8);

    if ((IndexMask & BitMask) != IndexMask) {
      // If the result of the and partially provides the byte, then it
      // is not well formatted
      if (IndexMask & BitMask)
        return std::nullopt;
      // The byte is fully masked off, so the AND provides constant zero here.
      return ByteProvider<SDValue>::getConstantZero();
    }

    // The byte passes through the mask untouched; find its ultimate source.
    return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex, SrcIndex: Index);
  }

  case ISD::FSHR: {
    if (IsVec)
      return std::nullopt;

    // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
    auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
    if (!ShiftOp || Op.getValueType().isVector())
      return std::nullopt;

    uint64_t BitsProvided = Op.getValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getAPIntValue().urem(RHS: BitsProvided);
    if (BitShift % 8)
      return std::nullopt;

    // The funnel concat X:Y is twice as wide as the result, i.e.
    // 2 * BitsProvided / 8 bytes.
    uint64_t ConcatSizeInBytes = BitsProvided / 4;
    uint64_t ByteShift = BitShift / 8;

    // Locate the requested byte within the rotated X:Y concatenation, then
    // recurse into whichever operand (X = 0, Y = 1) holds it.
    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
    uint64_t BytesProvided = BitsProvided / 8;
    SDValue NextOp = Op.getOperand(i: NewIndex >= BytesProvided ? 0 : 1);
    NewIndex %= BytesProvided;
    return calculateByteProvider(Op: NextOp, Index: NewIndex, Depth: Depth + 1, StartingIndex);
  }

  case ISD::SRA:
  case ISD::SRL: {
    if (IsVec)
      return std::nullopt;

    auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    // Only byte-aligned shifts can be modeled as byte moves.
    if (BitShift % 8)
      return std::nullopt;

    auto BitsProvided = Op.getScalarValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BytesProvided = BitsProvided / 8;
    uint64_t ByteShift = BitShift / 8;
    // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
    // If the byte we are trying to provide (as tracked by index) falls in this
    // range, then the SRL provides the byte. The byte of interest of the src of
    // the SRL is Index + ByteShift
    return BytesProvided - ByteShift > Index
               ? calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex,
                                  SrcIndex: Index + ByteShift)
               : ByteProvider<SDValue>::getConstantZero();
  }

  case ISD::SHL: {
    if (IsVec)
      return std::nullopt;

    auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    uint64_t ByteShift = BitShift / 8;

    // If we are shifting by an amount greater than (or equal to)
    // the index we are trying to provide, then it provides 0s. If not,
    // then this bytes are not definitively 0s, and the corresponding byte
    // of interest is Index - ByteShift of the src
    return Index < ByteShift
               ? ByteProvider<SDValue>::getConstantZero()
               : calculateByteProvider(Op: Op.getOperand(i: 0), Index: Index - ByteShift,
                                       Depth: Depth + 1, StartingIndex);
  }
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND_INREG:
  case ISD::AssertZext:
  case ISD::AssertSext: {
    if (IsVec)
      return std::nullopt;

    // Determine the width of the pre-extension value. For SIGN_EXTEND_INREG
    // and the Assert nodes the width is carried in the VT operand, not in the
    // operand's type.
    SDValue NarrowOp = Op->getOperand(Num: 0);
    unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
        Op->getOpcode() == ISD::AssertZext ||
        Op->getOpcode() == ISD::AssertSext) {
      auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
      NarrowBitWidth = VTSign->getVT().getSizeInBits();
    }
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    // Bytes above the narrow width are known zero only for ZERO_EXTEND;
    // for the others they are sign/unknown bits, which we cannot model.
    if (Index >= NarrowByteWidth)
      return Op.getOpcode() == ISD::ZERO_EXTEND
                 ? std::optional<ByteProvider<SDValue>>(
                       ByteProvider<SDValue>::getConstantZero())
                 : std::nullopt;
    return calculateByteProvider(Op: NarrowOp, Index, Depth: Depth + 1, StartingIndex);
  }

  case ISD::TRUNCATE: {
    if (IsVec)
      return std::nullopt;

    uint64_t NarrowByteWidth = BitWidth / 8;

    // Index was already bounds-checked against BitWidth above, so any byte of
    // the truncated value maps directly onto the same byte of the source.
    if (NarrowByteWidth >= Index) {
      return calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
                                   StartingIndex);
    }

    return std::nullopt;
  }

  case ISD::CopyFromReg: {
    // A register copy is an ultimate source if it is wide enough.
    if (BitWidth / 8 > Index)
      return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);

    return std::nullopt;
  }

  case ISD::LOAD: {
    auto L = cast<LoadSDNode>(Val: Op.getNode());

    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    // If the width of the load does not reach byte we are trying to provide for
    // and it is not a ZEXTLOAD, then the load does not provide for the byte in
    // question
    if (Index >= NarrowByteWidth) {
      return L->getExtensionType() == ISD::ZEXTLOAD
                 ? std::optional<ByteProvider<SDValue>>(
                       ByteProvider<SDValue>::getConstantZero())
                 : std::nullopt;
    }

    if (NarrowByteWidth > Index) {
      return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
    }

    return std::nullopt;
  }

  case ISD::BSWAP: {
    if (IsVec)
      return std::nullopt;

    // Byte swap mirrors the byte index within the value.
    return calculateByteProvider(Op: Op->getOperand(Num: 0), Index: BitWidth / 8 - Index - 1,
                                 Depth: Depth + 1, StartingIndex);
  }

  case ISD::EXTRACT_VECTOR_ELT: {
    auto IdxOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
    if (!IdxOp)
      return std::nullopt;
    auto VecIdx = IdxOp->getZExtValue();
    auto ScalarSize = Op.getScalarValueSizeInBits();
    // For sub-dword elements, rebase Index onto the flat byte offset within
    // the source vector (8-bit: one byte per lane; 16-bit: two per lane).
    if (ScalarSize != 32) {
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
    }

    return calculateSrcByte(Op: ScalarSize == 32 ? Op : Op.getOperand(i: 0),
                            DestByte: StartingIndex, SrcIndex: Index);
  }

  case AMDGPUISD::PERM: {
    if (IsVec)
      return std::nullopt;

    auto PermMask = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
    if (!PermMask)
      return std::nullopt;

    // Extract this byte's selector from the perm mask. Selectors 0-7 pick a
    // byte from the operands; 0x0c yields constant zero; anything else
    // (e.g. sign-replication selectors) is unsupported here.
    auto IdxMask =
        (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
    if (IdxMask > 0x07 && IdxMask != 0x0c)
      return std::nullopt;

    // Selectors 4-7 address operand 0, selectors 0-3 address operand 1.
    auto NextOp = Op.getOperand(i: IdxMask > 0x03 ? 0 : 1);
    auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;

    return IdxMask != 0x0c ? calculateSrcByte(Op: NextOp, DestByte: StartingIndex, SrcIndex: NextIndex)
                           : ByteProvider<SDValue>(
                                 ByteProvider<SDValue>::getConstantZero());
  }

  default: {
    return std::nullopt;
  }
  }

  llvm_unreachable("fully handled switch");
}
12016 | |
12017 | // Returns true if the Operand is a scalar and is 16 bits |
12018 | static bool isExtendedFrom16Bits(SDValue &Operand) { |
12019 | |
12020 | switch (Operand.getOpcode()) { |
12021 | case ISD::ANY_EXTEND: |
12022 | case ISD::SIGN_EXTEND: |
12023 | case ISD::ZERO_EXTEND: { |
12024 | auto OpVT = Operand.getOperand(i: 0).getValueType(); |
12025 | return !OpVT.isVector() && OpVT.getSizeInBits() == 16; |
12026 | } |
12027 | case ISD::LOAD: { |
12028 | LoadSDNode *L = cast<LoadSDNode>(Val: Operand.getNode()); |
12029 | auto ExtType = cast<LoadSDNode>(Val: L)->getExtensionType(); |
12030 | if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD || |
12031 | ExtType == ISD::EXTLOAD) { |
12032 | auto MemVT = L->getMemoryVT(); |
12033 | return !MemVT.isVector() && MemVT.getSizeInBits() == 16; |
12034 | } |
12035 | return L->getMemoryVT().getSizeInBits() == 16; |
12036 | } |
12037 | default: |
12038 | return false; |
12039 | } |
12040 | } |
12041 | |
12042 | // Returns true if the mask matches consecutive bytes, and the first byte |
12043 | // begins at a power of 2 byte offset from 0th byte |
// Returns true if the two byte selectors in \p Mask pick consecutive bytes
// (low byte first) and the pair begins at an even byte offset, i.e. the pair
// can be addressed directly as a 16-bit operand.
static bool addresses16Bits(int Mask) {
  int LoSel = Mask & 0xff;
  int HiSel = (Mask & 0xff00) >> 8;

  assert(LoSel < 8 && HiSel < 8);
  // The selected bytes must be adjacent with increasing addresses, and the
  // pair must start on a 16-bit boundary. A counter example is taking 2
  // consecutive bytes starting at the 8th bit: we would still need code to
  // extract the 16-bit operand, so the i8 v_perm is preferable there.
  return (HiSel == LoSel + 1) && (LoSel % 2 == 0);
}
12059 | |
12060 | // Do not lower into v_perm if the operands are actually 16 bit |
12061 | // and the selected bits (based on PermMask) correspond with two |
12062 | // easily addressable 16 bit operands. |
12063 | static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, |
12064 | SDValue &OtherOp) { |
12065 | int Low16 = PermMask & 0xffff; |
12066 | int Hi16 = (PermMask & 0xffff0000) >> 16; |
12067 | |
12068 | auto TempOp = peekThroughBitcasts(V: Op); |
12069 | auto TempOtherOp = peekThroughBitcasts(V: OtherOp); |
12070 | |
12071 | auto OpIs16Bit = |
12072 | TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(Operand&: TempOp); |
12073 | if (!OpIs16Bit) |
12074 | return true; |
12075 | |
12076 | auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 || |
12077 | isExtendedFrom16Bits(Operand&: TempOtherOp); |
12078 | if (!OtherOpIs16Bit) |
12079 | return true; |
12080 | |
12081 | // Do we cleanly address both |
12082 | return !addresses16Bits(Mask: Low16) || !addresses16Bits(Mask: Hi16); |
12083 | } |
12084 | |
// Produce the 32-bit dword at byte offset 4*DWordOffset within \p Src as an
// i32, extracting / shifting / rebuilding as required by Src's type.
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
                                  unsigned DWordOffset) {
  SDValue Ret;

  auto TypeSize = Src.getValueSizeInBits().getFixedValue();
  // ByteProvider must be at least 8 bits
  assert(Src.getValueSizeInBits().isKnownMultipleOf(8));

  // At most one dword available: the whole value is (part of) dword 0.
  if (TypeSize <= 32)
    return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);

  if (Src.getValueType().isVector()) {
    auto ScalarTySize = Src.getScalarValueSizeInBits();
    auto ScalarTy = Src.getValueType().getScalarType();
    // 32-bit elements: the dword is exactly one vector element.
    if (ScalarTySize == 32) {
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
                         DAG.getConstant(DWordOffset, SL, MVT::i32));
    }
    // Wider-than-32-bit elements: extract the containing element, then shift
    // the wanted dword down to the low 32 bits.
    if (ScalarTySize > 32) {
      Ret = DAG.getNode(
          ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
          DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
      auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
      if (ShiftVal)
        Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
                          DAG.getConstant(ShiftVal, SL, MVT::i32));
      return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
    }

    // Sub-32-bit elements: gather the elements covering the requested dword
    // (fewer than a full dword's worth at a ragged tail) into a small vector,
    // then bitcast/extend that to i32.
    assert(ScalarTySize < 32);
    auto NumElements = TypeSize / ScalarTySize;
    auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
    auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
    auto NumElementsIn32 = 32 / ScalarTySize;
    auto NumAvailElements = DWordOffset < Trunc32Elements
                                ? NumElementsIn32
                                : NumElements - NormalizedTrunc;

    SmallVector<SDValue, 4> VecSrcs;
    DAG.ExtractVectorElements(Op: Src, Args&: VecSrcs, Start: DWordOffset * NumElementsIn32,
                              Count: NumAvailElements);

    Ret = DAG.getBuildVector(
        VT: MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: ScalarTySize), NumElements: NumAvailElements), DL: SL,
        Ops: VecSrcs);
    // NOTE(review): the assignment to Ret in this return is redundant.
    return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
  }

  /// Scalar Type: shift the requested dword down and truncate to i32.
  auto ShiftVal = 32 * DWordOffset;
  Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
                    DAG.getConstant(ShiftVal, SL, MVT::i32));
  return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
}
12139 | |
// Try to rewrite an i32 OR tree as a single AMDGPUISD::PERM of (at most) two
// 32-bit sources, using calculateByteProvider to map each result byte to its
// ultimate source byte.
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  [[maybe_unused]] EVT VT = N->getValueType(ResNo: 0);
  SmallVector<ByteProvider<SDValue>, 8> PermNodes;

  // VT is known to be MVT::i32, so we need to provide 4 bytes.
  assert(VT == MVT::i32);
  for (int i = 0; i < 4; i++) {
    // Find the ByteProvider that provides the ith byte of the result of OR
    std::optional<ByteProvider<SDValue>> P =
        calculateByteProvider(Op: SDValue(N, 0), Index: i, Depth: 0, /*StartingIndex = */ i);
    // TODO support constantZero
    if (!P || P->isConstantZero())
      return SDValue();

    PermNodes.push_back(Elt: *P);
  }
  if (PermNodes.size() != 4)
    return SDValue();

  // Track each distinct (source node, dword-within-source) pair by the index
  // of the first PermNode that referenced it.
  std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
  std::optional<std::pair<unsigned, unsigned>> SecondSrc;
  uint64_t PermMask = 0x00000000;
  for (size_t i = 0; i < PermNodes.size(); i++) {
    auto PermOp = PermNodes[i];
    // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
    // by sizeof(Src2) = 4
    int SrcByteAdjust = 4;

    // If the Src uses a byte from a different DWORD, then it corresponds
    // with a different source
    if (!PermOp.hasSameSrc(Other: PermNodes[FirstSrc.first]) ||
        ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
      // v_perm can merge at most two sources; bail on a third.
      if (SecondSrc)
        if (!PermOp.hasSameSrc(Other: PermNodes[SecondSrc->first]) ||
            ((PermOp.SrcOffset / 4) != SecondSrc->second))
          return SDValue();

      // Set the index of the second distinct Src node
      SecondSrc = {i, PermNodes[i].SrcOffset / 4};
      assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
      SrcByteAdjust = 0;
    }
    assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
    // The mask encoding below assumes little-endian byte numbering.
    assert(!DAG.getDataLayout().isBigEndian());
    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
  }
  SDLoc DL(N);
  SDValue Op = *PermNodes[FirstSrc.first].Src;
  Op = getDWordFromOffset(DAG, SL: DL, Src: Op, DWordOffset: FirstSrc.second);
  assert(Op.getValueSizeInBits() == 32);

  // Check that we are not just extracting the bytes in order from an op
  if (!SecondSrc) {
    int Low16 = PermMask & 0xffff;
    int Hi16 = (PermMask & 0xffff0000) >> 16;

    bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
    bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);

    // The perm op would really just produce Op. So combine into Op
    if (WellFormedLow && WellFormedHi)
      return DAG.getBitcast(VT: MVT::getIntegerVT(BitWidth: 32), V: Op);
  }

  SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;

  if (SecondSrc) {
    OtherOp = getDWordFromOffset(DAG, SL: DL, Src: OtherOp, DWordOffset: SecondSrc->second);
    assert(OtherOp.getValueSizeInBits() == 32);
  }

  // Only emit v_perm when 16-bit addressing can't do the job (see
  // hasNon16BitAccesses).
  if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {

    assert(Op.getValueType().isByteSized() &&
           OtherOp.getValueType().isByteSized());

    // If the ultimate src is less than 32 bits, then we will only be
    // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
    // CalculateByteProvider would not have returned Op as source if we
    // used a byte that is outside its ValueType. Thus, we are free to
    // ANY_EXTEND as the extended bits are dont-cares.
    Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
    OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);

    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
                       DAG.getConstant(PermMask, DL, MVT::i32));
  }
  return SDValue();
}
12230 | |
// DAG combines for ISD::OR: fp_class merging (i1), v_perm formation (i32),
// and 64-bit decomposition folds (i64).
SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  EVT VT = N->getValueType(ResNo: 0);
  if (VT == MVT::i1) {
    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
      SDValue Src = LHS.getOperand(i: 0);
      // Both classifications must test the same value.
      if (Src != RHS.getOperand(i: 0))
        return SDValue();

      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
      if (!CLHS || !CRHS)
        return SDValue();

      // Only 10 bits are used.
      static const uint32_t MaxMask = 0x3ff;

      uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                         Src, DAG.getConstant(NewMask, DL, MVT::i32));
    }

    return SDValue();
  }

  // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
  if (isa<ConstantSDNode>(Val: RHS) && LHS.hasOneUse() &&
      LHS.getOpcode() == AMDGPUISD::PERM &&
      isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
    uint32_t Sel = getConstantPermuteMask(C: N->getConstantOperandVal(Num: 1));
    if (!Sel)
      return SDValue();

    // Fold the OR'd constant into the existing perm selector.
    Sel |= LHS.getConstantOperandVal(i: 2);
    SDLoc DL(N);
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                       LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
  }

  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {

    // If all the uses of an or need to extract the individual elements, do not
    // attempt to lower into v_perm
    auto usesCombinedOperand = [](SDNode *OrUse) {
      // If we have any non-vectorized use, then it is a candidate for v_perm
      if (OrUse->getOpcode() != ISD::BITCAST ||
          !OrUse->getValueType(ResNo: 0).isVector())
        return true;

      // Otherwise, scan the users of the bitcast result vector for a use
      // that consumes it as a whole.
      for (auto VUse : OrUse->uses()) {
        if (!VUse->getValueType(ResNo: 0).isVector())
          return true;

        // If the use of a vector is a store, then combining via a v_perm
        // is beneficial.
        // TODO -- whitelist more uses
        for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
          if (VUse->getOpcode() == VectorwiseOp)
            return true;
      }
      return false;
    };

    if (!any_of(Range: N->uses(), P: usesCombinedOperand))
      return SDValue();

    uint32_t LHSMask = getPermuteMask(V: LHS);
    uint32_t RHSMask = getPermuteMask(V: RHS);

    // Both sides are simple byte-permute sources: merge their masks directly.
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(a&: LHSMask, b&: RHSMask);
        std::swap(a&: LHS, b&: RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check of we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Kill zero bytes selected by other mask. Zero value is 0xc.
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        // Add 4 to each active LHS lane
        LHSMask |= LHSUsedLanes & 0x04040404;
        // Combine masks
        uint32_t Sel = LHSMask | RHSMask;
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
                           LHS.getOperand(0), RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
    // Otherwise try the general byte-provider analysis.
    if (LHSMask == ~0u || RHSMask == ~0u) {
      if (SDValue Perm = matchPERM(N, DCI))
        return Perm;
    }
  }

  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
    return SDValue();

  // TODO: This could be a generic combine with a predicate for extracting the
  // high half of an integer being free.

  // (or i64:x, (zero_extend i32:y)) ->
  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
      RHS.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(a&: LHS, b&: RHS);

  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue ExtSrc = RHS.getOperand(i: 0);
    EVT SrcVT = ExtSrc.getValueType();
    if (SrcVT == MVT::i32) {
      SDLoc SL(N);
      SDValue LowLHS, HiBits;
      std::tie(args&: LowLHS, args&: HiBits) = split64BitValue(Op: LHS, DAG);
      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);

      DCI.AddToWorklist(N: LowOr.getNode());
      DCI.AddToWorklist(N: HiBits.getNode());

      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                LowOr, HiBits);
      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    }
  }

  // or i64:x, C -> split into two 32-bit ors if profitable.
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
  if (CRHS) {
    if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::OR,
                                     LHS: N->getOperand(Num: 0), CRHS))
      return Split;
  }

  return SDValue();
}
12389 | |
12390 | SDValue SITargetLowering::performXorCombine(SDNode *N, |
12391 | DAGCombinerInfo &DCI) const { |
12392 | if (SDValue RV = reassociateScalarOps(N, DAG&: DCI.DAG)) |
12393 | return RV; |
12394 | |
12395 | SDValue LHS = N->getOperand(Num: 0); |
12396 | SDValue RHS = N->getOperand(Num: 1); |
12397 | |
12398 | const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS); |
12399 | SelectionDAG &DAG = DCI.DAG; |
12400 | |
12401 | EVT VT = N->getValueType(ResNo: 0); |
12402 | if (CRHS && VT == MVT::i64) { |
12403 | if (SDValue Split |
12404 | = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::XOR, LHS, CRHS)) |
12405 | return Split; |
12406 | } |
12407 | |
12408 | // Make sure to apply the 64-bit constant splitting fold before trying to fold |
12409 | // fneg-like xors into 64-bit select. |
12410 | if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) { |
12411 | // This looks like an fneg, try to fold as a source modifier. |
12412 | if (CRHS && CRHS->getAPIntValue().isSignMask() && |
12413 | shouldFoldFNegIntoSrc(FNeg: N, FNegSrc: LHS)) { |
12414 | // xor (select c, a, b), 0x80000000 -> |
12415 | // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b))) |
12416 | SDLoc DL(N); |
12417 | SDValue CastLHS = |
12418 | DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1)); |
12419 | SDValue CastRHS = |
12420 | DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2)); |
12421 | SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS); |
12422 | SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS); |
12423 | SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32, |
12424 | LHS->getOperand(0), FNegLHS, FNegRHS); |
12425 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewSelect); |
12426 | } |
12427 | } |
12428 | |
12429 | return SDValue(); |
12430 | } |
12431 | |
12432 | SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, |
12433 | DAGCombinerInfo &DCI) const { |
12434 | if (!Subtarget->has16BitInsts() || |
12435 | DCI.getDAGCombineLevel() < AfterLegalizeDAG) |
12436 | return SDValue(); |
12437 | |
12438 | EVT VT = N->getValueType(ResNo: 0); |
12439 | if (VT != MVT::i32) |
12440 | return SDValue(); |
12441 | |
12442 | SDValue Src = N->getOperand(Num: 0); |
12443 | if (Src.getValueType() != MVT::i16) |
12444 | return SDValue(); |
12445 | |
12446 | return SDValue(); |
12447 | } |
12448 | |
12449 | SDValue |
12450 | SITargetLowering::performSignExtendInRegCombine(SDNode *N, |
12451 | DAGCombinerInfo &DCI) const { |
12452 | SDValue Src = N->getOperand(Num: 0); |
12453 | auto *VTSign = cast<VTSDNode>(Val: N->getOperand(Num: 1)); |
12454 | |
12455 | // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them |
12456 | // with s_buffer_load_i8 and s_buffer_load_i16 respectively. |
12457 | if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE && |
12458 | VTSign->getVT() == MVT::i8) || |
12459 | (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT && |
12460 | VTSign->getVT() == MVT::i16))) { |
12461 | assert(Subtarget->hasScalarSubwordLoads() && |
12462 | "s_buffer_load_{u8, i8} are supported " |
12463 | "in GFX12 (or newer) architectures." ); |
12464 | EVT VT = Src.getValueType(); |
12465 | unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE) |
12466 | ? AMDGPUISD::SBUFFER_LOAD_BYTE |
12467 | : AMDGPUISD::SBUFFER_LOAD_SHORT; |
12468 | SDLoc DL(N); |
12469 | SDVTList ResList = DCI.DAG.getVTList(MVT::i32); |
12470 | SDValue Ops[] = { |
12471 | Src.getOperand(i: 0), // source register |
12472 | Src.getOperand(i: 1), // offset |
12473 | Src.getOperand(i: 2) // cachePolicy |
12474 | }; |
12475 | auto *M = cast<MemSDNode>(Val&: Src); |
12476 | SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode( |
12477 | Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand()); |
12478 | SDValue LoadVal = DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad); |
12479 | return LoadVal; |
12480 | } else if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE && |
12481 | VTSign->getVT() == MVT::i8) || |
12482 | (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT && |
12483 | VTSign->getVT() == MVT::i16)) && |
12484 | Src.hasOneUse()) { |
12485 | auto *M = cast<MemSDNode>(Val&: Src); |
12486 | SDValue Ops[] = { |
12487 | Src.getOperand(i: 0), // Chain |
12488 | Src.getOperand(i: 1), // rsrc |
12489 | Src.getOperand(i: 2), // vindex |
12490 | Src.getOperand(i: 3), // voffset |
12491 | Src.getOperand(i: 4), // soffset |
12492 | Src.getOperand(i: 5), // offset |
12493 | Src.getOperand(i: 6), |
12494 | Src.getOperand(i: 7) |
12495 | }; |
12496 | // replace with BUFFER_LOAD_BYTE/SHORT |
12497 | SDVTList ResList = DCI.DAG.getVTList(MVT::i32, |
12498 | Src.getOperand(0).getValueType()); |
12499 | unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ? |
12500 | AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT; |
12501 | SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opcode: Opc, dl: SDLoc(N), |
12502 | VTList: ResList, |
12503 | Ops, MemVT: M->getMemoryVT(), |
12504 | MMO: M->getMemOperand()); |
12505 | return DCI.DAG.getMergeValues(Ops: {BufferLoadSignExt, |
12506 | BufferLoadSignExt.getValue(R: 1)}, dl: SDLoc(N)); |
12507 | } |
12508 | return SDValue(); |
12509 | } |
12510 | |
12511 | SDValue SITargetLowering::performClassCombine(SDNode *N, |
12512 | DAGCombinerInfo &DCI) const { |
12513 | SelectionDAG &DAG = DCI.DAG; |
12514 | SDValue Mask = N->getOperand(Num: 1); |
12515 | |
12516 | // fp_class x, 0 -> false |
12517 | if (isNullConstant(Mask)) |
12518 | return DAG.getConstant(0, SDLoc(N), MVT::i1); |
12519 | |
12520 | if (N->getOperand(0).isUndef()) |
12521 | return DAG.getUNDEF(MVT::i1); |
12522 | |
12523 | return SDValue(); |
12524 | } |
12525 | |
12526 | SDValue SITargetLowering::performRcpCombine(SDNode *N, |
12527 | DAGCombinerInfo &DCI) const { |
12528 | EVT VT = N->getValueType(ResNo: 0); |
12529 | SDValue N0 = N->getOperand(Num: 0); |
12530 | |
12531 | if (N0.isUndef()) { |
12532 | return DCI.DAG.getConstantFP( |
12533 | Val: APFloat::getQNaN(Sem: SelectionDAG::EVTToAPFloatSemantics(VT)), DL: SDLoc(N), |
12534 | VT); |
12535 | } |
12536 | |
12537 | if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP || |
12538 | N0.getOpcode() == ISD::SINT_TO_FP)) { |
12539 | return DCI.DAG.getNode(Opcode: AMDGPUISD::RCP_IFLAG, DL: SDLoc(N), VT, Operand: N0, |
12540 | Flags: N->getFlags()); |
12541 | } |
12542 | |
12543 | // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here. |
12544 | if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) && |
12545 | N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) { |
12546 | return DCI.DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(N), VT, |
12547 | Operand: N0.getOperand(i: 0), Flags: N->getFlags()); |
12548 | } |
12549 | |
12550 | return AMDGPUTargetLowering::performRcpCombine(N, DCI); |
12551 | } |
12552 | |
// Return true when \p Op is known to already carry a canonical floating-point
// encoding (no signaling NaNs; denormals only if the function's denormal mode
// keeps them), so that fcanonicalize of it would be a no-op. Recurses through
// value-preserving nodes up to \p MaxDepth.
bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
                                       unsigned MaxDepth) const {
  unsigned Opcode = Op.getOpcode();
  // The result of an explicit fcanonicalize is canonical by definition.
  if (Opcode == ISD::FCANONICALIZE)
    return true;

  // Constants: a signaling NaN is never canonical; a denormal constant is
  // canonical only when the function uses IEEE denormal handling for this
  // semantics; everything else is canonical.
  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
    const auto &F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
      return false;
    if (!F.isDenormal())
      return true;

    DenormalMode Mode =
        DAG.getMachineFunction().getDenormalMode(FPType: F.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  // If source is a result of another standard FP operation it is already in
  // canonical form.
  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // These will flush denorms if required.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FCEIL:
  case ISD::FFLOOR:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FSQRT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FP_ROUND:
  case ISD::FP_EXTEND:
  case ISD::FP16_TO_FP:
  case ISD::FP_TO_FP16:
  case ISD::BF16_TO_FP:
  case ISD::FP_TO_BF16:
  case ISD::FLDEXP:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LOG:
  case AMDGPUISD::EXP:
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
  case AMDGPUISD::FP_TO_FP16:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW:
    return true;

  // It can/will be lowered or combined as a bit operation.
  // Need to check their input recursively to handle.
  case ISD::FNEG:
  case ISD::FABS:
  case ISD::FCOPYSIGN:
    return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);

  case ISD::AND:
    if (Op.getValueType() == MVT::i32) {
      // Be careful as we only know it is a bitcast floating point type. It
      // could be f32, v2f16, we have no way of knowing. Luckily the constant
      // value that we optimize for, which comes up in fp32 to bf16 conversions,
      // is valid to optimize for all types.
      if (auto *RHS = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
        if (RHS->getZExtValue() == 0xffff0000) {
          return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
        }
      }
    }
    break;

  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FSINCOS:
    // f16 trig results are not treated as canonical here.
    return Op.getValueType().getScalarType() != MVT::f16;

  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMINIMUM3: {
    // FIXME: Shouldn't treat the generic operations different based these.
    // However, we aren't really required to flush the result from
    // minnum/maxnum..

    // snans will be quieted, so we only need to worry about denormals.
    if (Subtarget->supportsMinMaxDenormModes() ||
        // FIXME: denormalsEnabledForType is broken for dynamic
        denormalsEnabledForType(DAG, VT: Op.getValueType()))
      return true;

    // Flushing may be required.
    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
    // targets need to check their input recursively.

    // FIXME: Does this apply with clamp? It's implemented with max.
    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
      if (!isCanonicalized(DAG, Op: Op.getOperand(i: I), MaxDepth: MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::SELECT: {
    // A select is canonical iff both selectable values are.
    return isCanonicalized(DAG, Op: Op.getOperand(i: 1), MaxDepth: MaxDepth - 1) &&
           isCanonicalized(DAG, Op: Op.getOperand(i: 2), MaxDepth: MaxDepth - 1);
  }
  case ISD::BUILD_VECTOR: {
    // Canonical iff every element is.
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      SDValue SrcOp = Op.getOperand(i);
      if (!isCanonicalized(DAG, Op: SrcOp, MaxDepth: MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::EXTRACT_VECTOR_ELT:
  case ISD::EXTRACT_SUBVECTOR: {
    return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
  }
  case ISD::INSERT_VECTOR_ELT: {
    // Both the base vector and the inserted element must be canonical.
    return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1) &&
           isCanonicalized(DAG, Op: Op.getOperand(i: 1), MaxDepth: MaxDepth - 1);
  }
  case ISD::UNDEF:
    // Could be anything.
    return false;

  case ISD::BITCAST:
    // TODO: This is incorrect as it loses track of the operand's type. We may
    // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
    // same bits that are canonicalized in one type need not be in the other.
    return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
  case ISD::TRUNCATE: {
    // Hack round the mess we make when legalizing extract_vector_elt
    if (Op.getValueType() == MVT::i16) {
      SDValue TruncSrc = Op.getOperand(i: 0);
      if (TruncSrc.getValueType() == MVT::i32 &&
          TruncSrc.getOpcode() == ISD::BITCAST &&
          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
        return isCanonicalized(DAG, Op: TruncSrc.getOperand(i: 0), MaxDepth: MaxDepth - 1);
      }
    }
    return false;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
      return true;
    default:
      break;
    }

    break;
  }
  default:
    break;
  }

  // Conservative fallback for unhandled opcodes: accept only when denormals
  // are enabled for the type (so nothing would be flushed) and the value is
  // known not to be a signaling NaN.
  // FIXME: denormalsEnabledForType is broken for dynamic
  return denormalsEnabledForType(DAG, VT: Op.getValueType()) &&
         DAG.isKnownNeverSNaN(Op);
}
12752 | |
// GlobalISel counterpart of the SDValue isCanonicalized: return true when the
// value defining \p Reg is known to already be canonical, recursing through
// value-preserving generic instructions up to \p MaxDepth.
bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
                                       unsigned MaxDepth) const {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineInstr *MI = MRI.getVRegDef(Reg);
  unsigned Opcode = MI->getOpcode();

  // The result of an explicit canonicalize is canonical by definition.
  if (Opcode == AMDGPU::G_FCANONICALIZE)
    return true;

  std::optional<FPValueAndVReg> FCR;
  // Constant splat (can be padded with undef) or scalar constant.
  // Same constant rules as the DAG variant: no sNaNs; denormals only in IEEE
  // denormal mode.
  if (mi_match(R: Reg, MRI, P: MIPatternMatch::m_GFCstOrSplat(FPValReg&: FCR))) {
    if (FCR->Value.isSignaling())
      return false;
    if (!FCR->Value.isDenormal())
      return true;

    DenormalMode Mode = MF.getDenormalMode(FPType: FCR->Value.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  // Recursion budget exhausted; be conservative.
  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // FP operations whose results are treated as canonical.
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case AMDGPU::G_FPOW:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FLOG:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FLOG10:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
    return true;
  // Sign-bit operations preserve the payload; check the source instead.
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FCOPYSIGN:
    return isCanonicalized(Reg: MI->getOperand(i: 1).getReg(), MF, MaxDepth: MaxDepth - 1);
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM: {
    // min/max results are accepted outright when the subtarget's min/max
    // handles denormal modes, or denormals are enabled for the type;
    // otherwise fall through and require every source to be canonical.
    if (Subtarget->supportsMinMaxDenormModes() ||
        // FIXME: denormalsEnabledForType is broken for dynamic
        denormalsEnabledForType(Ty: MRI.getType(Reg), MF))
      return true;

    [[fallthrough]];
  }
  case AMDGPU::G_BUILD_VECTOR:
    // Canonical iff every element (all operands after the def) is.
    for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands()))
      if (!isCanonicalized(Reg: MO.getReg(), MF, MaxDepth: MaxDepth - 1))
        return false;
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    // Whitelisted target intrinsics whose results are treated as canonical.
    switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_div_scale:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_trig_preop:
      return true;
    default:
      break;
    }

    [[fallthrough]];
  default:
    return false;
  }

  // All switch cases return; control never reaches here.
  llvm_unreachable("invalid operation" );
}
12868 | |
12869 | // Constant fold canonicalize. |
12870 | SDValue SITargetLowering::getCanonicalConstantFP( |
12871 | SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const { |
12872 | // Flush denormals to 0 if not enabled. |
12873 | if (C.isDenormal()) { |
12874 | DenormalMode Mode = |
12875 | DAG.getMachineFunction().getDenormalMode(FPType: C.getSemantics()); |
12876 | if (Mode == DenormalMode::getPreserveSign()) { |
12877 | return DAG.getConstantFP( |
12878 | Val: APFloat::getZero(Sem: C.getSemantics(), Negative: C.isNegative()), DL: SL, VT); |
12879 | } |
12880 | |
12881 | if (Mode != DenormalMode::getIEEE()) |
12882 | return SDValue(); |
12883 | } |
12884 | |
12885 | if (C.isNaN()) { |
12886 | APFloat CanonicalQNaN = APFloat::getQNaN(Sem: C.getSemantics()); |
12887 | if (C.isSignaling()) { |
12888 | // Quiet a signaling NaN. |
12889 | // FIXME: Is this supposed to preserve payload bits? |
12890 | return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT); |
12891 | } |
12892 | |
12893 | // Make sure it is the canonical NaN bitpattern. |
12894 | // |
12895 | // TODO: Can we use -1 as the canonical NaN value since it's an inline |
12896 | // immediate? |
12897 | if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt()) |
12898 | return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT); |
12899 | } |
12900 | |
12901 | // Already canonical. |
12902 | return DAG.getConstantFP(Val: C, DL: SL, VT); |
12903 | } |
12904 | |
12905 | static bool vectorEltWillFoldAway(SDValue Op) { |
12906 | return Op.isUndef() || isa<ConstantFPSDNode>(Val: Op); |
12907 | } |
12908 | |
12909 | SDValue SITargetLowering::performFCanonicalizeCombine( |
12910 | SDNode *N, |
12911 | DAGCombinerInfo &DCI) const { |
12912 | SelectionDAG &DAG = DCI.DAG; |
12913 | SDValue N0 = N->getOperand(Num: 0); |
12914 | EVT VT = N->getValueType(ResNo: 0); |
12915 | |
12916 | // fcanonicalize undef -> qnan |
12917 | if (N0.isUndef()) { |
12918 | APFloat QNaN = APFloat::getQNaN(Sem: SelectionDAG::EVTToAPFloatSemantics(VT)); |
12919 | return DAG.getConstantFP(Val: QNaN, DL: SDLoc(N), VT); |
12920 | } |
12921 | |
12922 | if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N: N0)) { |
12923 | EVT VT = N->getValueType(ResNo: 0); |
12924 | return getCanonicalConstantFP(DAG, SL: SDLoc(N), VT, C: CFP->getValueAPF()); |
12925 | } |
12926 | |
12927 | // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x), |
12928 | // (fcanonicalize k) |
12929 | // |
12930 | // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0 |
12931 | |
12932 | // TODO: This could be better with wider vectors that will be split to v2f16, |
12933 | // and to consider uses since there aren't that many packed operations. |
12934 | if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 && |
12935 | isTypeLegal(MVT::v2f16)) { |
12936 | SDLoc SL(N); |
12937 | SDValue NewElts[2]; |
12938 | SDValue Lo = N0.getOperand(i: 0); |
12939 | SDValue Hi = N0.getOperand(i: 1); |
12940 | EVT EltVT = Lo.getValueType(); |
12941 | |
12942 | if (vectorEltWillFoldAway(Op: Lo) || vectorEltWillFoldAway(Op: Hi)) { |
12943 | for (unsigned I = 0; I != 2; ++I) { |
12944 | SDValue Op = N0.getOperand(i: I); |
12945 | if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) { |
12946 | NewElts[I] = getCanonicalConstantFP(DAG, SL, VT: EltVT, |
12947 | C: CFP->getValueAPF()); |
12948 | } else if (Op.isUndef()) { |
12949 | // Handled below based on what the other operand is. |
12950 | NewElts[I] = Op; |
12951 | } else { |
12952 | NewElts[I] = DAG.getNode(Opcode: ISD::FCANONICALIZE, DL: SL, VT: EltVT, Operand: Op); |
12953 | } |
12954 | } |
12955 | |
12956 | // If one half is undef, and one is constant, prefer a splat vector rather |
12957 | // than the normal qNaN. If it's a register, prefer 0.0 since that's |
12958 | // cheaper to use and may be free with a packed operation. |
12959 | if (NewElts[0].isUndef()) { |
12960 | if (isa<ConstantFPSDNode>(Val: NewElts[1])) |
12961 | NewElts[0] = isa<ConstantFPSDNode>(Val: NewElts[1]) ? |
12962 | NewElts[1]: DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT); |
12963 | } |
12964 | |
12965 | if (NewElts[1].isUndef()) { |
12966 | NewElts[1] = isa<ConstantFPSDNode>(Val: NewElts[0]) ? |
12967 | NewElts[0] : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT); |
12968 | } |
12969 | |
12970 | return DAG.getBuildVector(VT, DL: SL, Ops: NewElts); |
12971 | } |
12972 | } |
12973 | |
12974 | return SDValue(); |
12975 | } |
12976 | |
12977 | static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { |
12978 | switch (Opc) { |
12979 | case ISD::FMAXNUM: |
12980 | case ISD::FMAXNUM_IEEE: |
12981 | return AMDGPUISD::FMAX3; |
12982 | case ISD::FMAXIMUM: |
12983 | return AMDGPUISD::FMAXIMUM3; |
12984 | case ISD::SMAX: |
12985 | return AMDGPUISD::SMAX3; |
12986 | case ISD::UMAX: |
12987 | return AMDGPUISD::UMAX3; |
12988 | case ISD::FMINNUM: |
12989 | case ISD::FMINNUM_IEEE: |
12990 | return AMDGPUISD::FMIN3; |
12991 | case ISD::FMINIMUM: |
12992 | return AMDGPUISD::FMINIMUM3; |
12993 | case ISD::SMIN: |
12994 | return AMDGPUISD::SMIN3; |
12995 | case ISD::UMIN: |
12996 | return AMDGPUISD::UMIN3; |
12997 | default: |
12998 | llvm_unreachable("Not a min/max opcode" ); |
12999 | } |
13000 | } |
13001 | |
13002 | SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG, |
13003 | const SDLoc &SL, SDValue Src, |
13004 | SDValue MinVal, |
13005 | SDValue MaxVal, |
13006 | bool Signed) const { |
13007 | |
13008 | // med3 comes from |
13009 | // min(max(x, K0), K1), K0 < K1 |
13010 | // max(min(x, K0), K1), K1 < K0 |
13011 | // |
13012 | // "MinVal" and "MaxVal" respectively refer to the rhs of the |
13013 | // min/max op. |
13014 | ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(Val&: MinVal); |
13015 | ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(Val&: MaxVal); |
13016 | |
13017 | if (!MinK || !MaxK) |
13018 | return SDValue(); |
13019 | |
13020 | if (Signed) { |
13021 | if (MaxK->getAPIntValue().sge(RHS: MinK->getAPIntValue())) |
13022 | return SDValue(); |
13023 | } else { |
13024 | if (MaxK->getAPIntValue().uge(RHS: MinK->getAPIntValue())) |
13025 | return SDValue(); |
13026 | } |
13027 | |
13028 | EVT VT = MinK->getValueType(ResNo: 0); |
13029 | unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3; |
13030 | if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) |
13031 | return DAG.getNode(Opcode: Med3Opc, DL: SL, VT, N1: Src, N2: MaxVal, N3: MinVal); |
13032 | |
13033 | // Note: we could also extend to i32 and use i32 med3 if i16 med3 is |
13034 | // not available, but this is unlikely to be profitable as constants |
13035 | // will often need to be materialized & extended, especially on |
13036 | // pre-GFX10 where VOP3 instructions couldn't take literal operands. |
13037 | return SDValue(); |
13038 | } |
13039 | |
13040 | static ConstantFPSDNode *getSplatConstantFP(SDValue Op) { |
13041 | if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op)) |
13042 | return C; |
13043 | |
13044 | if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Val&: Op)) { |
13045 | if (ConstantFPSDNode *C = BV->getConstantFPSplatNode()) |
13046 | return C; |
13047 | } |
13048 | |
13049 | return nullptr; |
13050 | } |
13051 | |
13052 | SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, |
13053 | const SDLoc &SL, |
13054 | SDValue Op0, |
13055 | SDValue Op1) const { |
13056 | ConstantFPSDNode *K1 = getSplatConstantFP(Op: Op1); |
13057 | if (!K1) |
13058 | return SDValue(); |
13059 | |
13060 | ConstantFPSDNode *K0 = getSplatConstantFP(Op: Op0.getOperand(i: 1)); |
13061 | if (!K0) |
13062 | return SDValue(); |
13063 | |
13064 | // Ordered >= (although NaN inputs should have folded away by now). |
13065 | if (K0->getValueAPF() > K1->getValueAPF()) |
13066 | return SDValue(); |
13067 | |
13068 | const MachineFunction &MF = DAG.getMachineFunction(); |
13069 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
13070 | |
13071 | // TODO: Check IEEE bit enabled? |
13072 | EVT VT = Op0.getValueType(); |
13073 | if (Info->getMode().DX10Clamp) { |
13074 | // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the |
13075 | // hardware fmed3 behavior converting to a min. |
13076 | // FIXME: Should this be allowing -0.0? |
13077 | if (K1->isExactlyValue(V: 1.0) && K0->isExactlyValue(V: 0.0)) |
13078 | return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Op0.getOperand(i: 0)); |
13079 | } |
13080 | |
13081 | // med3 for f16 is only available on gfx9+, and not available for v2f16. |
13082 | if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) { |
13083 | // This isn't safe with signaling NaNs because in IEEE mode, min/max on a |
13084 | // signaling NaN gives a quiet NaN. The quiet NaN input to the min would |
13085 | // then give the other result, which is different from med3 with a NaN |
13086 | // input. |
13087 | SDValue Var = Op0.getOperand(i: 0); |
13088 | if (!DAG.isKnownNeverSNaN(Op: Var)) |
13089 | return SDValue(); |
13090 | |
13091 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
13092 | |
13093 | if ((!K0->hasOneUse() || TII->isInlineConstant(Imm: K0->getValueAPF())) && |
13094 | (!K1->hasOneUse() || TII->isInlineConstant(Imm: K1->getValueAPF()))) { |
13095 | return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT: K0->getValueType(ResNo: 0), |
13096 | N1: Var, N2: SDValue(K0, 0), N3: SDValue(K1, 0)); |
13097 | } |
13098 | } |
13099 | |
13100 | return SDValue(); |
13101 | } |
13102 | |
// Combine nested min/max nodes into the three-operand min3/max3 forms, and
// min/max chains with constant bounds into med3/fmed3 where profitable.
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(ResNo: 0);
  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(Num: 0);
  SDValue Op1 = N->getOperand(Num: 1);

  // Only do this if the inner op has one use since this will just increases
  // register pressure for no benefit.

  // min3/max3 formation: excluded for the LEGACY opcodes and vector types,
  // and restricted to 32-bit types, or 16-bit types when the subtarget
  // provides 16-bit min3/max3.
  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
      !VT.isVector() &&
      (VT == MVT::i32 || VT == MVT::f32 ||
       ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
    // max(max(a, b), c) -> max3(a, b, c)
    // min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc),
                         DL,
                         VT: N->getValueType(ResNo: 0),
                         N1: Op0.getOperand(i: 0),
                         N2: Op0.getOperand(i: 1),
                         N3: Op1);
    }

    // Try commuted.
    // max(a, max(b, c)) -> max3(a, b, c)
    // min(a, min(b, c)) -> min3(a, b, c)
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc),
                         DL,
                         VT: N->getValueType(ResNo: 0),
                         N1: Op0,
                         N2: Op1.getOperand(i: 0),
                         N3: Op1.getOperand(i: 1));
    }
  }

  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
  // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
  // Note the MinVal/MaxVal argument order differs between the min-outer and
  // max-outer forms; performIntMed3ImmCombine checks MaxK < MinK.
  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: true))
      return Med3;
  }
  if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: true))
      return Med3;
  }

  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: false))
      return Med3;
  }
  if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: false))
      return Med3;
  }

  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
       (Opc == AMDGPUISD::FMIN_LEGACY &&
        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
      (VT == MVT::f32 || VT == MVT::f64 ||
       (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
       (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
      Op0.hasOneUse()) {
    if (SDValue Res = performFPMed3ImmCombine(DAG, SL: SDLoc(N), Op0, Op1))
      return Res;
  }

  return SDValue();
}
13184 | |
13185 | static bool isClampZeroToOne(SDValue A, SDValue B) { |
13186 | if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(Val&: A)) { |
13187 | if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(Val&: B)) { |
13188 | // FIXME: Should this be allowing -0.0? |
13189 | return (CA->isExactlyValue(V: 0.0) && CB->isExactlyValue(V: 1.0)) || |
13190 | (CA->isExactlyValue(V: 1.0) && CB->isExactlyValue(V: 0.0)); |
13191 | } |
13192 | } |
13193 | |
13194 | return false; |
13195 | } |
13196 | |
// Fold fmed3 with a (0.0, 1.0) constant pair into a clamp of the remaining
// operand.
// FIXME: Should only worry about snans for version with chain.
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(ResNo: 0);
  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
  // NaNs. With a NaN input, the order of the operands may change the result.

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue Src0 = N->getOperand(Num: 0);
  SDValue Src1 = N->getOperand(Num: 1);
  SDValue Src2 = N->getOperand(Num: 2);

  if (isClampZeroToOne(A: Src0, B: Src1)) {
    // const_a, const_b, x -> clamp is safe in all cases including signaling
    // nans.
    // FIXME: Should this be allowing -0.0?
    return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src2);
  }

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
  // handling no dx10-clamp?
  if (Info->getMode().DX10Clamp) {
    // If NaNs is clamped to 0, we are free to reorder the inputs.

    // Bubble the constant operands toward Src1/Src2 (three adjacent swaps
    // act as a small sort) so a (0.0, 1.0) pair ends up in the last two
    // slots where isClampZeroToOne can recognize it.
    if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
      std::swap(a&: Src0, b&: Src1);

    if (isa<ConstantFPSDNode>(Val: Src1) && !isa<ConstantFPSDNode>(Val: Src2))
      std::swap(a&: Src1, b&: Src2);

    if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
      std::swap(a&: Src0, b&: Src1);

    if (isClampZeroToOne(A: Src1, B: Src2))
      return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src0);
  }

  return SDValue();
}
13241 | |
13242 | SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, |
13243 | DAGCombinerInfo &DCI) const { |
13244 | SDValue Src0 = N->getOperand(Num: 0); |
13245 | SDValue Src1 = N->getOperand(Num: 1); |
13246 | if (Src0.isUndef() && Src1.isUndef()) |
13247 | return DCI.DAG.getUNDEF(VT: N->getValueType(ResNo: 0)); |
13248 | return SDValue(); |
13249 | } |
13250 | |
13251 | // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be |
13252 | // expanded into a set of cmp/select instructions. |
13253 | bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize, |
13254 | unsigned NumElem, |
13255 | bool IsDivergentIdx, |
13256 | const GCNSubtarget *Subtarget) { |
13257 | if (UseDivergentRegisterIndexing) |
13258 | return false; |
13259 | |
13260 | unsigned VecSize = EltSize * NumElem; |
13261 | |
13262 | // Sub-dword vectors of size 2 dword or less have better implementation. |
13263 | if (VecSize <= 64 && EltSize < 32) |
13264 | return false; |
13265 | |
13266 | // Always expand the rest of sub-dword instructions, otherwise it will be |
13267 | // lowered via memory. |
13268 | if (EltSize < 32) |
13269 | return true; |
13270 | |
13271 | // Always do this if var-idx is divergent, otherwise it will become a loop. |
13272 | if (IsDivergentIdx) |
13273 | return true; |
13274 | |
13275 | // Large vectors would yield too many compares and v_cndmask_b32 instructions. |
13276 | unsigned NumInsts = NumElem /* Number of compares */ + |
13277 | ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */; |
13278 | |
13279 | // On some architectures (GFX9) movrel is not available and it's better |
13280 | // to expand. |
13281 | if (!Subtarget->hasMovrel()) |
13282 | return NumInsts <= 16; |
13283 | |
13284 | // If movrel is available, use it instead of expanding for vector of 8 |
13285 | // elements. |
13286 | return NumInsts <= 15; |
13287 | } |
13288 | |
13289 | bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const { |
13290 | SDValue Idx = N->getOperand(Num: N->getNumOperands() - 1); |
13291 | if (isa<ConstantSDNode>(Val: Idx)) |
13292 | return false; |
13293 | |
13294 | SDValue Vec = N->getOperand(Num: 0); |
13295 | EVT VecVT = Vec.getValueType(); |
13296 | EVT EltVT = VecVT.getVectorElementType(); |
13297 | unsigned EltSize = EltVT.getSizeInBits(); |
13298 | unsigned NumElem = VecVT.getVectorNumElements(); |
13299 | |
13300 | return SITargetLowering::shouldExpandVectorDynExt( |
13301 | EltSize, NumElem, IsDivergentIdx: Idx->isDivergent(), Subtarget: getSubtarget()); |
13302 | } |
13303 | |
13304 | SDValue SITargetLowering::( |
13305 | SDNode *N, DAGCombinerInfo &DCI) const { |
13306 | SDValue Vec = N->getOperand(Num: 0); |
13307 | SelectionDAG &DAG = DCI.DAG; |
13308 | |
13309 | EVT VecVT = Vec.getValueType(); |
13310 | EVT VecEltVT = VecVT.getVectorElementType(); |
13311 | EVT ResVT = N->getValueType(ResNo: 0); |
13312 | |
13313 | unsigned VecSize = VecVT.getSizeInBits(); |
13314 | unsigned VecEltSize = VecEltVT.getSizeInBits(); |
13315 | |
13316 | if ((Vec.getOpcode() == ISD::FNEG || |
13317 | Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) { |
13318 | SDLoc SL(N); |
13319 | SDValue Idx = N->getOperand(Num: 1); |
13320 | SDValue Elt = |
13321 | DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec.getOperand(i: 0), N2: Idx); |
13322 | return DAG.getNode(Opcode: Vec.getOpcode(), DL: SL, VT: ResVT, Operand: Elt); |
13323 | } |
13324 | |
13325 | // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx) |
13326 | // => |
13327 | // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx) |
13328 | // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx) |
13329 | // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt |
13330 | if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) { |
13331 | SDLoc SL(N); |
13332 | SDValue Idx = N->getOperand(Num: 1); |
13333 | unsigned Opc = Vec.getOpcode(); |
13334 | |
13335 | switch(Opc) { |
13336 | default: |
13337 | break; |
13338 | // TODO: Support other binary operations. |
13339 | case ISD::FADD: |
13340 | case ISD::FSUB: |
13341 | case ISD::FMUL: |
13342 | case ISD::ADD: |
13343 | case ISD::UMIN: |
13344 | case ISD::UMAX: |
13345 | case ISD::SMIN: |
13346 | case ISD::SMAX: |
13347 | case ISD::FMAXNUM: |
13348 | case ISD::FMINNUM: |
13349 | case ISD::FMAXNUM_IEEE: |
13350 | case ISD::FMINNUM_IEEE: |
13351 | case ISD::FMAXIMUM: |
13352 | case ISD::FMINIMUM: { |
13353 | SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, |
13354 | N1: Vec.getOperand(i: 0), N2: Idx); |
13355 | SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, |
13356 | N1: Vec.getOperand(i: 1), N2: Idx); |
13357 | |
13358 | DCI.AddToWorklist(N: Elt0.getNode()); |
13359 | DCI.AddToWorklist(N: Elt1.getNode()); |
13360 | return DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT, N1: Elt0, N2: Elt1, Flags: Vec->getFlags()); |
13361 | } |
13362 | } |
13363 | } |
13364 | |
13365 | // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx) |
13366 | if (shouldExpandVectorDynExt(N)) { |
13367 | SDLoc SL(N); |
13368 | SDValue Idx = N->getOperand(Num: 1); |
13369 | SDValue V; |
13370 | for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { |
13371 | SDValue IC = DAG.getVectorIdxConstant(Val: I, DL: SL); |
13372 | SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec, N2: IC); |
13373 | if (I == 0) |
13374 | V = Elt; |
13375 | else |
13376 | V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Elt, False: V, Cond: ISD::SETEQ); |
13377 | } |
13378 | return V; |
13379 | } |
13380 | |
13381 | if (!DCI.isBeforeLegalize()) |
13382 | return SDValue(); |
13383 | |
13384 | // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit |
13385 | // elements. This exposes more load reduction opportunities by replacing |
13386 | // multiple small extract_vector_elements with a single 32-bit extract. |
13387 | auto *Idx = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1)); |
13388 | if (isa<MemSDNode>(Val: Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() && |
13389 | VecSize > 32 && VecSize % 32 == 0 && Idx) { |
13390 | EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: VecVT); |
13391 | |
13392 | unsigned BitIndex = Idx->getZExtValue() * VecEltSize; |
13393 | unsigned EltIdx = BitIndex / 32; |
13394 | unsigned LeftoverBitIdx = BitIndex % 32; |
13395 | SDLoc SL(N); |
13396 | |
13397 | SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Vec); |
13398 | DCI.AddToWorklist(N: Cast.getNode()); |
13399 | |
13400 | SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast, |
13401 | DAG.getConstant(EltIdx, SL, MVT::i32)); |
13402 | DCI.AddToWorklist(N: Elt.getNode()); |
13403 | SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt, |
13404 | DAG.getConstant(LeftoverBitIdx, SL, MVT::i32)); |
13405 | DCI.AddToWorklist(N: Srl.getNode()); |
13406 | |
13407 | EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger(); |
13408 | SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: VecEltAsIntVT, Operand: Srl); |
13409 | DCI.AddToWorklist(N: Trunc.getNode()); |
13410 | |
13411 | if (VecEltVT == ResVT) { |
13412 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecEltVT, Operand: Trunc); |
13413 | } |
13414 | |
13415 | assert(ResVT.isScalarInteger()); |
13416 | return DAG.getAnyExtOrTrunc(Op: Trunc, DL: SL, VT: ResVT); |
13417 | } |
13418 | |
13419 | return SDValue(); |
13420 | } |
13421 | |
13422 | SDValue |
13423 | SITargetLowering::performInsertVectorEltCombine(SDNode *N, |
13424 | DAGCombinerInfo &DCI) const { |
13425 | SDValue Vec = N->getOperand(Num: 0); |
13426 | SDValue Idx = N->getOperand(Num: 2); |
13427 | EVT VecVT = Vec.getValueType(); |
13428 | EVT EltVT = VecVT.getVectorElementType(); |
13429 | |
13430 | // INSERT_VECTOR_ELT (<n x e>, var-idx) |
13431 | // => BUILD_VECTOR n x select (e, const-idx) |
13432 | if (!shouldExpandVectorDynExt(N)) |
13433 | return SDValue(); |
13434 | |
13435 | SelectionDAG &DAG = DCI.DAG; |
13436 | SDLoc SL(N); |
13437 | SDValue Ins = N->getOperand(Num: 1); |
13438 | EVT IdxVT = Idx.getValueType(); |
13439 | |
13440 | SmallVector<SDValue, 16> Ops; |
13441 | for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { |
13442 | SDValue IC = DAG.getConstant(Val: I, DL: SL, VT: IdxVT); |
13443 | SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec, N2: IC); |
13444 | SDValue V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Ins, False: Elt, Cond: ISD::SETEQ); |
13445 | Ops.push_back(Elt: V); |
13446 | } |
13447 | |
13448 | return DAG.getBuildVector(VT: VecVT, DL: SL, Ops); |
13449 | } |
13450 | |
13451 | /// Return the source of an fp_extend from f16 to f32, or a converted FP |
13452 | /// constant. |
13453 | static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) { |
13454 | if (Src.getOpcode() == ISD::FP_EXTEND && |
13455 | Src.getOperand(0).getValueType() == MVT::f16) { |
13456 | return Src.getOperand(i: 0); |
13457 | } |
13458 | |
13459 | if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Src)) { |
13460 | APFloat Val = CFP->getValueAPF(); |
13461 | bool LosesInfo = true; |
13462 | Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo); |
13463 | if (!LosesInfo) |
13464 | return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16); |
13465 | } |
13466 | |
13467 | return SDValue(); |
13468 | } |
13469 | |
13470 | SDValue SITargetLowering::performFPRoundCombine(SDNode *N, |
13471 | DAGCombinerInfo &DCI) const { |
13472 | assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() && |
13473 | "combine only useful on gfx8" ); |
13474 | |
13475 | SDValue TruncSrc = N->getOperand(Num: 0); |
13476 | EVT VT = N->getValueType(ResNo: 0); |
13477 | if (VT != MVT::f16) |
13478 | return SDValue(); |
13479 | |
13480 | if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 || |
13481 | TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse()) |
13482 | return SDValue(); |
13483 | |
13484 | SelectionDAG &DAG = DCI.DAG; |
13485 | SDLoc SL(N); |
13486 | |
13487 | // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3, |
13488 | // and expanding it with min/max saves 1 instruction vs. casting to f32 and |
13489 | // casting back. |
13490 | |
13491 | // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) => |
13492 | // fmin(fmax(a, b), fmax(fmin(a, b), c)) |
13493 | SDValue A = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 0)); |
13494 | if (!A) |
13495 | return SDValue(); |
13496 | |
13497 | SDValue B = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 1)); |
13498 | if (!B) |
13499 | return SDValue(); |
13500 | |
13501 | SDValue C = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 2)); |
13502 | if (!C) |
13503 | return SDValue(); |
13504 | |
13505 | // This changes signaling nan behavior. If an input is a signaling nan, it |
13506 | // would have been quieted by the fpext originally. We don't care because |
13507 | // these are unconstrained ops. If we needed to insert quieting canonicalizes |
13508 | // we would be worse off than just doing the promotion. |
13509 | SDValue A1 = DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: A, N2: B); |
13510 | SDValue B1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A, N2: B); |
13511 | SDValue C1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A1, N2: C); |
13512 | return DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: B1, N2: C1); |
13513 | } |
13514 | |
13515 | unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, |
13516 | const SDNode *N0, |
13517 | const SDNode *N1) const { |
13518 | EVT VT = N0->getValueType(ResNo: 0); |
13519 | |
13520 | // Only do this if we are not trying to support denormals. v_mad_f32 does not |
13521 | // support denormals ever. |
13522 | if (((VT == MVT::f32 && |
13523 | denormalModeIsFlushAllF32(DAG.getMachineFunction())) || |
13524 | (VT == MVT::f16 && Subtarget->hasMadF16() && |
13525 | denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) && |
13526 | isOperationLegal(ISD::FMAD, VT)) |
13527 | return ISD::FMAD; |
13528 | |
13529 | const TargetOptions &Options = DAG.getTarget().Options; |
13530 | if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || |
13531 | (N0->getFlags().hasAllowContract() && |
13532 | N1->getFlags().hasAllowContract())) && |
13533 | isFMAFasterThanFMulAndFAdd(MF: DAG.getMachineFunction(), VT)) { |
13534 | return ISD::FMA; |
13535 | } |
13536 | |
13537 | return 0; |
13538 | } |
13539 | |
13540 | // For a reassociatable opcode perform: |
13541 | // op x, (op y, z) -> op (op x, z), y, if x and z are uniform |
13542 | SDValue SITargetLowering::reassociateScalarOps(SDNode *N, |
13543 | SelectionDAG &DAG) const { |
13544 | EVT VT = N->getValueType(ResNo: 0); |
13545 | if (VT != MVT::i32 && VT != MVT::i64) |
13546 | return SDValue(); |
13547 | |
13548 | if (DAG.isBaseWithConstantOffset(Op: SDValue(N, 0))) |
13549 | return SDValue(); |
13550 | |
13551 | unsigned Opc = N->getOpcode(); |
13552 | SDValue Op0 = N->getOperand(Num: 0); |
13553 | SDValue Op1 = N->getOperand(Num: 1); |
13554 | |
13555 | if (!(Op0->isDivergent() ^ Op1->isDivergent())) |
13556 | return SDValue(); |
13557 | |
13558 | if (Op0->isDivergent()) |
13559 | std::swap(a&: Op0, b&: Op1); |
13560 | |
13561 | if (Op1.getOpcode() != Opc || !Op1.hasOneUse()) |
13562 | return SDValue(); |
13563 | |
13564 | SDValue Op2 = Op1.getOperand(i: 1); |
13565 | Op1 = Op1.getOperand(i: 0); |
13566 | if (!(Op1->isDivergent() ^ Op2->isDivergent())) |
13567 | return SDValue(); |
13568 | |
13569 | if (Op1->isDivergent()) |
13570 | std::swap(a&: Op1, b&: Op2); |
13571 | |
13572 | SDLoc SL(N); |
13573 | SDValue Add1 = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Op0, N2: Op1); |
13574 | return DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Add1, N2: Op2); |
13575 | } |
13576 | |
13577 | static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, |
13578 | EVT VT, |
13579 | SDValue N0, SDValue N1, SDValue N2, |
13580 | bool Signed) { |
13581 | unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32; |
13582 | SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1); |
13583 | SDValue Mad = DAG.getNode(Opcode: MadOpc, DL: SL, VTList: VTs, N1: N0, N2: N1, N3: N2); |
13584 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Mad); |
13585 | } |
13586 | |
// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
// multiplies, if any.
//
// Full 64-bit multiplies that feed into an addition are lowered here instead
// of using the generic expansion. The generic expansion ends up with
// a tree of ADD nodes that prevents us from using the "add" part of the
// MAD instruction. The expansion produced here results in a chain of ADDs
// instead of a tree.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::ADD);

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // Scalar adds only; the MAD instructions are scalar-per-lane.
  if (VT.isVector())
    return SDValue();

  // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
  // result in scalar registers for uniform values.
  if (!N->isDivergent() && Subtarget->hasSMulHi())
    return SDValue();

  // Only types wider than 32 bits and at most 64 bits benefit from MAD64_32.
  unsigned NumBits = VT.getScalarSizeInBits();
  if (NumBits <= 32 || NumBits > 64)
    return SDValue();

  // Canonicalize the MUL to the LHS; the caller guarantees one side is a MUL.
  if (LHS.getOpcode() != ISD::MUL) {
    assert(RHS.getOpcode() == ISD::MUL);
    std::swap(a&: LHS, b&: RHS);
  }

  // Avoid the fold if it would unduly increase the number of multiplies due to
  // multiple uses, except on hardware with full-rate multiply-add (which is
  // part of full-rate 64-bit ops).
  if (!Subtarget->hasFullRate64Ops()) {
    unsigned NumUsers = 0;
    for (SDNode *Use : LHS->uses()) {
      // There is a use that does not feed into addition, so the multiply can't
      // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
      if (Use->getOpcode() != ISD::ADD)
        return SDValue();

      // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
      // MUL + 3xADD + 3xADDC over 3xMAD.
      ++NumUsers;
      if (NumUsers >= 3)
        return SDValue();
    }
  }

  SDValue MulLHS = LHS.getOperand(i: 0);
  SDValue MulRHS = LHS.getOperand(i: 1);
  SDValue AddRHS = RHS;

  // Always check whether operands are small unsigned values, since that
  // knowledge is useful in more cases. Check for small signed values only if
  // doing so can unlock a shorter code sequence.
  bool MulLHSUnsigned32 = numBitsUnsigned(Op: MulLHS, DAG) <= 32;
  bool MulRHSUnsigned32 = numBitsUnsigned(Op: MulRHS, DAG) <= 32;

  bool MulSignedLo = false;
  if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
    MulSignedLo = numBitsSigned(Op: MulLHS, DAG) <= 32 &&
                  numBitsSigned(Op: MulRHS, DAG) <= 32;
  }

  // The operands and final result all have the same number of bits. If
  // operands need to be extended, they can be extended with garbage. The
  // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
  // truncated away in the end.
  if (VT != MVT::i64) {
    MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
    MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
    AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
  }

  // The basic code generated is conceptually straightforward. Pseudo code:
  //
  // accum = mad_64_32 lhs.lo, rhs.lo, accum
  // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
  // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
  //
  // The second and third lines are optional, depending on whether the factors
  // are {sign,zero}-extended or not.
  //
  // The actual DAG is noisier than the pseudo code, but only due to
  // instructions that disassemble values into low and high parts, and
  // assemble the final result.
  SDValue One = DAG.getConstant(1, SL, MVT::i32);

  auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
  auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
  SDValue Accum =
      getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);

  // When a factor is not known to fit in 32 bits (and the low halves are not
  // both signed-32), add the missing high-half partial products into the high
  // dword of the accumulator.
  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
    SDValue AccumLo, AccumHi;
    std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);

    if (!MulLHSUnsigned32) {
      auto MulLHSHi =
          DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
      SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
      AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
    }

    if (!MulRHSUnsigned32) {
      auto MulRHSHi =
          DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
      SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
      AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
    }

    // Reassemble the 64-bit accumulator from its two dwords.
    Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
    Accum = DAG.getBitcast(MVT::i64, Accum);
  }

  // Narrow back to the original type; the high garbage bits (if any) are
  // discarded here.
  if (VT != MVT::i64)
    Accum = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Accum);
  return Accum;
}
13712 | |
13713 | // Collect the ultimate src of each of the mul node's operands, and confirm |
13714 | // each operand is 8 bytes. |
13715 | static std::optional<ByteProvider<SDValue>> |
13716 | handleMulOperand(const SDValue &MulOperand) { |
13717 | auto Byte0 = calculateByteProvider(Op: MulOperand, Index: 0, Depth: 0); |
13718 | if (!Byte0 || Byte0->isConstantZero()) { |
13719 | return std::nullopt; |
13720 | } |
13721 | auto Byte1 = calculateByteProvider(Op: MulOperand, Index: 1, Depth: 0); |
13722 | if (Byte1 && !Byte1->isConstantZero()) { |
13723 | return std::nullopt; |
13724 | } |
13725 | return Byte0; |
13726 | } |
13727 | |
13728 | static unsigned addPermMasks(unsigned First, unsigned Second) { |
13729 | unsigned FirstCs = First & 0x0c0c0c0c; |
13730 | unsigned SecondCs = Second & 0x0c0c0c0c; |
13731 | unsigned FirstNoCs = First & ~0x0c0c0c0c; |
13732 | unsigned SecondNoCs = Second & ~0x0c0c0c0c; |
13733 | |
13734 | assert((FirstCs & 0xFF) | (SecondCs & 0xFF)); |
13735 | assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00)); |
13736 | assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000)); |
13737 | assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000)); |
13738 | |
13739 | return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs); |
13740 | } |
13741 | |
// One partial source of a dot-product operand: the SDValue providing the
// bytes, the v_perm selector mask used to place them, and the dword offset
// within SrcOp they come from.
struct DotSrc {
  SDValue SrcOp;      // value supplying the bytes
  int64_t PermMask;   // v_perm byte-selector mask accumulated so far
  int64_t DWordOffset; // which 32-bit chunk of SrcOp the bytes live in
};
13747 | |
13748 | static void placeSources(ByteProvider<SDValue> &Src0, |
13749 | ByteProvider<SDValue> &Src1, |
13750 | SmallVectorImpl<DotSrc> &Src0s, |
13751 | SmallVectorImpl<DotSrc> &Src1s, int Step) { |
13752 | |
13753 | assert(Src0.Src.has_value() && Src1.Src.has_value()); |
13754 | // Src0s and Src1s are empty, just place arbitrarily. |
13755 | if (Step == 0) { |
13756 | Src0s.push_back(Elt: {.SrcOp: *Src0.Src, .PermMask: ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c, |
13757 | .DWordOffset: Src0.SrcOffset / 4}); |
13758 | Src1s.push_back(Elt: {.SrcOp: *Src1.Src, .PermMask: ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c, |
13759 | .DWordOffset: Src1.SrcOffset / 4}); |
13760 | return; |
13761 | } |
13762 | |
13763 | for (int BPI = 0; BPI < 2; BPI++) { |
13764 | std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1}; |
13765 | if (BPI == 1) { |
13766 | BPP = {Src1, Src0}; |
13767 | } |
13768 | unsigned ZeroMask = 0x0c0c0c0c; |
13769 | unsigned FMask = 0xFF << (8 * (3 - Step)); |
13770 | |
13771 | unsigned FirstMask = |
13772 | (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask); |
13773 | unsigned SecondMask = |
13774 | (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask); |
13775 | // Attempt to find Src vector which contains our SDValue, if so, add our |
13776 | // perm mask to the existing one. If we are unable to find a match for the |
13777 | // first SDValue, attempt to find match for the second. |
13778 | int FirstGroup = -1; |
13779 | for (int I = 0; I < 2; I++) { |
13780 | SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s; |
13781 | auto MatchesFirst = [&BPP](DotSrc &IterElt) { |
13782 | return IterElt.SrcOp == *BPP.first.Src && |
13783 | (IterElt.DWordOffset == (BPP.first.SrcOffset / 4)); |
13784 | }; |
13785 | |
13786 | auto Match = llvm::find_if(Range&: Srcs, P: MatchesFirst); |
13787 | if (Match != Srcs.end()) { |
13788 | Match->PermMask = addPermMasks(First: FirstMask, Second: Match->PermMask); |
13789 | FirstGroup = I; |
13790 | break; |
13791 | } |
13792 | } |
13793 | if (FirstGroup != -1) { |
13794 | SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s; |
13795 | auto MatchesSecond = [&BPP](DotSrc &IterElt) { |
13796 | return IterElt.SrcOp == *BPP.second.Src && |
13797 | (IterElt.DWordOffset == (BPP.second.SrcOffset / 4)); |
13798 | }; |
13799 | auto Match = llvm::find_if(Range&: Srcs, P: MatchesSecond); |
13800 | if (Match != Srcs.end()) { |
13801 | Match->PermMask = addPermMasks(First: SecondMask, Second: Match->PermMask); |
13802 | } else |
13803 | Srcs.push_back(Elt: {.SrcOp: *BPP.second.Src, .PermMask: SecondMask, .DWordOffset: BPP.second.SrcOffset / 4}); |
13804 | return; |
13805 | } |
13806 | } |
13807 | |
13808 | // If we have made it here, then we could not find a match in Src0s or Src1s |
13809 | // for either Src0 or Src1, so just place them arbitrarily. |
13810 | |
13811 | unsigned ZeroMask = 0x0c0c0c0c; |
13812 | unsigned FMask = 0xFF << (8 * (3 - Step)); |
13813 | |
13814 | Src0s.push_back( |
13815 | Elt: {.SrcOp: *Src0.Src, |
13816 | .PermMask: ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)), |
13817 | .DWordOffset: Src1.SrcOffset / 4}); |
13818 | Src1s.push_back( |
13819 | Elt: {.SrcOp: *Src1.Src, |
13820 | .PermMask: ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)), |
13821 | .DWordOffset: Src1.SrcOffset / 4}); |
13822 | |
13823 | return; |
13824 | } |
13825 | |
// Materialize one dot-product operand from its collected DotSrc entries:
// permute/combine the source dwords with v_perm (and OR for the final pair)
// so each entry's bytes land in the lanes its PermMask selects.
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
                              SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
                              bool IsAny) {

  // If we just have one source, just permute it accordingly.
  if (Srcs.size() == 1) {
    auto Elt = Srcs.begin();
    auto EltOp = getDWordFromOffset(DAG, SL, Src: Elt->SrcOp, DWordOffset: Elt->DWordOffset);

    // v_perm will produce the original value
    if (Elt->PermMask == 0x3020100)
      return EltOp;

    return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
                       DAG.getConstant(Elt->PermMask, SL, MVT::i32));
  }

  auto FirstElt = Srcs.begin();
  auto SecondElt = std::next(x: FirstElt);

  SmallVector<SDValue, 2> Perms;

  // If we have multiple sources in the chain, combine them via perms (using
  // calculated perm mask) and Ors.
  while (true) {
    auto FirstMask = FirstElt->PermMask;
    auto SecondMask = SecondElt->PermMask;

    // Rewrite the first mask's data selectors (0..3) to pick from the second
    // perm operand (4..7) while leaving 0x0c zero selectors intact.
    unsigned FirstCs = FirstMask & 0x0c0c0c0c;
    unsigned FirstPlusFour = FirstMask | 0x04040404;
    // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any
    // original 0x0C.
    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;

    auto PermMask = addPermMasks(First: FirstMask, Second: SecondMask);
    auto FirstVal =
        getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
    auto SecondVal =
        getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp, DWordOffset: SecondElt->DWordOffset);

    // One v_perm merges this pair of sources into a single dword.
    Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
                                SecondVal,
                                DAG.getConstant(PermMask, SL, MVT::i32)));

    FirstElt = std::next(x: SecondElt);
    if (FirstElt == Srcs.end())
      break;

    SecondElt = std::next(x: FirstElt);
    // If we only have a FirstElt, then just combine that into the cumulative
    // source node.
    if (SecondElt == Srcs.end()) {
      auto EltOp =
          getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);

      Perms.push_back(
          DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
                      DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
      break;
    }
  }

  // Srcs holds at most four entries, pairwise-merged above, so at most two
  // partial results remain; OR them into the final operand.
  assert(Perms.size() == 1 || Perms.size() == 2);
  return Perms.size() == 2
             ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
             : Perms[0];
}
13893 | |
13894 | static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) { |
13895 | for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) { |
13896 | EntryMask = EntryMask >> ((4 - ChainLength) * 8); |
13897 | auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000; |
13898 | EntryMask += ZeroMask; |
13899 | } |
13900 | } |
13901 | |
13902 | static bool isMul(const SDValue Op) { |
13903 | auto Opcode = Op.getOpcode(); |
13904 | |
13905 | return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 || |
13906 | Opcode == AMDGPUISD::MUL_I24); |
13907 | } |
13908 | |
13909 | static std::optional<bool> |
13910 | checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0, |
13911 | ByteProvider<SDValue> &Src1, const SDValue &S0Op, |
13912 | const SDValue &S1Op, const SelectionDAG &DAG) { |
13913 | // If we both ops are i8s (pre legalize-dag), then the signedness semantics |
13914 | // of the dot4 is irrelevant. |
13915 | if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8) |
13916 | return false; |
13917 | |
13918 | auto Known0 = DAG.computeKnownBits(Op: S0Op, Depth: 0); |
13919 | bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0; |
13920 | bool S0IsSigned = Known0.countMinLeadingOnes() > 0; |
13921 | auto Known1 = DAG.computeKnownBits(Op: S1Op, Depth: 0); |
13922 | bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0; |
13923 | bool S1IsSigned = Known1.countMinLeadingOnes() > 0; |
13924 | |
13925 | assert(!(S0IsUnsigned && S0IsSigned)); |
13926 | assert(!(S1IsUnsigned && S1IsSigned)); |
13927 | |
13928 | // There are 9 possible permutations of |
13929 | // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned} |
13930 | |
13931 | // In two permutations, the sign bits are known to be the same for both Ops, |
13932 | // so simply return Signed / Unsigned corresponding to the MSB |
13933 | |
13934 | if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned)) |
13935 | return S0IsSigned; |
13936 | |
13937 | // In another two permutations, the sign bits are known to be opposite. In |
13938 | // this case return std::nullopt to indicate a bad match. |
13939 | |
13940 | if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned)) |
13941 | return std::nullopt; |
13942 | |
13943 | // In the remaining five permutations, we don't know the value of the sign |
13944 | // bit for at least one Op. Since we have a valid ByteProvider, we know that |
13945 | // the upper bits must be extension bits. Thus, the only ways for the sign |
13946 | // bit to be unknown is if it was sign extended from unknown value, or if it |
13947 | // was any extended. In either case, it is correct to use the signed |
13948 | // version of the signedness semantics of dot4 |
13949 | |
13950 | // In two of such permutations, we known the sign bit is set for |
13951 | // one op, and the other is unknown. It is okay to used signed version of |
13952 | // dot4. |
13953 | if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) || |
13954 | ((S1IsSigned && !(S0IsSigned || S0IsUnsigned)))) |
13955 | return true; |
13956 | |
13957 | // In one such permutation, we don't know either of the sign bits. It is okay |
13958 | // to used the signed version of dot4. |
13959 | if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned))) |
13960 | return true; |
13961 | |
13962 | // In two of such permutations, we known the sign bit is unset for |
13963 | // one op, and the other is unknown. Return std::nullopt to indicate a |
13964 | // bad match. |
13965 | if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) || |
13966 | ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned)))) |
13967 | return std::nullopt; |
13968 | |
13969 | llvm_unreachable("Fully covered condition" ); |
13970 | } |
13971 | |
/// Combine an ISD::ADD node.
///
/// In order, this tries:
///  * add (mul x, y), z -> mad64_32 folding when the subtarget has v_mad_i64/u64
///    (via tryFoldToMad64_32).
///  * Reassociation of scalar operands (reassociateScalarOps).
///  * Matching a chain of byte-wise multiply-accumulates into a single
///    amdgcn_sdot4 / amdgcn_udot4 intrinsic on subtargets with dot insts.
///  * add x, (zext/sext (setcc)) -> uaddo_carry / usubo_carry.
///  * add x, (uaddo_carry y, 0, cc) -> uaddo_carry x, y, cc.
SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // Prefer the 64-bit mad folding when one side is a multiply.
  if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
    if (Subtarget->hasMad64_32()) {
      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
        return Folded;
    }
  }

  if (SDValue V = reassociateScalarOps(N, DAG)) {
    return V;
  }

  if ((isMul(Op: LHS) || isMul(Op: RHS)) && Subtarget->hasDot7Insts() &&
      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
    SDValue TempNode(N, 0);
    std::optional<bool> IsSigned;
    SmallVector<DotSrc, 4> Src0s;
    SmallVector<DotSrc, 4> Src1s;
    SmallVector<SDValue, 4> Src2s;

    // Match the v_dot4 tree, while collecting src nodes.
    // Each iteration peels one (mul, add) link off the accumulation chain;
    // any failed check breaks out and we use whatever chain length we got.
    int ChainLength = 0;
    for (int I = 0; I < 4; I++) {
      auto MulIdx = isMul(Op: LHS) ? 0 : isMul(Op: RHS) ? 1 : -1;
      if (MulIdx == -1)
        break;
      auto Src0 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0));
      if (!Src0)
        break;
      auto Src1 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1));
      if (!Src1)
        break;

      // All links of the chain must agree on signedness, otherwise we
      // cannot pick a single sdot4/udot4 intrinsic.
      auto IterIsSigned = checkDot4MulSignedness(
          N: TempNode->getOperand(Num: MulIdx), Src0&: *Src0, Src1&: *Src1,
          S0Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0),
          S1Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1), DAG);
      if (!IterIsSigned)
        break;
      if (!IsSigned)
        IsSigned = *IterIsSigned;
      if (*IterIsSigned != *IsSigned)
        break;
      placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I);
      auto AddIdx = 1 - MulIdx;
      // Allow the special case where add (add (mul24, 0), mul24) became ->
      // add (mul24, mul24).
      if (I == 2 && isMul(Op: TempNode->getOperand(Num: AddIdx))) {
        Src2s.push_back(Elt: TempNode->getOperand(Num: AddIdx));
        auto Src0 =
            handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0));
        if (!Src0)
          break;
        auto Src1 =
            handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1));
        if (!Src1)
          break;
        auto IterIsSigned = checkDot4MulSignedness(
            N: TempNode->getOperand(Num: AddIdx), Src0&: *Src0, Src1&: *Src1,
            S0Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0),
            S1Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1), DAG);
        if (!IterIsSigned)
          break;
        assert(IsSigned);
        if (*IterIsSigned != *IsSigned)
          break;
        placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I + 1);
        // Use 0 as the accumulator for the reconstructed fourth link.
        Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
        ChainLength = I + 2;
        break;
      }

      TempNode = TempNode->getOperand(Num: AddIdx);
      Src2s.push_back(Elt: TempNode);
      ChainLength = I + 1;
      if (TempNode->getNumOperands() < 2)
        break;
      LHS = TempNode->getOperand(Num: 0);
      RHS = TempNode->getOperand(Num: 1);
    }

    // Bail out unless we matched at least two multiply-add links.
    if (ChainLength < 2)
      return SDValue();

    // Masks were constructed with assumption that we would find a chain of
    // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
    // 0x0c) so they do not affect dot calculation.
    if (ChainLength < 4) {
      fixMasks(Srcs&: Src0s, ChainLength);
      fixMasks(Srcs&: Src1s, ChainLength);
    }

    SDValue Src0, Src1;

    // If we are just using a single source for both, and have permuted the
    // bytes consistently, we can just use the sources without permuting
    // (commutation).
    bool UseOriginalSrc = false;
    if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
        Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
        Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
        Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
      // Check that the perm mask selects four distinct bytes; duplicated
      // byte selections still need an explicit permute.
      SmallVector<unsigned, 4> SrcBytes;
      auto Src0Mask = Src0s.begin()->PermMask;
      SrcBytes.push_back(Elt: Src0Mask & 0xFF000000);
      bool UniqueEntries = true;
      for (auto I = 1; I < 4; I++) {
        auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));

        if (is_contained(Range&: SrcBytes, Element: NextByte)) {
          UniqueEntries = false;
          break;
        }
        SrcBytes.push_back(Elt: NextByte);
      }

      if (UniqueEntries) {
        UseOriginalSrc = true;

        auto FirstElt = Src0s.begin();
        auto FirstEltOp =
            getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);

        auto SecondElt = Src1s.begin();
        auto SecondEltOp = getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp,
                                              DWordOffset: SecondElt->DWordOffset);

        Src0 = DAG.getBitcastedAnyExtOrTrunc(Op: FirstEltOp, DL: SL,
                                             VT: MVT::getIntegerVT(BitWidth: 32));
        Src1 = DAG.getBitcastedAnyExtOrTrunc(Op: SecondEltOp, DL: SL,
                                             VT: MVT::getIntegerVT(BitWidth: 32));
      }
    }

    // Otherwise, materialize the i32 sources with explicit byte permutes.
    if (!UseOriginalSrc) {
      Src0 = resolveSources(DAG, SL, Srcs&: Src0s, IsSigned: false, IsAny: true);
      Src1 = resolveSources(DAG, SL, Srcs&: Src1s, IsSigned: false, IsAny: true);
    }

    assert(IsSigned);
    // The accumulator is the tail of the matched chain.
    SDValue Src2 =
        DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);

    SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
                                                  : Intrinsic::amdgcn_udot4,
                                        SL, MVT::i64);

    assert(!VT.isVector());
    auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
                           Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));

    return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
  }

  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
    return SDValue();

  // add x, zext (setcc) => uaddo_carry x, 0, setcc
  // add x, sext (setcc) => usubo_carry x, 0, setcc
  // Canonicalize the extend/carry onto the RHS first.
  unsigned Opc = LHS.getOpcode();
  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
      Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
    std::swap(a&: RHS, b&: LHS);

  Opc = RHS.getOpcode();
  switch (Opc) {
  default: break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(i: 0);
    // If this won't be a real VOPC output, we would still need to insert an
    // extra instruction anyway.
    if (!isBoolSGPR(V: Cond))
      break;
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
    return DAG.getNode(Opc, SL, VTList, Args);
  }
  case ISD::UADDO_CARRY: {
    // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
    if (!isNullConstant(V: RHS.getOperand(i: 1)))
      break;
    SDValue Args[] = { LHS, RHS.getOperand(i: 0), RHS.getOperand(i: 2) };
    return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL: SDLoc(N), VTList: RHS->getVTList(), Ops: Args);
  }
  }
  return SDValue();
}
14169 | |
14170 | SDValue SITargetLowering::performSubCombine(SDNode *N, |
14171 | DAGCombinerInfo &DCI) const { |
14172 | SelectionDAG &DAG = DCI.DAG; |
14173 | EVT VT = N->getValueType(ResNo: 0); |
14174 | |
14175 | if (VT != MVT::i32) |
14176 | return SDValue(); |
14177 | |
14178 | SDLoc SL(N); |
14179 | SDValue LHS = N->getOperand(Num: 0); |
14180 | SDValue RHS = N->getOperand(Num: 1); |
14181 | |
14182 | // sub x, zext (setcc) => usubo_carry x, 0, setcc |
14183 | // sub x, sext (setcc) => uaddo_carry x, 0, setcc |
14184 | unsigned Opc = RHS.getOpcode(); |
14185 | switch (Opc) { |
14186 | default: break; |
14187 | case ISD::ZERO_EXTEND: |
14188 | case ISD::SIGN_EXTEND: |
14189 | case ISD::ANY_EXTEND: { |
14190 | auto Cond = RHS.getOperand(i: 0); |
14191 | // If this won't be a real VOPC output, we would still need to insert an |
14192 | // extra instruction anyway. |
14193 | if (!isBoolSGPR(V: Cond)) |
14194 | break; |
14195 | SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); |
14196 | SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; |
14197 | Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY; |
14198 | return DAG.getNode(Opc, SL, VTList, Args); |
14199 | } |
14200 | } |
14201 | |
14202 | if (LHS.getOpcode() == ISD::USUBO_CARRY) { |
14203 | // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc |
14204 | if (!isNullConstant(V: LHS.getOperand(i: 1))) |
14205 | return SDValue(); |
14206 | SDValue Args[] = { LHS.getOperand(i: 0), RHS, LHS.getOperand(i: 2) }; |
14207 | return DAG.getNode(Opcode: ISD::USUBO_CARRY, DL: SDLoc(N), VTList: LHS->getVTList(), Ops: Args); |
14208 | } |
14209 | return SDValue(); |
14210 | } |
14211 | |
14212 | SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N, |
14213 | DAGCombinerInfo &DCI) const { |
14214 | |
14215 | if (N->getValueType(0) != MVT::i32) |
14216 | return SDValue(); |
14217 | |
14218 | if (!isNullConstant(V: N->getOperand(Num: 1))) |
14219 | return SDValue(); |
14220 | |
14221 | SelectionDAG &DAG = DCI.DAG; |
14222 | SDValue LHS = N->getOperand(Num: 0); |
14223 | |
14224 | // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc |
14225 | // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc |
14226 | unsigned LHSOpc = LHS.getOpcode(); |
14227 | unsigned Opc = N->getOpcode(); |
14228 | if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) || |
14229 | (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) { |
14230 | SDValue Args[] = { LHS.getOperand(i: 0), LHS.getOperand(i: 1), N->getOperand(Num: 2) }; |
14231 | return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VTList: N->getVTList(), Ops: Args); |
14232 | } |
14233 | return SDValue(); |
14234 | } |
14235 | |
14236 | SDValue SITargetLowering::performFAddCombine(SDNode *N, |
14237 | DAGCombinerInfo &DCI) const { |
14238 | if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) |
14239 | return SDValue(); |
14240 | |
14241 | SelectionDAG &DAG = DCI.DAG; |
14242 | EVT VT = N->getValueType(ResNo: 0); |
14243 | |
14244 | SDLoc SL(N); |
14245 | SDValue LHS = N->getOperand(Num: 0); |
14246 | SDValue RHS = N->getOperand(Num: 1); |
14247 | |
14248 | // These should really be instruction patterns, but writing patterns with |
14249 | // source modifiers is a pain. |
14250 | |
14251 | // fadd (fadd (a, a), b) -> mad 2.0, a, b |
14252 | if (LHS.getOpcode() == ISD::FADD) { |
14253 | SDValue A = LHS.getOperand(i: 0); |
14254 | if (A == LHS.getOperand(i: 1)) { |
14255 | unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode()); |
14256 | if (FusedOp != 0) { |
14257 | const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT); |
14258 | return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: RHS); |
14259 | } |
14260 | } |
14261 | } |
14262 | |
14263 | // fadd (b, fadd (a, a)) -> mad 2.0, a, b |
14264 | if (RHS.getOpcode() == ISD::FADD) { |
14265 | SDValue A = RHS.getOperand(i: 0); |
14266 | if (A == RHS.getOperand(i: 1)) { |
14267 | unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode()); |
14268 | if (FusedOp != 0) { |
14269 | const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT); |
14270 | return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: LHS); |
14271 | } |
14272 | } |
14273 | } |
14274 | |
14275 | return SDValue(); |
14276 | } |
14277 | |
14278 | SDValue SITargetLowering::performFSubCombine(SDNode *N, |
14279 | DAGCombinerInfo &DCI) const { |
14280 | if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) |
14281 | return SDValue(); |
14282 | |
14283 | SelectionDAG &DAG = DCI.DAG; |
14284 | SDLoc SL(N); |
14285 | EVT VT = N->getValueType(ResNo: 0); |
14286 | assert(!VT.isVector()); |
14287 | |
14288 | // Try to get the fneg to fold into the source modifier. This undoes generic |
14289 | // DAG combines and folds them into the mad. |
14290 | // |
14291 | // Only do this if we are not trying to support denormals. v_mad_f32 does |
14292 | // not support denormals ever. |
14293 | SDValue LHS = N->getOperand(Num: 0); |
14294 | SDValue RHS = N->getOperand(Num: 1); |
14295 | if (LHS.getOpcode() == ISD::FADD) { |
14296 | // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) |
14297 | SDValue A = LHS.getOperand(i: 0); |
14298 | if (A == LHS.getOperand(i: 1)) { |
14299 | unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode()); |
14300 | if (FusedOp != 0){ |
14301 | const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT); |
14302 | SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS); |
14303 | |
14304 | return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: NegRHS); |
14305 | } |
14306 | } |
14307 | } |
14308 | |
14309 | if (RHS.getOpcode() == ISD::FADD) { |
14310 | // (fsub c, (fadd a, a)) -> mad -2.0, a, c |
14311 | |
14312 | SDValue A = RHS.getOperand(i: 0); |
14313 | if (A == RHS.getOperand(i: 1)) { |
14314 | unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode()); |
14315 | if (FusedOp != 0){ |
14316 | const SDValue NegTwo = DAG.getConstantFP(Val: -2.0, DL: SL, VT); |
14317 | return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: NegTwo, N3: LHS); |
14318 | } |
14319 | } |
14320 | } |
14321 | |
14322 | return SDValue(); |
14323 | } |
14324 | |
14325 | SDValue SITargetLowering::performFDivCombine(SDNode *N, |
14326 | DAGCombinerInfo &DCI) const { |
14327 | SelectionDAG &DAG = DCI.DAG; |
14328 | SDLoc SL(N); |
14329 | EVT VT = N->getValueType(ResNo: 0); |
14330 | if (VT != MVT::f16 || !Subtarget->has16BitInsts()) |
14331 | return SDValue(); |
14332 | |
14333 | SDValue LHS = N->getOperand(Num: 0); |
14334 | SDValue RHS = N->getOperand(Num: 1); |
14335 | |
14336 | SDNodeFlags Flags = N->getFlags(); |
14337 | SDNodeFlags RHSFlags = RHS->getFlags(); |
14338 | if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() || |
14339 | !RHS->hasOneUse()) |
14340 | return SDValue(); |
14341 | |
14342 | if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) { |
14343 | bool IsNegative = false; |
14344 | if (CLHS->isExactlyValue(V: 1.0) || |
14345 | (IsNegative = CLHS->isExactlyValue(V: -1.0))) { |
14346 | // fdiv contract 1.0, (sqrt contract x) -> rsq for f16 |
14347 | // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16 |
14348 | if (RHS.getOpcode() == ISD::FSQRT) { |
14349 | // TODO: Or in RHS flags, somehow missing from SDNodeFlags |
14350 | SDValue Rsq = |
14351 | DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SL, VT, Operand: RHS.getOperand(i: 0), Flags); |
14352 | return IsNegative ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Rsq, Flags) : Rsq; |
14353 | } |
14354 | } |
14355 | } |
14356 | |
14357 | return SDValue(); |
14358 | } |
14359 | |
/// Combine a pair of nested f32 FMAs over extracted/extended f16 vector
/// elements into a single FDOT2:
///   FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z))
///     -> FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
SDValue SITargetLowering::performFMACombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDLoc SL(N);

  // fdot2 requires dot7 instructions and only produces f32.
  if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
    return SDValue();

  // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
  // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
  SDValue Op1 = N->getOperand(Num: 0);
  SDValue Op2 = N->getOperand(Num: 1);
  SDValue FMA = N->getOperand(Num: 2);

  // The accumulator must itself be an FMA, and both multiplicands must be
  // fp_extend of f16 values.
  if (FMA.getOpcode() != ISD::FMA ||
      Op1.getOpcode() != ISD::FP_EXTEND ||
      Op2.getOpcode() != ISD::FP_EXTEND)
    return SDValue();

  // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
  // regardless of the denorm mode setting. Therefore,
  // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
  const TargetOptions &Options = DAG.getTarget().Options;
  if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
      (N->getFlags().hasAllowContract() &&
       FMA->getFlags().hasAllowContract())) {
    // Peel the fp_extends; the sources must be vector element extracts.
    Op1 = Op1.getOperand(i: 0);
    Op2 = Op2.getOperand(i: 0);
    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec1 = Op1.getOperand(i: 0);
    SDValue Idx1 = Op1.getOperand(i: 1);
    SDValue Vec2 = Op2.getOperand(i: 0);

    SDValue FMAOp1 = FMA.getOperand(i: 0);
    SDValue FMAOp2 = FMA.getOperand(i: 1);
    SDValue FMAAcc = FMA.getOperand(i: 2);

    // The inner FMA must have the same extend-of-extract shape.
    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
        FMAOp2.getOpcode() != ISD::FP_EXTEND)
      return SDValue();

    FMAOp1 = FMAOp1.getOperand(i: 0);
    FMAOp2 = FMAOp2.getOperand(i: 0);
    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec3 = FMAOp1.getOperand(i: 0);
    SDValue Vec4 = FMAOp2.getOperand(i: 0);
    SDValue Idx2 = FMAOp1.getOperand(i: 1);

    // Outer FMA must read lane Idx1 of both vectors, inner FMA lane Idx2 of
    // both, and the two lanes must differ (x and y of a 2-element vector).
    if (Idx1 != Op2.getOperand(i: 1) || Idx2 != FMAOp2.getOperand(i: 1) ||
        // Idx1 and Idx2 cannot be the same.
        Idx1 == Idx2)
      return SDValue();

    // The two dot operands must be distinct vectors.
    if (Vec1 == Vec2 || Vec3 == Vec4)
      return SDValue();

    if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
      return SDValue();

    // Both FMAs must draw from the same vector pair (in either order).
    if ((Vec1 == Vec3 && Vec2 == Vec4) ||
        (Vec1 == Vec4 && Vec2 == Vec3)) {
      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
                         DAG.getTargetConstant(0, SL, MVT::i1));
    }
  }
  return SDValue();
}
14434 | |
/// Combine an ISD::SETCC node.
///
/// Handles:
///  * setcc of a sign-extended bool-SGPR against -1/0 -> the bool (or its
///    negation as xor cc, -1).
///  * setcc eq/ne of (select cc, CT, CF) against CT or CF -> cc or its
///    negation.
///  * isinf/isfinite idioms on fabs -> AMDGPUISD::FP_CLASS.
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);
  EVT VT = LHS.getValueType();
  ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();

  // Canonicalize an integer constant onto the RHS, adjusting the condition.
  auto CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  if (!CRHS) {
    CRHS = dyn_cast<ConstantSDNode>(Val&: LHS);
    if (CRHS) {
      std::swap(a&: LHS, b&: RHS);
      CC = getSetCCSwappedOperands(Operation: CC);
    }
  }

  if (CRHS) {
    if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
        isBoolSGPR(LHS.getOperand(0))) {
      // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
      // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
      // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
      // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
      if ((CRHS->isAllOnes() &&
           (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
          (CRHS->isZero() &&
           (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                           DAG.getConstant(-1, SL, MVT::i1));
      if ((CRHS->isAllOnes() &&
           (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
          (CRHS->isZero() &&
           (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
        return LHS.getOperand(i: 0);
    }

    const APInt &CRHSVal = CRHS->getAPIntValue();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        LHS.getOpcode() == ISD::SELECT &&
        isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
        isa<ConstantSDNode>(Val: LHS.getOperand(i: 2)) &&
        LHS.getConstantOperandVal(i: 1) != LHS.getConstantOperandVal(i: 2) &&
        isBoolSGPR(V: LHS.getOperand(i: 0))) {
      // Given CT != FT:
      // setcc (select cc, CT, CF), CF, eq => xor cc, -1
      // setcc (select cc, CT, CF), CF, ne => cc
      // setcc (select cc, CT, CF), CT, ne => xor cc, -1
      // setcc (select cc, CT, CF), CT, eq => cc
      const APInt &CT = LHS.getConstantOperandAPInt(i: 1);
      const APInt &CF = LHS.getConstantOperandAPInt(i: 2);

      if ((CF == CRHSVal && CC == ISD::SETEQ) ||
          (CT == CRHSVal && CC == ISD::SETNE))
        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                           DAG.getConstant(-1, SL, MVT::i1));
      if ((CF == CRHSVal && CC == ISD::SETNE) ||
          (CT == CRHSVal && CC == ISD::SETEQ))
        return LHS.getOperand(i: 0);
    }
  }

  // The FP_CLASS folds below apply to f32, f64, and (with 16-bit insts) f16.
  if (VT != MVT::f32 && VT != MVT::f64 &&
      (!Subtarget->has16BitInsts() || VT != MVT::f16))
    return SDValue();

  // Match isinf/isfinite pattern
  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
  // (fcmp one (fabs x), inf) -> (fp_class x,
  // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
  if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
    if (!CRHS)
      return SDValue();

    // Must be comparing against +inf (fabs already folds away the sign).
    const APFloat &APF = CRHS->getValueAPF();
    if (APF.isInfinity() && !APF.isNegative()) {
      const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
                                 SIInstrFlags::N_INFINITY;
      const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
                                    SIInstrFlags::P_ZERO |
                                    SIInstrFlags::N_NORMAL |
                                    SIInstrFlags::P_NORMAL |
                                    SIInstrFlags::N_SUBNORMAL |
                                    SIInstrFlags::P_SUBNORMAL;
      unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
                         DAG.getConstant(Mask, SL, MVT::i32));
    }
  }

  return SDValue();
}
14530 | |
/// Combine a CVT_F32_UBYTE0..3 node: fold a shift of the source into the
/// byte index, and simplify the source based on the single demanded byte.
SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  // The UBYTE0..3 opcodes are consecutive; recover the byte index.
  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

  SDValue Src = N->getOperand(Num: 0);
  // Shift starts as the same value as Src and is peeled below.
  SDValue Shift = N->getOperand(Num: 0);

  // TODO: Extend type shouldn't matter (assuming legal types).
  if (Shift.getOpcode() == ISD::ZERO_EXTEND)
    Shift = Shift.getOperand(i: 0);

  if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
    // cvt_f32_ubyte1 (shl x,  8) -> cvt_f32_ubyte0 x
    // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
    // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    // cvt_f32_ubyte0 (srl x,  8) -> cvt_f32_ubyte1 x
    if (auto *C = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1))) {
      SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
                                           SDLoc(Shift.getOperand(0)), MVT::i32);

      // Translate the shift amount into an adjusted byte offset; shl moves
      // bytes up (so the source byte index decreases), srl moves them down.
      unsigned ShiftOffset = 8 * Offset;
      if (Shift.getOpcode() == ISD::SHL)
        ShiftOffset -= C->getZExtValue();
      else
        ShiftOffset += C->getZExtValue();

      // Only byte-aligned offsets within the dword can map to UBYTE0..3.
      if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
                           MVT::f32, Shifted);
      }
    }
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // Only one byte of the source is demanded.
  APInt DemandedBits = APInt::getBitsSet(numBits: 32, loBit: 8 * Offset, hiBit: 8 * Offset + 8);
  if (TLI.SimplifyDemandedBits(Op: Src, DemandedBits, DCI)) {
    // We simplified Src. If this node is not dead, visit it again so it is
    // folded properly.
    if (N->getOpcode() != ISD::DELETED_NODE)
      DCI.AddToWorklist(N);
    return SDValue(N, 0);
  }

  // Handle (or x, (srl y, 8)) pattern when known bits are zero.
  if (SDValue DemandedSrc =
          TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
    return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);

  return SDValue();
}
14584 | |
14585 | SDValue SITargetLowering::performClampCombine(SDNode *N, |
14586 | DAGCombinerInfo &DCI) const { |
14587 | ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0)); |
14588 | if (!CSrc) |
14589 | return SDValue(); |
14590 | |
14591 | const MachineFunction &MF = DCI.DAG.getMachineFunction(); |
14592 | const APFloat &F = CSrc->getValueAPF(); |
14593 | APFloat Zero = APFloat::getZero(Sem: F.getSemantics()); |
14594 | if (F < Zero || |
14595 | (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) { |
14596 | return DCI.DAG.getConstantFP(Val: Zero, DL: SDLoc(N), VT: N->getValueType(ResNo: 0)); |
14597 | } |
14598 | |
14599 | APFloat One(F.getSemantics(), "1.0" ); |
14600 | if (F > One) |
14601 | return DCI.DAG.getConstantFP(Val: One, DL: SDLoc(N), VT: N->getValueType(ResNo: 0)); |
14602 | |
14603 | return SDValue(CSrc, 0); |
14604 | } |
14605 | |
14606 | |
/// Top-level DAG-combine dispatcher for the SI target: routes each opcode
/// to its dedicated combine, then falls back to the AMDGPU-common combines.
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  // No combines at -O0.
  if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
    return SDValue();
  switch (N->getOpcode()) {
  case ISD::ADD:
    return performAddCombine(N, DCI);
  case ISD::SUB:
    return performSubCombine(N, DCI);
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    return performAddCarrySubCarryCombine(N, DCI);
  case ISD::FADD:
    return performFAddCombine(N, DCI);
  case ISD::FSUB:
    return performFSubCombine(N, DCI);
  case ISD::FDIV:
    return performFDivCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUM:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
    return performMinMaxCombine(N, DCI);
  case ISD::FMA:
    return performFMACombine(N, DCI);
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case ISD::FSHR: {
    // Divergent i32 funnel shifts may be matchable as v_perm_b32.
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
        TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
      return matchPERM(N, DCI);
    }
    break;
  }
  case ISD::XOR:
    return performXorCombine(N, DCI);
  case ISD::ZERO_EXTEND:
    return performZeroExtendCombine(N, DCI);
  case ISD::SIGN_EXTEND_INREG:
    return performSignExtendInRegCombine(N , DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  case AMDGPUISD::RCP:
    return performRcpCombine(N, DCI);
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::RSQ_CLAMP: {
    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
    SDValue Src = N->getOperand(Num: 0);
    if (Src.isUndef())
      return Src;
    break;
  }
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performUCharToFloatCombine(N, DCI);
  case ISD::FCOPYSIGN:
    return performFCopySignCombine(N, DCI);
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return performCvtF32UByteNCombine(N, DCI);
  case AMDGPUISD::FMED3:
    return performFMed3Combine(N, DCI);
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
    return performCvtPkRTZCombine(N, DCI);
  case AMDGPUISD::CLAMP:
    return performClampCombine(N, DCI);
  case ISD::SCALAR_TO_VECTOR: {
    SelectionDAG &DAG = DCI.DAG;
    EVT VT = N->getValueType(ResNo: 0);

    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
      SDLoc SL(N);
      SDValue Src = N->getOperand(Num: 0);
      EVT EltVT = Src.getValueType();
      if (EltVT != MVT::i16)
        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);

      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Ext);
    }

    break;
  }
  case ISD::EXTRACT_VECTOR_ELT:
    return performExtractVectorEltCombine(N, DCI);
  case ISD::INSERT_VECTOR_ELT:
    return performInsertVectorEltCombine(N, DCI);
  case ISD::FP_ROUND:
    return performFPRoundCombine(N, DCI);
  case ISD::LOAD: {
    if (SDValue Widened = widenLoad(Ld: cast<LoadSDNode>(Val: N), DCI))
      return Widened;
    [[fallthrough]];
  }
  default: {
    // Loads fall through here: memory nodes get the common mem combine
    // once types are legal.
    if (!DCI.isBeforeLegalize()) {
      if (MemSDNode *MemNode = dyn_cast<MemSDNode>(Val: N))
        return performMemSDNodeCombine(N: MemNode, DCI);
    }

    break;
  }
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
14735 | |
14736 | /// Helper function for adjustWritemask |
14737 | static unsigned SubIdx2Lane(unsigned Idx) { |
14738 | switch (Idx) { |
14739 | default: return ~0u; |
14740 | case AMDGPU::sub0: return 0; |
14741 | case AMDGPU::sub1: return 1; |
14742 | case AMDGPU::sub2: return 2; |
14743 | case AMDGPU::sub3: return 3; |
14744 | case AMDGPU::sub4: return 4; // Possible with TFE/LWE |
14745 | } |
14746 | } |
14747 | |
/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
///
/// After selection, only some components of the vector result may actually be
/// extracted (via EXTRACT_SUBREG). Shrink the dmask so the instruction only
/// writes the used components, switch to the equivalent opcode with the
/// smaller result size, and rewrite the users. Returns the node to keep
/// unchanged, or nullptr if all users were already rewritten here.
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                          SelectionDAG &DAG) const {
  unsigned Opcode = Node->getMachineOpcode();

  // Subtract 1 because the vdata output is not a MachineSDNode operand.
  int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
  if (D16Idx >= 0 && Node->getConstantOperandVal(Num: D16Idx))
    return Node; // not implemented for D16

  // Users[i] is the EXTRACT_SUBREG reading packed lane i (up to 4 data lanes
  // plus one TFE/LWE lane).
  SDNode *Users[5] = { nullptr };
  unsigned Lane = 0;
  unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
  unsigned OldDmask = Node->getConstantOperandVal(Num: DmaskIdx);
  unsigned NewDmask = 0;
  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
  bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(Num: TFEIdx)) ||
                  (int(LWEIdx) >= 0 && Node->getConstantOperandVal(Num: LWEIdx)))
                     ? true
                     : false;
  unsigned TFCLane = 0;
  bool HasChain = Node->getNumValues() > 1;

  if (OldDmask == 0) {
    // These are folded out, but on the chance it happens don't assert.
    return Node;
  }

  unsigned OldBitsSet = llvm::popcount(Value: OldDmask);
  // Work out which is the TFE/LWE lane if that is enabled.
  if (UsesTFC) {
    TFCLane = OldBitsSet;
  }

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Don't look at users of the chain.
    if (I.getUse().getResNo() != 0)
      continue;

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return Node;

    // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(Idx: I->getConstantOperandVal(Num: 1));
    if (Lane == ~0u)
      return Node;

    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
    if (UsesTFC && Lane == TFCLane) {
      Users[Lane] = *I;
    } else {
      // Set which texture component corresponds to the lane.
      unsigned Comp;
      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
        Comp = llvm::countr_zero(Val: Dmask);
        Dmask &= ~(1 << Comp);
      }

      // Abort if we have more than one user per component.
      if (Users[Lane])
        return Node;

      Users[Lane] = *I;
      NewDmask |= 1 << Comp;
    }
  }

  // Don't allow 0 dmask, as hardware assumes one channel enabled.
  bool NoChannels = !NewDmask;
  if (NoChannels) {
    if (!UsesTFC) {
      // No uses of the result and not using TFC. Then do nothing.
      return Node;
    }
    // If the original dmask has one channel - then nothing to do
    if (OldBitsSet == 1)
      return Node;
    // Use an arbitrary dmask - required for the instruction to work
    NewDmask = 1;
  }
  // Abort if there's no change
  if (NewDmask == OldDmask)
    return Node;

  unsigned BitsSet = llvm::popcount(Value: NewDmask);

  // Check for TFE or LWE - increase the number of channels by one to account
  // for the extra return value
  // This will need adjustment for D16 if this is also included in
  // adjustWriteMask (this function) but at present D16 are excluded.
  unsigned NewChannels = BitsSet + UsesTFC;

  int NewOpcode =
      AMDGPU::getMaskedMIMGOp(Opc: Node->getMachineOpcode(), NewChannels);
  assert(NewOpcode != -1 &&
         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
         "failed to find equivalent MIMG op" );

  // Adjust the writemask in the node: keep all operands except the dmask
  // immediate, which is replaced with the shrunk mask.
  SmallVector<SDValue, 12> Ops;
  Ops.insert(I: Ops.end(), From: Node->op_begin(), To: Node->op_begin() + DmaskIdx);
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  Ops.insert(I: Ops.end(), From: Node->op_begin() + DmaskIdx + 1, To: Node->op_end());

  MVT SVT = Node->getValueType(ResNo: 0).getVectorElementType().getSimpleVT();

  // Result vectors are rounded up to a power-of-two element count (3 -> 4,
  // 5 -> 8); a single channel becomes a scalar.
  MVT ResultVT = NewChannels == 1 ?
    SVT : MVT::getVectorVT(VT: SVT, NumElements: NewChannels == 3 ? 4 :
                         NewChannels == 5 ? 8 : NewChannels);
  SDVTList NewVTList = HasChain ?
    DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);


  MachineSDNode *NewNode = DAG.getMachineNode(Opcode: NewOpcode, dl: SDLoc(Node),
                                              VTs: NewVTList, Ops);

  if (HasChain) {
    // Update chain.
    DAG.setNodeMemRefs(N: NewNode, NewMemRefs: Node->memoperands());
    DAG.ReplaceAllUsesOfValueWith(From: SDValue(Node, 1), To: SDValue(NewNode, 1));
  }

  if (NewChannels == 1) {
    // Scalar result: replace the single EXTRACT_SUBREG user with a COPY of
    // the new node's value.
    assert(Node->hasNUsesOfValue(1, 0));
    SDNode *Copy = DAG.getMachineNode(Opcode: TargetOpcode::COPY,
                                      dl: SDLoc(Node), VT: Users[Lane]->getValueType(ResNo: 0),
                                      Op1: SDValue(NewNode, 0));
    DAG.ReplaceAllUsesWith(From: Users[Lane], To: Copy);
    return nullptr;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
    SDNode *User = Users[i];
    if (!User) {
      // Handle the special case of NoChannels. We set NewDmask to 1 above, but
      // Users[0] is still nullptr because channel 0 doesn't really have a use.
      if (i || !NoChannels)
        continue;
    } else {
      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
      SDNode *NewUser = DAG.UpdateNodeOperands(N: User, Op1: SDValue(NewNode, 0), Op2: Op);
      if (NewUser != User) {
        DAG.ReplaceAllUsesWith(From: SDValue(User, 0), To: SDValue(NewUser, 0));
        DAG.RemoveDeadNode(N: User);
      }
    }

    // Advance to the next packed sub-register index.
    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
    }
  }

  DAG.RemoveDeadNode(N: Node);
  return nullptr;
}
14917 | |
14918 | static bool isFrameIndexOp(SDValue Op) { |
14919 | if (Op.getOpcode() == ISD::AssertZext) |
14920 | Op = Op.getOperand(i: 0); |
14921 | |
14922 | return isa<FrameIndexSDNode>(Val: Op); |
14923 | } |
14924 | |
/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes inputs to these instructions are registers.
SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                        SelectionDAG &DAG) const {
  if (Node->getOpcode() == ISD::CopyToReg) {
    RegisterSDNode *DestReg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1));
    SDValue SrcVal = Node->getOperand(Num: 2);

    // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    // to try understanding copies to physical registers.
    if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
      SDLoc SL(Node);
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      SDValue VReg = DAG.getRegister(
          MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);

      // Preserve any incoming glue by chaining it through the first copy.
      SDNode *Glued = Node->getGluedNode();
      SDValue ToVReg
        = DAG.getCopyToReg(Chain: Node->getOperand(Num: 0), dl: SL, Reg: VReg, N: SrcVal,
                           Glue: SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
      SDValue ToResultReg
        = DAG.getCopyToReg(Chain: ToVReg, dl: SL, Reg: SDValue(DestReg, 0),
                           N: VReg, Glue: ToVReg.getValue(R: 1));
      DAG.ReplaceAllUsesWith(From: Node, To: ToResultReg.getNode());
      DAG.RemoveDeadNode(N: Node);
      return ToResultReg.getNode();
    }
  }

  // Replace each frame-index operand with an S_MOV_B32 of it so the operand
  // is a register, as the generic instructions expect.
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Op: Node->getOperand(Num: i))) {
      Ops.push_back(Elt: Node->getOperand(Num: i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                             Node->getOperand(i).getValueType(),
                                             Node->getOperand(i)), 0));
  }

  return DAG.UpdateNodeOperands(N: Node, Ops);
}
14970 | |
/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  unsigned Opcode = Node->getMachineOpcode();

  // Shrink the writemask of image loads (not stores and not gather4) that
  // carry a dmask operand.
  if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
      !TII->isGather4(Opcode) &&
      AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
    return adjustWritemask(Node, DAG);
  }

  // Target-independent nodes may still have frame-index operands that must
  // be moved into registers.
  if (Opcode == AMDGPU::INSERT_SUBREG ||
      Opcode == AMDGPU::REG_SEQUENCE) {
    legalizeTargetIndependentNode(Node, DAG);
    return Node;
  }

  switch (Opcode) {
  case AMDGPU::V_DIV_SCALE_F32_e64:
  case AMDGPU::V_DIV_SCALE_F64_e64: {
    // Satisfy the operand register constraint when one of the inputs is
    // undefined. Ordinarily each undef value will have its own implicit_def of
    // a vreg, so force these to use a single register.
    SDValue Src0 = Node->getOperand(Num: 1);
    SDValue Src1 = Node->getOperand(Num: 3);
    SDValue Src2 = Node->getOperand(Num: 5);

    // Nothing to do when src0 is a defined value that already matches src1
    // or src2.
    if ((Src0.isMachineOpcode() &&
         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
        (Src0 == Src1 || Src0 == Src2))
      break;

    MVT VT = Src0.getValueType().getSimpleVT();
    const TargetRegisterClass *RC =
        getRegClassFor(VT, isDivergent: Src0.getNode()->isDivergent());

    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    SDValue UndefReg = DAG.getRegister(Reg: MRI.createVirtualRegister(RegClass: RC), VT);

    SDValue ImpDef = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: SDLoc(Node),
                                      Reg: UndefReg, N: Src0, Glue: SDValue());

    // src0 must be the same register as src1 or src2, even if the value is
    // undefined, so make sure we don't violate this constraint.
    if (Src0.isMachineOpcode() &&
        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
      if (Src1.isMachineOpcode() &&
          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src1;
      else if (Src2.isMachineOpcode() &&
               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src2;
      else {
        // All inputs are undef: route src0 and src1 through the same
        // shared undef register.
        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
        Src0 = UndefReg;
        Src1 = UndefReg;
      }
    } else
      break;

    // Rebuild the node with the unified sources, appending the glue from the
    // copy into the shared undef register.
    SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
    Ops[1] = Src0;
    Ops[3] = Src1;
    Ops[5] = Src2;
    Ops.push_back(Elt: ImpDef.getValue(R: 1));
    return DAG.getMachineNode(Opcode, dl: SDLoc(Node), VTs: Node->getVTList(), Ops);
  }
  default:
    break;
  }

  return Node;
}
15046 | |
// Any MIMG instructions that use tfe or lwe require an initialization of the
// result register that will be written in the case of a memory access failure.
// The required code is also added to tie this init code to the result of the
// img instruction.
void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  MachineBasicBlock &MBB = *MI.getParent();

  int DstIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
  // Number of 32-bit result registers that need initialization.
  unsigned InitIdx = 0;

  if (TII->isImage(MI)) {
    MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
    MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
    MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);

    if (!TFE && !LWE) // intersect_ray
      return;

    unsigned TFEVal = TFE ? TFE->getImm() : 0;
    unsigned LWEVal = LWE ? LWE->getImm() : 0;
    unsigned D16Val = D16 ? D16->getImm() : 0;

    if (!TFEVal && !LWEVal)
      return;

    // At least one of TFE or LWE are non-zero
    // We have to insert a suitable initialization of the result value and
    // tie this to the dest of the image instruction.

    // Calculate which dword we have to initialize to 0.
    MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);

    // check that dmask operand is found.
    assert(MO_Dmask && "Expected dmask operand in instruction" );

    unsigned dmask = MO_Dmask->getImm();
    // Determine the number of active lanes taking into account the
    // Gather4 special case
    unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(Value: dmask);

    bool Packed = !Subtarget->hasUnpackedD16VMem();

    // Packed D16 halves the data dwords; +1 covers the TFE/LWE status dword.
    InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;

    // Abandon attempt if the dst size isn't large enough
    // - this is in fact an error but this is picked up elsewhere and
    // reported correctly.
    uint32_t DstSize =
        TRI.getRegSizeInBits(*TII->getOpRegClass(MI, OpNo: DstIdx)) / 32;
    if (DstSize < InitIdx)
      return;
  } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(Opc: MI.getOpcode())) {
    // MUBUF with TFE: initialize the whole destination tuple.
    InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, OpNo: DstIdx)) / 32;
  } else {
    return;
  }

  const DebugLoc &DL = MI.getDebugLoc();

  // Create a register for the initialization value.
  Register PrevDst = MRI.cloneVirtualRegister(VReg: MI.getOperand(i: DstIdx).getReg());
  unsigned NewDst = 0; // Final initialized value will be in here

  // If PRTStrictNull feature is enabled (the default) then initialize
  // all the result registers to 0, otherwise just the error indication
  // register (VGPRn+1)
  unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
  unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);

  // Build up the initialized value one dword at a time with INSERT_SUBREG.
  BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
  for (; SizeLeft; SizeLeft--, CurrIdx++) {
    NewDst = MRI.createVirtualRegister(RegClass: TII->getOpRegClass(MI, OpNo: DstIdx));
    // Initialize dword
    Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
        .addImm(0);
    // Insert into the super-reg
    BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
        .addReg(PrevDst)
        .addReg(SubReg)
        .addImm(SIRegisterInfo::getSubRegFromChannel(Channel: CurrIdx));

    PrevDst = NewDst;
  }

  // Add as an implicit operand
  MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewDst, isDef: false, isImp: true));

  // Tie the just added implicit operand to the dst
  MI.tieOperands(DefIdx: DstIdx, UseIdx: MI.getNumOperands() - 1);
}
15142 | |
/// Fix up an instruction immediately after instruction selection:
/// - legalize VOP3 operands for the constant-bus restriction,
/// - prefer VGPRs over AGPRs for AV operands where possible,
/// - enforce operand register-class alignment for image instructions.
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineFunction *MF = MI.getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  if (TII->isVOP3(Opcode: MI.getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);

    // Prefer VGPRs over AGPRs in mAI instructions where possible.
    // This saves a chain-copy of registers and better balance register
    // use between vgpr and agpr as agpr tuples tend to be big.
    if (!MI.getDesc().operands().empty()) {
      unsigned Opc = MI.getOpcode();
      bool HasAGPRs = Info->mayNeedAGPRs();
      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      for (auto I :
           {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
        if (I == -1)
          break;
        // Leave src2 alone when AGPRs may be needed; it is resolved below.
        if ((I == Src2Idx) && (HasAGPRs))
          break;
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !Op.getReg().isVirtual())
          continue;
        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
        if (!TRI->hasAGPRs(RC))
          continue;
        // Only rewrite operands that are simple copies from SGPRs.
        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
        if (!Src || !Src->isCopy() ||
            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
          continue;
        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
        // All uses of agpr64 and agpr32 can also accept vgpr except for
        // v_accvgpr_read, but we do not produce agpr reads during selection,
        // so no use checks are needed.
        MRI.setRegClass(Op.getReg(), NewRC);
      }

      if (!HasAGPRs)
        return;

      // Resolve the rest of AV operands to AGPRs.
      if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
        if (Src2->isReg() && Src2->getReg().isVirtual()) {
          auto *RC = TRI->getRegClassForReg(MRI, Reg: Src2->getReg());
          if (TRI->isVectorSuperClass(RC: RC)) {
            auto *NewRC = TRI->getEquivalentAGPRClass(SRC: RC);
            MRI.setRegClass(Reg: Src2->getReg(), RC: NewRC);
            // A tied src2 must keep the same class as the result.
            if (Src2->isTied())
              MRI.setRegClass(Reg: MI.getOperand(i: 0).getReg(), RC: NewRC);
          }
        }
      }
    }

    return;
  }

  if (TII->isImage(MI))
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
}
15212 | |
15213 | static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, |
15214 | uint64_t Val) { |
15215 | SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); |
15216 | return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); |
15217 | } |
15218 | |
15219 | MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, |
15220 | const SDLoc &DL, |
15221 | SDValue Ptr) const { |
15222 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
15223 | |
15224 | // Build the half of the subregister with the constants before building the |
15225 | // full 128-bit register. If we are building multiple resource descriptors, |
15226 | // this will allow CSEing of the 2-component register. |
15227 | const SDValue Ops0[] = { |
15228 | DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), |
15229 | buildSMovImm32(DAG, DL, 0), |
15230 | DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), |
15231 | buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), |
15232 | DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) |
15233 | }; |
15234 | |
15235 | SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, |
15236 | MVT::v2i32, Ops0), 0); |
15237 | |
15238 | // Combine the constants and the pointer. |
15239 | const SDValue Ops1[] = { |
15240 | DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), |
15241 | Ptr, |
15242 | DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), |
15243 | SubRegHi, |
15244 | DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) |
15245 | }; |
15246 | |
15247 | return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); |
15248 | } |
15249 | |
15250 | /// Return a resource descriptor with the 'Add TID' bit enabled |
15251 | /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] |
15252 | /// of the resource descriptor) to create an offset, which is added to |
15253 | /// the resource pointer. |
15254 | MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, |
15255 | SDValue Ptr, uint32_t RsrcDword1, |
15256 | uint64_t RsrcDword2And3) const { |
15257 | SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); |
15258 | SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); |
15259 | if (RsrcDword1) { |
15260 | PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, |
15261 | DAG.getConstant(RsrcDword1, DL, MVT::i32)), |
15262 | 0); |
15263 | } |
15264 | |
15265 | SDValue DataLo = buildSMovImm32(DAG, DL, |
15266 | Val: RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); |
15267 | SDValue DataHi = buildSMovImm32(DAG, DL, Val: RsrcDword2And3 >> 32); |
15268 | |
15269 | const SDValue Ops[] = { |
15270 | DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), |
15271 | PtrLo, |
15272 | DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), |
15273 | PtrHi, |
15274 | DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), |
15275 | DataLo, |
15276 | DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), |
15277 | DataHi, |
15278 | DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) |
15279 | }; |
15280 | |
15281 | return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); |
15282 | } |
15283 | |
15284 | //===----------------------------------------------------------------------===// |
15285 | // SI Inline Assembly Support |
15286 | //===----------------------------------------------------------------------===// |
15287 | |
/// Resolve an inline-asm register constraint to a (register, register class)
/// pair. Handles the AMDGPU single-letter classes ('s'/'r' scalar, 'v'
/// vector, 'a' accumulator) and explicit register names of the form "{vN}",
/// "{sN}", "{aN}" or ranges "{v[N:M]}".
std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
                                               StringRef Constraint,
                                               MVT VT) const {
  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);

  const TargetRegisterClass *RC = nullptr;
  if (Constraint.size() == 1) {
    const unsigned BitWidth = VT.getSizeInBits();
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's':
    case 'r':
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::SReg_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      default:
        RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'v':
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::VGPR_32RegClass;
        break;
      default:
        RC = TRI->getVGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'a':
      // AGPRs only exist on subtargets with MAI instructions.
      if (!Subtarget->hasMAIInsts())
        break;
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::AGPR_32RegClass;
        break;
      default:
        RC = TRI->getAGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    }
    // We actually support i128, i16 and f16 as inline parameters
    // even if they are not reported as legal
    if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
               VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
      return std::pair(0U, RC);
  }

  // Parse explicit register references of the form "{vN}", "{sN}", "{aN}"
  // or register ranges "{v[N:M]}".
  if (Constraint.starts_with(Prefix: "{" ) && Constraint.ends_with(Suffix: "}" )) {
    StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
    if (RegName.consume_front(Prefix: "v" )) {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (RegName.consume_front(Prefix: "s" )) {
      RC = &AMDGPU::SGPR_32RegClass;
    } else if (RegName.consume_front(Prefix: "a" )) {
      RC = &AMDGPU::AGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      if (RegName.consume_front(Prefix: "[" )) {
        // Range form: "[Idx:End]" selects a register tuple of
        // (End - Idx + 1) 32-bit registers.
        uint32_t End;
        bool Failed = RegName.consumeInteger(Radix: 10, Result&: Idx);
        Failed |= !RegName.consume_front(Prefix: ":" );
        Failed |= RegName.consumeInteger(Radix: 10, Result&: End);
        Failed |= !RegName.consume_back(Suffix: "]" );
        if (!Failed) {
          uint32_t Width = (End - Idx + 1) * 32;
          // Look up the first 32-bit register, then widen RC and find the
          // super-register covering the whole range.
          MCRegister Reg = RC->getRegister(i: Idx);
          if (SIRegisterInfo::isVGPRClass(RC))
            RC = TRI->getVGPRClassForBitWidth(BitWidth: Width);
          else if (SIRegisterInfo::isSGPRClass(RC))
            RC = TRI->getSGPRClassForBitWidth(BitWidth: Width);
          else if (SIRegisterInfo::isAGPRClass(RC))
            RC = TRI->getAGPRClassForBitWidth(BitWidth: Width);
          if (RC) {
            Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
            return std::pair(Reg, RC);
          }
        }
      } else {
        bool Failed = RegName.getAsInteger(Radix: 10, Result&: Idx);
        if (!Failed && Idx < RC->getNumRegs())
          return std::pair(RC->getRegister(i: Idx), RC);
      }
    }
  }

  // Fall back to the generic handling; refine the class from the physical
  // register if one was found.
  auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  if (Ret.first)
    Ret.second = TRI->getPhysRegBaseClass(Ret.first);

  return Ret;
}
15396 | |
15397 | static bool isImmConstraint(StringRef Constraint) { |
15398 | if (Constraint.size() == 1) { |
15399 | switch (Constraint[0]) { |
15400 | default: break; |
15401 | case 'I': |
15402 | case 'J': |
15403 | case 'A': |
15404 | case 'B': |
15405 | case 'C': |
15406 | return true; |
15407 | } |
15408 | } else if (Constraint == "DA" || |
15409 | Constraint == "DB" ) { |
15410 | return true; |
15411 | } |
15412 | return false; |
15413 | } |
15414 | |
15415 | SITargetLowering::ConstraintType |
15416 | SITargetLowering::getConstraintType(StringRef Constraint) const { |
15417 | if (Constraint.size() == 1) { |
15418 | switch (Constraint[0]) { |
15419 | default: break; |
15420 | case 's': |
15421 | case 'v': |
15422 | case 'a': |
15423 | return C_RegisterClass; |
15424 | } |
15425 | } |
15426 | if (isImmConstraint(Constraint)) { |
15427 | return C_Other; |
15428 | } |
15429 | return TargetLowering::getConstraintType(Constraint); |
15430 | } |
15431 | |
15432 | static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) { |
15433 | if (!AMDGPU::isInlinableIntLiteral(Literal: Val)) { |
15434 | Val = Val & maskTrailingOnes<uint64_t>(N: Size); |
15435 | } |
15436 | return Val; |
15437 | } |
15438 | |
15439 | void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op, |
15440 | StringRef Constraint, |
15441 | std::vector<SDValue> &Ops, |
15442 | SelectionDAG &DAG) const { |
15443 | if (isImmConstraint(Constraint)) { |
15444 | uint64_t Val; |
15445 | if (getAsmOperandConstVal(Op, Val) && |
15446 | checkAsmConstraintVal(Op, Constraint, Val)) { |
15447 | Val = clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits()); |
15448 | Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64)); |
15449 | } |
15450 | } else { |
15451 | TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); |
15452 | } |
15453 | } |
15454 | |
15455 | bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const { |
15456 | unsigned Size = Op.getScalarValueSizeInBits(); |
15457 | if (Size > 64) |
15458 | return false; |
15459 | |
15460 | if (Size == 16 && !Subtarget->has16BitInsts()) |
15461 | return false; |
15462 | |
15463 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op)) { |
15464 | Val = C->getSExtValue(); |
15465 | return true; |
15466 | } |
15467 | if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op)) { |
15468 | Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); |
15469 | return true; |
15470 | } |
15471 | if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Val&: Op)) { |
15472 | if (Size != 16 || Op.getNumOperands() != 2) |
15473 | return false; |
15474 | if (Op.getOperand(i: 0).isUndef() || Op.getOperand(i: 1).isUndef()) |
15475 | return false; |
15476 | if (ConstantSDNode *C = V->getConstantSplatNode()) { |
15477 | Val = C->getSExtValue(); |
15478 | return true; |
15479 | } |
15480 | if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) { |
15481 | Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); |
15482 | return true; |
15483 | } |
15484 | } |
15485 | |
15486 | return false; |
15487 | } |
15488 | |
15489 | bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint, |
15490 | uint64_t Val) const { |
15491 | if (Constraint.size() == 1) { |
15492 | switch (Constraint[0]) { |
15493 | case 'I': |
15494 | return AMDGPU::isInlinableIntLiteral(Literal: Val); |
15495 | case 'J': |
15496 | return isInt<16>(x: Val); |
15497 | case 'A': |
15498 | return checkAsmConstraintValA(Op, Val); |
15499 | case 'B': |
15500 | return isInt<32>(x: Val); |
15501 | case 'C': |
15502 | return isUInt<32>(x: clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits())) || |
15503 | AMDGPU::isInlinableIntLiteral(Literal: Val); |
15504 | default: |
15505 | break; |
15506 | } |
15507 | } else if (Constraint.size() == 2) { |
15508 | if (Constraint == "DA" ) { |
15509 | int64_t HiBits = static_cast<int32_t>(Val >> 32); |
15510 | int64_t LoBits = static_cast<int32_t>(Val); |
15511 | return checkAsmConstraintValA(Op, Val: HiBits, MaxSize: 32) && |
15512 | checkAsmConstraintValA(Op, Val: LoBits, MaxSize: 32); |
15513 | } |
15514 | if (Constraint == "DB" ) { |
15515 | return true; |
15516 | } |
15517 | } |
15518 | llvm_unreachable("Invalid asm constraint" ); |
15519 | } |
15520 | |
15521 | bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val, |
15522 | unsigned MaxSize) const { |
15523 | unsigned Size = std::min<unsigned>(a: Op.getScalarValueSizeInBits(), b: MaxSize); |
15524 | bool HasInv2Pi = Subtarget->hasInv2PiInlineImm(); |
15525 | if (Size == 16) { |
15526 | MVT VT = Op.getSimpleValueType(); |
15527 | switch (VT.SimpleTy) { |
15528 | default: |
15529 | return false; |
15530 | case MVT::i16: |
15531 | return AMDGPU::isInlinableLiteralI16(Literal: Val, HasInv2Pi); |
15532 | case MVT::f16: |
15533 | return AMDGPU::isInlinableLiteralFP16(Literal: Val, HasInv2Pi); |
15534 | case MVT::bf16: |
15535 | return AMDGPU::isInlinableLiteralBF16(Literal: Val, HasInv2Pi); |
15536 | case MVT::v2i16: |
15537 | return AMDGPU::getInlineEncodingV2I16(Literal: Val).has_value(); |
15538 | case MVT::v2f16: |
15539 | return AMDGPU::getInlineEncodingV2F16(Literal: Val).has_value(); |
15540 | case MVT::v2bf16: |
15541 | return AMDGPU::getInlineEncodingV2BF16(Literal: Val).has_value(); |
15542 | } |
15543 | } |
15544 | if ((Size == 32 && AMDGPU::isInlinableLiteral32(Literal: Val, HasInv2Pi)) || |
15545 | (Size == 64 && AMDGPU::isInlinableLiteral64(Literal: Val, HasInv2Pi))) |
15546 | return true; |
15547 | return false; |
15548 | } |
15549 | |
15550 | static int getAlignedAGPRClassID(unsigned UnalignedClassID) { |
15551 | switch (UnalignedClassID) { |
15552 | case AMDGPU::VReg_64RegClassID: |
15553 | return AMDGPU::VReg_64_Align2RegClassID; |
15554 | case AMDGPU::VReg_96RegClassID: |
15555 | return AMDGPU::VReg_96_Align2RegClassID; |
15556 | case AMDGPU::VReg_128RegClassID: |
15557 | return AMDGPU::VReg_128_Align2RegClassID; |
15558 | case AMDGPU::VReg_160RegClassID: |
15559 | return AMDGPU::VReg_160_Align2RegClassID; |
15560 | case AMDGPU::VReg_192RegClassID: |
15561 | return AMDGPU::VReg_192_Align2RegClassID; |
15562 | case AMDGPU::VReg_224RegClassID: |
15563 | return AMDGPU::VReg_224_Align2RegClassID; |
15564 | case AMDGPU::VReg_256RegClassID: |
15565 | return AMDGPU::VReg_256_Align2RegClassID; |
15566 | case AMDGPU::VReg_288RegClassID: |
15567 | return AMDGPU::VReg_288_Align2RegClassID; |
15568 | case AMDGPU::VReg_320RegClassID: |
15569 | return AMDGPU::VReg_320_Align2RegClassID; |
15570 | case AMDGPU::VReg_352RegClassID: |
15571 | return AMDGPU::VReg_352_Align2RegClassID; |
15572 | case AMDGPU::VReg_384RegClassID: |
15573 | return AMDGPU::VReg_384_Align2RegClassID; |
15574 | case AMDGPU::VReg_512RegClassID: |
15575 | return AMDGPU::VReg_512_Align2RegClassID; |
15576 | case AMDGPU::VReg_1024RegClassID: |
15577 | return AMDGPU::VReg_1024_Align2RegClassID; |
15578 | case AMDGPU::AReg_64RegClassID: |
15579 | return AMDGPU::AReg_64_Align2RegClassID; |
15580 | case AMDGPU::AReg_96RegClassID: |
15581 | return AMDGPU::AReg_96_Align2RegClassID; |
15582 | case AMDGPU::AReg_128RegClassID: |
15583 | return AMDGPU::AReg_128_Align2RegClassID; |
15584 | case AMDGPU::AReg_160RegClassID: |
15585 | return AMDGPU::AReg_160_Align2RegClassID; |
15586 | case AMDGPU::AReg_192RegClassID: |
15587 | return AMDGPU::AReg_192_Align2RegClassID; |
15588 | case AMDGPU::AReg_256RegClassID: |
15589 | return AMDGPU::AReg_256_Align2RegClassID; |
15590 | case AMDGPU::AReg_512RegClassID: |
15591 | return AMDGPU::AReg_512_Align2RegClassID; |
15592 | case AMDGPU::AReg_1024RegClassID: |
15593 | return AMDGPU::AReg_1024_Align2RegClassID; |
15594 | default: |
15595 | return -1; |
15596 | } |
15597 | } |
15598 | |
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
    reservePrivateMemoryRegs(TM: getTargetMachine(), MF, TRI: *TRI, Info&: *Info);
  }

  // TODO: Move this logic to getReservedRegs()
  // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
  // Wave32 uses the top single SGPR; wave64 needs an aligned high SGPR pair.
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  Register SReg = ST.isWave32()
                      ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
                      : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
                                                     &AMDGPU::SGPR_64RegClass);
  Info->setSGPRForEXECCopy(SReg);

  // Replace the virtual placeholder registers with the concrete registers
  // chosen for this function.
  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                             Info->getStackPtrOffsetReg()));
  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());

  // We need to worry about replacing the default register with itself in case
  // of MIR testcases missing the MFI.
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());

  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());

  Info->limitOccupancy(MF);

  // On wave32, patch up implicit operands on every instruction (e.g. EXEC vs
  // EXEC_LO uses added by isel).
  if (ST.isWave32() && !MF.empty()) {
    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);
      }
    }
  }

  // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
  // classes if required. Ideally the register class constraints would differ
  // per-subtarget, but there's no easy way to achieve that right now. This is
  // not a problem for VGPRs because the correctly aligned VGPR class is implied
  // from using them as the register class for legal types.
  if (ST.needsAlignedVGPRs()) {
    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
      const Register Reg = Register::index2VirtReg(Index: I);
      const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
      if (!RC)
        continue;
      int NewClassID = getAlignedAGPRClassID(UnalignedClassID: RC->getID());
      if (NewClassID != -1)
        MRI.setRegClass(Reg, RC: TRI->getRegClass(RCID: NewClassID));
    }
  }

  TargetLoweringBase::finalizeLowering(MF);
}
15665 | |
15666 | void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op, |
15667 | KnownBits &Known, |
15668 | const APInt &DemandedElts, |
15669 | const SelectionDAG &DAG, |
15670 | unsigned Depth) const { |
15671 | Known.resetAll(); |
15672 | unsigned Opc = Op.getOpcode(); |
15673 | switch (Opc) { |
15674 | case ISD::INTRINSIC_WO_CHAIN: { |
15675 | unsigned IID = Op.getConstantOperandVal(i: 0); |
15676 | switch (IID) { |
15677 | case Intrinsic::amdgcn_mbcnt_lo: |
15678 | case Intrinsic::amdgcn_mbcnt_hi: { |
15679 | const GCNSubtarget &ST = |
15680 | DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); |
15681 | // These return at most the (wavefront size - 1) + src1 |
15682 | // As long as src1 is an immediate we can calc known bits |
15683 | KnownBits Src1Known = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1); |
15684 | unsigned Src1ValBits = Src1Known.countMaxActiveBits(); |
15685 | unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2()); |
15686 | // Cater for potential carry |
15687 | MaxActiveBits += Src1ValBits ? 1 : 0; |
15688 | unsigned Size = Op.getValueType().getSizeInBits(); |
15689 | if (MaxActiveBits < Size) |
15690 | Known.Zero.setHighBits(Size - MaxActiveBits); |
15691 | return; |
15692 | } |
15693 | } |
15694 | break; |
15695 | } |
15696 | } |
15697 | return AMDGPUTargetLowering::computeKnownBitsForTargetNode( |
15698 | Op, Known, DemandedElts, DAG, Depth); |
15699 | } |
15700 | |
// Refine known bits for a frame index: in addition to the generic result,
// bound the value by the per-wave scratch limit.
void SITargetLowering::computeKnownBitsForFrameIndex(
    const int FI, KnownBits &Known, const MachineFunction &MF) const {
  TargetLowering::computeKnownBitsForFrameIndex(FIOp: FI, Known, MF);

  // Set the high bits to zero based on the maximum allowed scratch size per
  // wave. We can't use vaddr in MUBUF instructions if we don't know the address
  // calculation won't overflow, so assume the sign bit is never set.
  Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
15710 | |
// Mark as known-zero the high bits of a workitem-id value, derived from the
// maximum workitem ID for dimension \p Dim in the current function.
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
                                   KnownBits &Known, unsigned Dim) {
  unsigned MaxValue =
      ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
  Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
}
15717 | |
// GlobalISel counterpart of computeKnownBitsForTargetNode: derive known
// zero/one bits for AMDGPU-specific generic instructions and intrinsics.
void SITargetLowering::computeKnownBitsForTargetInstr(
    GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
    const MachineRegisterInfo &MRI, unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(Reg: R);
  switch (MI->getOpcode()) {
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_workitem_id_x:
      knownBitsForWorkitemID(ST: *getSubtarget(), KB, Known, Dim: 0);
      break;
    case Intrinsic::amdgcn_workitem_id_y:
      knownBitsForWorkitemID(ST: *getSubtarget(), KB, Known, Dim: 1);
      break;
    case Intrinsic::amdgcn_workitem_id_z:
      knownBitsForWorkitemID(ST: *getSubtarget(), KB, Known, Dim: 2);
      break;
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      // These return at most the wavefront size - 1.
      unsigned Size = MRI.getType(Reg: R).getSizeInBits();
      Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
      break;
    }
    case Intrinsic::amdgcn_groupstaticsize: {
      // We can report everything over the maximum size as 0. We can't report
      // based on the actual size because we don't know if it's accurate or not
      // at any given point.
      Known.Zero.setHighBits(
          llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    // Unsigned byte load: the top 24 bits of the result are known zero.
    Known.Zero.setHighBits(24);
    break;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    // Unsigned short load: the top 16 bits of the result are known zero.
    Known.Zero.setHighBits(16);
    break;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();

    KnownBits Known2;
    KB.computeKnownBitsImpl(R: Src2, Known&: Known2, DemandedElts, Depth: Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1;
    KB.computeKnownBitsImpl(R: Src1, Known&: Known1, DemandedElts, Depth: Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0;
    KB.computeKnownBitsImpl(R: Src0, Known&: Known0, DemandedElts, Depth: Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    // med3 selects one of its three operands, so only bits known identically
    // in all three sources are known in the result.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  }
}
15785 | |
15786 | Align SITargetLowering::computeKnownAlignForTargetInstr( |
15787 | GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI, |
15788 | unsigned Depth) const { |
15789 | const MachineInstr *MI = MRI.getVRegDef(Reg: R); |
15790 | if (auto *GI = dyn_cast<GIntrinsic>(Val: MI)) { |
15791 | // FIXME: Can this move to generic code? What about the case where the call |
15792 | // site specifies a lower alignment? |
15793 | Intrinsic::ID IID = GI->getIntrinsicID(); |
15794 | LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext(); |
15795 | AttributeList Attrs = Intrinsic::getAttributes(C&: Ctx, id: IID); |
15796 | if (MaybeAlign RetAlign = Attrs.getRetAlignment()) |
15797 | return *RetAlign; |
15798 | } |
15799 | return Align(1); |
15800 | } |
15801 | |
15802 | Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { |
15803 | const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML); |
15804 | const Align CacheLineAlign = Align(64); |
15805 | |
15806 | // Pre-GFX10 target did not benefit from loop alignment |
15807 | if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() || |
15808 | getSubtarget()->hasInstFwdPrefetchBug()) |
15809 | return PrefAlign; |
15810 | |
15811 | // On GFX10 I$ is 4 x 64 bytes cache lines. |
15812 | // By default prefetcher keeps one cache line behind and reads two ahead. |
15813 | // We can modify it with S_INST_PREFETCH for larger loops to have two lines |
15814 | // behind and one ahead. |
15815 | // Therefor we can benefit from aligning loop headers if loop fits 192 bytes. |
15816 | // If loop fits 64 bytes it always spans no more than two cache lines and |
15817 | // does not need an alignment. |
15818 | // Else if loop is less or equal 128 bytes we do not need to modify prefetch, |
15819 | // Else if loop is less or equal 192 bytes we need two lines behind. |
15820 | |
15821 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
15822 | const MachineBasicBlock * = ML->getHeader(); |
15823 | if (Header->getAlignment() != PrefAlign) |
15824 | return Header->getAlignment(); // Already processed. |
15825 | |
15826 | unsigned LoopSize = 0; |
15827 | for (const MachineBasicBlock *MBB : ML->blocks()) { |
15828 | // If inner loop block is aligned assume in average half of the alignment |
15829 | // size to be added as nops. |
15830 | if (MBB != Header) |
15831 | LoopSize += MBB->getAlignment().value() / 2; |
15832 | |
15833 | for (const MachineInstr &MI : *MBB) { |
15834 | LoopSize += TII->getInstSizeInBytes(MI); |
15835 | if (LoopSize > 192) |
15836 | return PrefAlign; |
15837 | } |
15838 | } |
15839 | |
15840 | if (LoopSize <= 64) |
15841 | return PrefAlign; |
15842 | |
15843 | if (LoopSize <= 128) |
15844 | return CacheLineAlign; |
15845 | |
15846 | // If any of parent loops is surrounded by prefetch instructions do not |
15847 | // insert new for inner loop, which would reset parent's settings. |
15848 | for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) { |
15849 | if (MachineBasicBlock *Exit = P->getExitBlock()) { |
15850 | auto I = Exit->getFirstNonDebugInstr(); |
15851 | if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH) |
15852 | return CacheLineAlign; |
15853 | } |
15854 | } |
15855 | |
15856 | MachineBasicBlock *Pre = ML->getLoopPreheader(); |
15857 | MachineBasicBlock *Exit = ML->getExitBlock(); |
15858 | |
15859 | if (Pre && Exit) { |
15860 | auto PreTerm = Pre->getFirstTerminator(); |
15861 | if (PreTerm == Pre->begin() || |
15862 | std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH) |
15863 | BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH)) |
15864 | .addImm(1); // prefetch 2 lines behind PC |
15865 | |
15866 | auto ExitHead = Exit->getFirstNonDebugInstr(); |
15867 | if (ExitHead == Exit->end() || |
15868 | ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH) |
15869 | BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH)) |
15870 | .addImm(2); // prefetch 1 line behind PC |
15871 | } |
15872 | |
15873 | return CacheLineAlign; |
15874 | } |
15875 | |
15876 | LLVM_ATTRIBUTE_UNUSED |
15877 | static bool isCopyFromRegOfInlineAsm(const SDNode *N) { |
15878 | assert(N->getOpcode() == ISD::CopyFromReg); |
15879 | do { |
15880 | // Follow the chain until we find an INLINEASM node. |
15881 | N = N->getOperand(Num: 0).getNode(); |
15882 | if (N->getOpcode() == ISD::INLINEASM || |
15883 | N->getOpcode() == ISD::INLINEASM_BR) |
15884 | return true; |
15885 | } while (N->getOpcode() == ISD::CopyFromReg); |
15886 | return false; |
15887 | } |
15888 | |
// Report whether \p N produces a divergent (per-lane varying) value, used by
// the DAG divergence tracking.
bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
                                                  FunctionLoweringInfo *FLI,
                                                  UniformityInfo *UA) const {
  switch (N->getOpcode()) {
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = cast<RegisterSDNode>(Val: N->getOperand(Num: 1));
    const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    Register Reg = R->getReg();

    // Copies from non-SGPR physical or live-in registers are divergent.
    // FIXME: Why does this need to consider isLiveIn?
    if (Reg.isPhysical() || MRI.isLiveIn(Reg))
      return !TRI->isSGPRReg(MRI, Reg);

    // For virtual registers tied to an IR value, defer to uniformity analysis.
    if (const Value *V = FLI->getValueFromVirtualReg(Vreg: R->getReg()))
      return UA->isDivergent(V);

    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
    return !TRI->isSGPRReg(MRI, Reg);
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(Val: N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  }
  case ISD::CALLSEQ_END:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 0));
  case ISD::INTRINSIC_W_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 1));
  case AMDGPUISD::ATOMIC_CMP_SWAP:
  case AMDGPUISD::ATOMIC_LOAD_FMIN:
  case AMDGPUISD::ATOMIC_LOAD_FMAX:
  case AMDGPUISD::BUFFER_ATOMIC_SWAP:
  case AMDGPUISD::BUFFER_ATOMIC_ADD:
  case AMDGPUISD::BUFFER_ATOMIC_SUB:
  case AMDGPUISD::BUFFER_ATOMIC_SMIN:
  case AMDGPUISD::BUFFER_ATOMIC_UMIN:
  case AMDGPUISD::BUFFER_ATOMIC_SMAX:
  case AMDGPUISD::BUFFER_ATOMIC_UMAX:
  case AMDGPUISD::BUFFER_ATOMIC_AND:
  case AMDGPUISD::BUFFER_ATOMIC_OR:
  case AMDGPUISD::BUFFER_ATOMIC_XOR:
  case AMDGPUISD::BUFFER_ATOMIC_INC:
  case AMDGPUISD::BUFFER_ATOMIC_DEC:
  case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
  case AMDGPUISD::BUFFER_ATOMIC_CSUB:
  case AMDGPUISD::BUFFER_ATOMIC_FADD:
  case AMDGPUISD::BUFFER_ATOMIC_FADD_BF16:
  case AMDGPUISD::BUFFER_ATOMIC_FMIN:
  case AMDGPUISD::BUFFER_ATOMIC_FMAX:
    // Target-specific read-modify-write atomics are sources of divergence.
    return true;
  default:
    if (auto *A = dyn_cast<AtomicSDNode>(Val: N)) {
      // Generic read-modify-write atomics are sources of divergence.
      return A->readMem() && A->writeMem();
    }
    return false;
  }
}
15952 | |
15953 | bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG, |
15954 | EVT VT) const { |
15955 | switch (VT.getScalarType().getSimpleVT().SimpleTy) { |
15956 | case MVT::f32: |
15957 | return !denormalModeIsFlushAllF32(MF: DAG.getMachineFunction()); |
15958 | case MVT::f64: |
15959 | case MVT::f16: |
15960 | return !denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction()); |
15961 | default: |
15962 | return false; |
15963 | } |
15964 | } |
15965 | |
15966 | bool SITargetLowering::denormalsEnabledForType( |
15967 | LLT Ty, const MachineFunction &MF) const { |
15968 | switch (Ty.getScalarSizeInBits()) { |
15969 | case 32: |
15970 | return !denormalModeIsFlushAllF32(MF); |
15971 | case 64: |
15972 | case 16: |
15973 | return !denormalModeIsFlushAllF64F16(MF); |
15974 | default: |
15975 | return false; |
15976 | } |
15977 | } |
15978 | |
15979 | bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, |
15980 | const SelectionDAG &DAG, |
15981 | bool SNaN, |
15982 | unsigned Depth) const { |
15983 | if (Op.getOpcode() == AMDGPUISD::CLAMP) { |
15984 | const MachineFunction &MF = DAG.getMachineFunction(); |
15985 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
15986 | |
15987 | if (Info->getMode().DX10Clamp) |
15988 | return true; // Clamped to 0. |
15989 | return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1); |
15990 | } |
15991 | |
15992 | return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, |
15993 | SNaN, Depth); |
15994 | } |
15995 | |
15996 | #if 0 |
15997 | // FIXME: This should be checked before unsafe fp atomics are enabled |
15998 | // Global FP atomic instructions have a hardcoded FP mode and do not support |
15999 | // FP32 denormals, and only support v2f16 denormals. |
16000 | static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) { |
16001 | const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics(); |
16002 | auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt); |
16003 | if (&Flt == &APFloat::IEEEsingle()) |
16004 | return DenormMode == DenormalMode::getPreserveSign(); |
16005 | return DenormMode == DenormalMode::getIEEE(); |
16006 | } |
16007 | #endif |
16008 | |
16009 | // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe |
16010 | // floating point atomic instructions. May generate more efficient code, |
16011 | // but may not respect rounding and denormal modes, and may give incorrect |
16012 | // results for certain memory destinations. |
16013 | bool unsafeFPAtomicsDisabled(Function *F) { |
16014 | return F->getFnAttribute(Kind: "amdgpu-unsafe-fp-atomics" ).getValueAsString() != |
16015 | "true" ; |
16016 | } |
16017 | |
16018 | static OptimizationRemark (const AtomicRMWInst *RMW) { |
16019 | LLVMContext &Ctx = RMW->getContext(); |
16020 | SmallVector<StringRef> SSNs; |
16021 | Ctx.getSyncScopeNames(SSNs); |
16022 | StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty() |
16023 | ? "system" |
16024 | : SSNs[RMW->getSyncScopeID()]; |
16025 | |
16026 | return OptimizationRemark(DEBUG_TYPE, "Passed" , RMW) |
16027 | << "Hardware instruction generated for atomic " |
16028 | << RMW->getOperationName(Op: RMW->getOperation()) |
16029 | << " operation at memory scope " << MemScope; |
16030 | } |
16031 | |
// Decide how an atomicrmw instruction should be lowered: kept as-is (None),
// expanded to a cmpxchg loop, custom-expanded, or demoted to a plain
// non-atomic access, based on address space, operation, type, sync scope and
// subtarget features.
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  unsigned AS = RMW->getPointerAddressSpace();
  // Private (scratch) accesses are lowered as non-atomic.
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return AtomicExpansionKind::NotAtomic;

  // Wrap a result with an optimization remark noting that an unsafe hardware
  // instruction was selected.
  auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
    OptimizationRemarkEmitter ORE(RMW->getFunction());
    ORE.emit(RemarkBuilder: [=]() {
      return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request." ;
    });
    return Kind;
  };

  auto SSID = RMW->getSyncScopeID();
  bool HasSystemScope =
      SSID == SyncScope::System ||
      SSID == RMW->getContext().getOrInsertSyncScopeID(SSN: "one-as" );

  switch (RMW->getOperation()) {
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor: {
    // Atomic sub/or/xor do not work over PCI express, but atomic add
    // does. InstCombine transforms these with 0 to or, so undo that.
    if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
      if (Constant *ConstVal = dyn_cast<Constant>(Val: RMW->getValOperand());
          ConstVal && ConstVal->isNullValue())
        return AtomicExpansionKind::Expand;
    }

    break;
  }
  case AtomicRMWInst::FAdd: {
    Type *Ty = RMW->getType();

    // TODO: Handle REGION_ADDRESS
    if (AS == AMDGPUAS::LOCAL_ADDRESS) {
      // DS F32 FP atomics do respect the denormal mode, but the rounding mode
      // is fixed to round-to-nearest-even.
      //
      // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
      // round-to-nearest-even.
      //
      // We ignore the rounding mode problem, even in strictfp. The C++ standard
      // suggests it is OK if the floating-point mode may not match the calling
      // thread.
      if (Ty->isFloatTy()) {
        return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
                                                 : AtomicExpansionKind::CmpXChg;
      }

      if (Ty->isDoubleTy()) {
        // Ignores denormal mode, but we don't consider flushing mandatory.
        return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
                                                 : AtomicExpansionKind::CmpXChg;
      }

      // TODO: Handle v2f16/v2bf16 cases for gfx940
      return AtomicExpansionKind::CmpXChg;
    }

    // Only flat/global and buffer fat pointers have native fadd candidates.
    if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
        AS != AMDGPUAS::BUFFER_FAT_POINTER)
      return AtomicExpansionKind::CmpXChg;

    // TODO: gfx940 supports v2f16 and v2bf16
    if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
      return AtomicExpansionKind::None;

    // Without the opt-in attribute, never select the unsafe hardware fadd.
    if (unsafeFPAtomicsDisabled(F: RMW->getFunction()))
      return AtomicExpansionKind::CmpXChg;

    // Always expand system scope fp atomics.
    if (HasSystemScope)
      return AtomicExpansionKind::CmpXChg;

    // global and flat atomic fadd f64: gfx90a, gfx940.
    if (Subtarget->hasGFX90AInsts() && Ty->isDoubleTy())
      return ReportUnsafeHWInst(AtomicExpansionKind::None);

    if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
      // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
      if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
        return ReportUnsafeHWInst(AtomicExpansionKind::None);
      // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
      if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
        return ReportUnsafeHWInst(AtomicExpansionKind::None);
    }

    // flat atomic fadd f32: gfx940, gfx11+.
    if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
      if (Subtarget->hasFlatAtomicFaddF32Inst())
        return ReportUnsafeHWInst(AtomicExpansionKind::None);

      // If it is in flat address space, and the type is float, we will try to
      // expand it, if the target supports global and lds atomic fadd. The
      // reason we need that is, in the expansion, we emit the check of address
      // space. If it is in global address space, we emit the global atomic
      // fadd; if it is in shared address space, we emit the LDS atomic fadd.
      if (Subtarget->hasLDSFPAtomicAddF32()) {
        if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
          return AtomicExpansionKind::Expand;
        if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
          return AtomicExpansionKind::Expand;
      }
    }

    return AtomicExpansionKind::CmpXChg;
  }
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::UMax: {
    if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
        AS == AMDGPUAS::BUFFER_FAT_POINTER) {
      if (RMW->getType()->isFloatTy() &&
          unsafeFPAtomicsDisabled(F: RMW->getFunction()))
        return AtomicExpansionKind::CmpXChg;

      // Always expand system scope min/max atomics.
      if (HasSystemScope)
        return AtomicExpansionKind::CmpXChg;
    }
    break;
  }
  default:
    break;
  }

  return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
}
16166 | |
16167 | TargetLowering::AtomicExpansionKind |
16168 | SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { |
16169 | return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS |
16170 | ? AtomicExpansionKind::NotAtomic |
16171 | : AtomicExpansionKind::None; |
16172 | } |
16173 | |
16174 | TargetLowering::AtomicExpansionKind |
16175 | SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { |
16176 | return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS |
16177 | ? AtomicExpansionKind::NotAtomic |
16178 | : AtomicExpansionKind::None; |
16179 | } |
16180 | |
16181 | TargetLowering::AtomicExpansionKind |
16182 | SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { |
16183 | return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS |
16184 | ? AtomicExpansionKind::NotAtomic |
16185 | : AtomicExpansionKind::None; |
16186 | } |
16187 | |
16188 | const TargetRegisterClass * |
16189 | SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { |
16190 | const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, isDivergent: false); |
16191 | const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); |
16192 | if (RC == &AMDGPU::VReg_1RegClass && !isDivergent) |
16193 | return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass |
16194 | : &AMDGPU::SReg_32RegClass; |
16195 | if (!TRI->isSGPRClass(RC) && !isDivergent) |
16196 | return TRI->getEquivalentSGPRClass(VRC: RC); |
16197 | else if (TRI->isSGPRClass(RC) && isDivergent) |
16198 | return TRI->getEquivalentVGPRClass(SRC: RC); |
16199 | |
16200 | return RC; |
16201 | } |
16202 | |
// FIXME: This is a workaround for DivergenceAnalysis not understanding always
// uniform values (as produced by the mask results of control flow intrinsics)
// used outside of divergent blocks. The phi users need to also be treated as
// always uniform.
//
// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
//
// Returns true if \p V (transitively) feeds a control-flow intrinsic operand
// that requires a wave-mask (i.e. always-uniform) value. \p Visited guards
// against cycles; \p WaveSize filters to iN values of the wave width.
static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
                      unsigned WaveSize) {
  // FIXME: We assume we never cast the mask results of a control flow
  // intrinsic.
  // Early exit if the type won't be consistent as a compile time hack.
  IntegerType *IT = dyn_cast<IntegerType>(Val: V->getType());
  if (!IT || IT->getBitWidth() != WaveSize)
    return false;

  if (!isa<Instruction>(Val: V))
    return false;
  // Already seen on this walk: avoid infinite recursion through cycles.
  if (!Visited.insert(Ptr: V).second)
    return false;
  bool Result = false;
  for (const auto *U : V->users()) {
    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: U)) {
      // Check the operand position: the mask operand is operand 1 for
      // if/else/if_break and operand 0 for end_cf/loop.
      if (V == U->getOperand(i: 1)) {
        switch (Intrinsic->getIntrinsicID()) {
        default:
          Result = false;
          break;
        case Intrinsic::amdgcn_if_break:
        case Intrinsic::amdgcn_if:
        case Intrinsic::amdgcn_else:
          Result = true;
          break;
        }
      }
      if (V == U->getOperand(i: 0)) {
        switch (Intrinsic->getIntrinsicID()) {
        default:
          Result = false;
          break;
        case Intrinsic::amdgcn_end_cf:
        case Intrinsic::amdgcn_loop:
          Result = true;
          break;
        }
      }
    } else {
      // Non-intrinsic user: recurse to see if the value reaches a control
      // flow intrinsic further down the use chain.
      Result = hasCFUser(V: U, Visited, WaveSize);
    }
    if (Result)
      break;
  }
  return Result;
}
16256 | |
16257 | bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, |
16258 | const Value *V) const { |
16259 | if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) { |
16260 | if (CI->isInlineAsm()) { |
16261 | // FIXME: This cannot give a correct answer. This should only trigger in |
16262 | // the case where inline asm returns mixed SGPR and VGPR results, used |
16263 | // outside the defining block. We don't have a specific result to |
16264 | // consider, so this assumes if any value is SGPR, the overall register |
16265 | // also needs to be SGPR. |
16266 | const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo(); |
16267 | TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints( |
16268 | MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI); |
16269 | for (auto &TC : TargetConstraints) { |
16270 | if (TC.Type == InlineAsm::isOutput) { |
16271 | ComputeConstraintToUse(TC, SDValue()); |
16272 | const TargetRegisterClass *RC = getRegForInlineAsmConstraint( |
16273 | SIRI, TC.ConstraintCode, TC.ConstraintVT).second; |
16274 | if (RC && SIRI->isSGPRClass(RC)) |
16275 | return true; |
16276 | } |
16277 | } |
16278 | } |
16279 | } |
16280 | SmallPtrSet<const Value *, 16> Visited; |
16281 | return hasCFUser(V, Visited, Subtarget->getWavefrontSize()); |
16282 | } |
16283 | |
16284 | bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const { |
16285 | SDNode::use_iterator I = N->use_begin(), E = N->use_end(); |
16286 | for (; I != E; ++I) { |
16287 | if (MemSDNode *M = dyn_cast<MemSDNode>(Val: *I)) { |
16288 | if (getBasePtrIndex(N: M) == I.getOperandNo()) |
16289 | return true; |
16290 | } |
16291 | } |
16292 | return false; |
16293 | } |
16294 | |
16295 | bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, |
16296 | SDValue N1) const { |
16297 | if (!N0.hasOneUse()) |
16298 | return false; |
16299 | // Take care of the opportunity to keep N0 uniform |
16300 | if (N0->isDivergent() || !N1->isDivergent()) |
16301 | return true; |
16302 | // Check if we have a good chance to form the memory access pattern with the |
16303 | // base and offset |
16304 | return (DAG.isBaseWithConstantOffset(Op: N0) && |
16305 | hasMemSDNodeUser(N: *N0->use_begin())); |
16306 | } |
16307 | |
16308 | bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, |
16309 | Register N0, Register N1) const { |
16310 | return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks |
16311 | } |
16312 | |
16313 | MachineMemOperand::Flags |
16314 | SITargetLowering::getTargetMMOFlags(const Instruction &I) const { |
16315 | // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load. |
16316 | MachineMemOperand::Flags Flags = MachineMemOperand::MONone; |
16317 | if (I.getMetadata(Kind: "amdgpu.noclobber" )) |
16318 | Flags |= MONoClobber; |
16319 | if (I.getMetadata(Kind: "amdgpu.last.use" )) |
16320 | Flags |= MOLastUse; |
16321 | return Flags; |
16322 | } |
16323 | |
16324 | bool SITargetLowering::checkForPhysRegDependency( |
16325 | SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, |
16326 | const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const { |
16327 | if (User->getOpcode() != ISD::CopyToReg) |
16328 | return false; |
16329 | if (!Def->isMachineOpcode()) |
16330 | return false; |
16331 | MachineSDNode *MDef = dyn_cast<MachineSDNode>(Val: Def); |
16332 | if (!MDef) |
16333 | return false; |
16334 | |
16335 | unsigned ResNo = User->getOperand(Num: Op).getResNo(); |
16336 | if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1) |
16337 | return false; |
16338 | const MCInstrDesc &II = TII->get(Opcode: MDef->getMachineOpcode()); |
16339 | if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) { |
16340 | PhysReg = AMDGPU::SCC; |
16341 | const TargetRegisterClass *RC = |
16342 | TRI->getMinimalPhysRegClass(Reg: PhysReg, VT: Def->getSimpleValueType(ResNo)); |
16343 | Cost = RC->getCopyCost(); |
16344 | return true; |
16345 | } |
16346 | return false; |
16347 | } |
16348 | |
// Expand an atomicrmw the generic legalizer cannot handle directly. Two
// cases are covered: (1) sub/or/xor with a constant-zero operand is rewritten
// in place to an equivalent add, and (2) an FP32 fadd on a flat pointer is
// expanded into a run-time address-space diamond dispatching to LDS, private
// (plain load/fadd/store), or global variants.
void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  AtomicRMWInst::BinOp Op = AI->getOperation();

  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
      Op == AtomicRMWInst::Xor) {
    // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
    // (sub 0, or 0, and xor 0 are all no-op updates, same as add 0.)
    assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
           "this cannot be replaced with add" );
    AI->setOperation(AtomicRMWInst::Add);
    return;
  }

  // From here on the only supported case is FP32 fadd on a flat pointer on a
  // subtarget with atomic fadd instructions.
  assert(Subtarget->hasAtomicFaddInsts() &&
         "target should have atomic fadd instructions" );
  assert(AI->getType()->isFloatTy() &&
         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
         "generic atomicrmw expansion only supports FP32 operand in flat "
         "address space" );
  assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now" );

  // Given: atomicrmw fadd ptr %addr, float %val ordering
  //
  // With this expansion we produce the following code:
  //   [...]
  //   br label %atomicrmw.check.shared
  //
  // atomicrmw.check.shared:
  //   %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
  //   br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
  //
  // atomicrmw.shared:
  //   %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
  //   %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
  //                                   float %val ordering
  //   br label %atomicrmw.phi
  //
  // atomicrmw.check.private:
  //   %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
  //   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
  //
  // atomicrmw.private:
  //   %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  //   %loaded.private = load float, ptr addrspace(5) %cast.private
  //   %val.new = fadd float %loaded.private, %val
  //   store float %val.new, ptr addrspace(5) %cast.private
  //   br label %atomicrmw.phi
  //
  // atomicrmw.global:
  //   %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
  //   %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
  //                                   float %val ordering
  //   br label %atomicrmw.phi
  //
  // atomicrmw.phi:
  //   %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
  //                           [ %loaded.private, %atomicrmw.private ],
  //                           [ %loaded.global, %atomicrmw.global ]
  //   br label %atomicrmw.end
  //
  // atomicrmw.end:
  //    [...]

  IRBuilder<> Builder(AI);
  LLVMContext &Ctx = Builder.getContext();

  // Split the current block at AI; everything after AI moves to
  // atomicrmw.end. The dispatch blocks below are all inserted before it, in
  // the order they appear in the diagram above.
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();
  BasicBlock *ExitBB =
      BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end" );
  BasicBlock *CheckSharedBB =
      BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.shared" , Parent: F, InsertBefore: ExitBB);
  BasicBlock *SharedBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.shared" , Parent: F, InsertBefore: ExitBB);
  BasicBlock *CheckPrivateBB =
      BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.private" , Parent: F, InsertBefore: ExitBB);
  BasicBlock *PrivateBB =
      BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.private" , Parent: F, InsertBefore: ExitBB);
  BasicBlock *GlobalBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.global" , Parent: F, InsertBefore: ExitBB);
  BasicBlock *PhiBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.phi" , Parent: F, InsertBefore: ExitBB);

  Value *Val = AI->getValOperand();
  Type *ValTy = Val->getType();
  Value *Addr = AI->getPointerOperand();

  // Clone AI onto the given (already addrspace-cast) pointer, preserving the
  // operation, ordering, sync scope, alignment, and all metadata.
  auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
                                 Value *Val) -> Value * {
    AtomicRMWInst *OldVal =
        Builder.CreateAtomicRMW(Op: AI->getOperation(), Ptr: Addr, Val, Align: AI->getAlign(),
                                Ordering: AI->getOrdering(), SSID: AI->getSyncScopeID());
    SmallVector<std::pair<unsigned, MDNode *>> MDs;
    AI->getAllMetadata(MDs);
    for (auto &P : MDs)
      OldVal->setMetadata(KindID: P.first, Node: P.second);
    return OldVal;
  };

  // Drop the unconditional branch that splitBasicBlock appended to BB and
  // branch into the dispatch diamond instead.
  std::prev(x: BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  Builder.CreateBr(Dest: CheckSharedBB);

  Builder.SetInsertPoint(CheckSharedBB);
  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
                                               {Addr}, nullptr, "is.shared" );
  Builder.CreateCondBr(Cond: IsShared, True: SharedBB, False: CheckPrivateBB);

  Builder.SetInsertPoint(SharedBB);
  Value *CastToLocal = Builder.CreateAddrSpaceCast(
      V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS));
  Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
  Builder.CreateBr(Dest: PhiBB);

  Builder.SetInsertPoint(CheckPrivateBB);
  CallInst *IsPrivate = Builder.CreateIntrinsic(
      Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private" );
  Builder.CreateCondBr(Cond: IsPrivate, True: PrivateBB, False: GlobalBB);

  // Private memory is not visible to other threads, so a plain
  // load/fadd/store sequence is sufficient there.
  Builder.SetInsertPoint(PrivateBB);
  Value *CastToPrivate = Builder.CreateAddrSpaceCast(
      V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::PRIVATE_ADDRESS));
  Value *LoadedPrivate =
      Builder.CreateLoad(Ty: ValTy, Ptr: CastToPrivate, Name: "loaded.private" );
  Value *NewVal = Builder.CreateFAdd(L: LoadedPrivate, R: Val, Name: "val.new" );
  Builder.CreateStore(Val: NewVal, Ptr: CastToPrivate);
  Builder.CreateBr(Dest: PhiBB);

  Builder.SetInsertPoint(GlobalBB);
  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
      V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
  Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
  Builder.CreateBr(Dest: PhiBB);

  // Merge the three loaded values and replace the original atomicrmw.
  Builder.SetInsertPoint(PhiBB);
  PHINode *Loaded = Builder.CreatePHI(Ty: ValTy, NumReservedValues: 3, Name: "loaded.phi" );
  Loaded->addIncoming(V: LoadedShared, BB: SharedBB);
  Loaded->addIncoming(V: LoadedPrivate, BB: PrivateBB);
  Loaded->addIncoming(V: LoadedGlobal, BB: GlobalBB);
  Builder.CreateBr(Dest: ExitBB);

  AI->replaceAllUsesWith(V: Loaded);
  AI->eraseFromParent();
}
16489 | |
16490 | LoadInst * |
16491 | SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { |
16492 | IRBuilder<> Builder(AI); |
16493 | auto Order = AI->getOrdering(); |
16494 | |
16495 | // The optimization removes store aspect of the atomicrmw. Therefore, cache |
16496 | // must be flushed if the atomic ordering had a release semantics. This is |
16497 | // not necessary a fence, a release fence just coincides to do that flush. |
16498 | // Avoid replacing of an atomicrmw with a release semantics. |
16499 | if (isReleaseOrStronger(AO: Order)) |
16500 | return nullptr; |
16501 | |
16502 | LoadInst *LI = Builder.CreateAlignedLoad( |
16503 | Ty: AI->getType(), Ptr: AI->getPointerOperand(), Align: AI->getAlign()); |
16504 | LI->setAtomic(Ordering: Order, SSID: AI->getSyncScopeID()); |
16505 | LI->copyMetadata(SrcInst: *AI); |
16506 | LI->takeName(V: AI); |
16507 | AI->replaceAllUsesWith(V: LI); |
16508 | AI->eraseFromParent(); |
16509 | return LI; |
16510 | } |
16511 | |