//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

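// Calling convention handling generated by TableGen.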
#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

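// Returns the maximum number of bits that might be needed to represent Op as
// an unsigned value, based on known-bits analysis.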
unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return DAG.ComputeMaxSignificantBits(Op);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i128, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }

  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v3f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v5f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::STORE, MVT::v6f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v7f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v9f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::STORE, MVT::v10f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::STORE, MVT::v11f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::STORE, MVT::v12f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v3i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v3f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v4i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v4f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v8i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v8f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v16i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::v16f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i128, Promote);
  AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);

  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);

  // For R600, this is totally unsupported, just custom lower to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
                     MVT::f32, Legal);

  setOperationAction(ISD::FLOG2, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);

  setOperationAction(
      {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
      Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  if (Subtarget->has16BitInsts())
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
  else {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
  }

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
                     Custom);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(
      ISD::IS_FPCLASS,
      {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
       MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
       MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
      Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // FIXME: Why is v8f16/v8bf16 missing?
  setOperationAction(
      ISD::EXTRACT_SUBVECTOR,
      {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
       MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
       MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
       MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
       MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
       MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
       MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
       MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
       MVT::v32i16, MVT::v32f16, MVT::v32bf16},
      Custom);

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand.
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // The GPU does not have a divrem instruction for signed or unsigned
    // operands.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // The GPU does not have [SU]MUL_LOHI as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE.
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);

  setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
  setOperationAction(
      {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
      MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  setOperationAction(
      {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
      MVT::i64, Custom);

  for (auto VT : {MVT::i8, MVT::i16})
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
                        ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
                        ISD::MULHS, ISD::OR, ISD::SHL,
                        ISD::SRA, ISD::SRL, ISD::ROTL,
                        ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
                        ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
                        ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
                        ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
                        ISD::XOR, ISD::BSWAP, ISD::CTPOP,
                        ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
                        ISD::SETCC},
                       VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(
        {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
         ISD::FADD, ISD::FCEIL, ISD::FCOS,
         ISD::FDIV, ISD::FEXP2, ISD::FEXP,
         ISD::FEXP10, ISD::FLOG2, ISD::FREM,
         ISD::FLOG, ISD::FLOG10, ISD::FPOW,
         ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
         ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
         ISD::FSQRT, ISD::FSIN, ISD::FSUB,
         ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
         ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
         ISD::FCANONICALIZE, ISD::FROUNDEVEN},
        VT, Expand);
  }

  // This causes an unrolled select operation to be used rather than expansion
  // with bit operations. This is in general better, but the alternative using
  // BFI instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);

  // Disable most libcalls.
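  // The exceptions are the atomic libcalls (RTLIB::ATOMIC_LOAD through
  // RTLIB::ATOMIC_FETCH_NAND_16), whose names are left intact below.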
  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
    if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
      setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
  }

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For now,
  // we don't have a way of knowing during instruction selection if a condition
  // will be uniform and we always use vector compares. Assume we are using
  // vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
                       ISD::SRA, ISD::SRL,
                       ISD::TRUNCATE, ISD::MUL,
                       ISD::SMUL_LOHI, ISD::UMUL_LOHI,
                       ISD::MULHU, ISD::MULHS,
                       ISD::SELECT, ISD::SELECT_CC,
                       ISD::STORE, ISD::FADD,
                       ISD::FSUB, ISD::FNEG,
                       ISD::FABS, ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});

  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
  setMaxLargeFPConvertBitWidthSupported(64);
}

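// Signed zeros may be ignored if either the whole module is compiled with
// no-signed-zeros FP math or the individual node carries the nsz flag.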
bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}

static bool fnegFoldsIntoOp(const SDNode *N) {
  unsigned Opc = N->getOpcode();
  if (Opc == ISD::BITCAST) {
    // TODO: Is there a benefit to checking the conditions performFNegCombine
    // does? We don't for the other cases.
    SDValue BCSrc = N->getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      return BCSrc.getNumOperands() == 2 &&
             BCSrc.getOperand(1).getValueSizeInBits() == 32;
    }

    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
  }

  return fnegFoldsIntoOpcode(Opc);
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the result type of an ISD::SELECT.
LLVM_READONLY
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, and each of them requires a VOP3 encoding, there will be a
  // code size increase. Try to avoid increasing code size unless we know it
  // will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit the number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32 bits.
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(0);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating an fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth + 1);
    if (NegSrc)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && VT == MVT::f16);
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit ones, which is
  // always good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode* N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");
  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
      (N->use_begin()->getOpcode() == ISD::SRA ||
       N->use_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments().

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getParent()->getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting over
    // to get accurate in-memory offsets. The "PartOffset" is completely useless
    // to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different element size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one-element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        assert(MemVT.getVectorNumElements() == 3 ||
               MemVT.getVectorNumElements() == 5 ||
               (MemVT.getVectorNumElements() >= 9 &&
                MemVT.getVectorNumElements() <= 12));
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

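// Build a token that combines the incoming chain with any loads from the
// clobbered fixed stack object, so stores into that slot stay ordered after
// those loads.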
SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument load that overlaps the
  // clobbered frame index.
  for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FLOG:
  case ISD::FLOG10:
    if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP2:
    if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP:
  case ISD::FEXP10:
    if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
      Results.push_back(Lowered);
    return;
  default:
    return;
  }
}
1420
1421SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1422 SDValue Op,
1423 SelectionDAG &DAG) const {
1424
1425 const DataLayout &DL = DAG.getDataLayout();
1426 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Val&: Op);
1427 const GlobalValue *GV = G->getGlobal();
1428
1429 if (!MFI->isModuleEntryFunction()) {
1430 if (std::optional<uint32_t> Address =
1431 AMDGPUMachineFunction::getLDSAbsoluteAddress(GV: *GV)) {
1432 return DAG.getConstant(Val: *Address, DL: SDLoc(Op), VT: Op.getValueType());
1433 }
1434 }
1435
1436 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1437 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1438 if (!MFI->isModuleEntryFunction() &&
1439 !GV->getName().equals(RHS: "llvm.amdgcn.module.lds")) {
1440 SDLoc DL(Op);
1441 const Function &Fn = DAG.getMachineFunction().getFunction();
1442 DiagnosticInfoUnsupported BadLDSDecl(
1443 Fn, "local memory global used by non-kernel function",
1444 DL.getDebugLoc(), DS_Warning);
1445 DAG.getContext()->diagnose(DI: BadLDSDecl);
1446
1447 // We currently don't have a way to correctly allocate LDS objects that
1448 // aren't directly associated with a kernel. We do force inlining of
1449 // functions that use local objects. However, if these dead functions are
1450 // not eliminated, we don't want a compile time error. Just emit a warning
1451 // and a trap, since there should be no callable path here.
1452 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1453 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1454 Trap, DAG.getRoot());
1455 DAG.setRoot(OutputChain);
1456 return DAG.getUNDEF(VT: Op.getValueType());
1457 }
1458
1459 // XXX: What does the value of G->getOffset() mean?
1460 assert(G->getOffset() == 0 &&
1461 "Do not know what to do with a non-zero offset");
1462
1463 // TODO: We could emit code to handle the initialization somewhere.
1464 // We ignore the initializer for now and legalize it to allow selection.
1465 // The initializer will be diagnosed during assembly emission anyway.
1466 unsigned Offset = MFI->allocateLDSGlobal(DL, GV: *cast<GlobalVariable>(Val: GV));
1467 return DAG.getConstant(Val: Offset, DL: SDLoc(Op), VT: Op.getValueType());
1468 }
1469 return SDValue();
1470}
1471
1472SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1473 SelectionDAG &DAG) const {
1474 SmallVector<SDValue, 8> Args;
1475 SDLoc SL(Op);
1476
1477 EVT VT = Op.getValueType();
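// For sub-32-bit element types, build the result out of 32-bit pieces: each
// operand is bitcast to i32 (or a vector of i32), the pieces are collected
// into an i32 build_vector, and the result is bitcast back to VT. For
// example, concatenating two v2i16 operands becomes a v2i32 build_vector of
// two i32 bitcasts, bitcast to v4i16.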
1478 if (VT.getVectorElementType().getSizeInBits() < 32) {
1479 unsigned OpBitSize = Op.getOperand(i: 0).getValueType().getSizeInBits();
1480 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1481 unsigned NewNumElt = OpBitSize / 32;
1482 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1483 : EVT::getVectorVT(*DAG.getContext(),
1484 MVT::i32, NewNumElt);
1485 for (const SDUse &U : Op->ops()) {
1486 SDValue In = U.get();
1487 SDValue NewIn = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewEltVT, Operand: In);
1488 if (NewNumElt > 1)
1489 DAG.ExtractVectorElements(Op: NewIn, Args);
1490 else
1491 Args.push_back(Elt: NewIn);
1492 }
1493
1494 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1495 NewNumElt * Op.getNumOperands());
1496 SDValue BV = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1497 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: BV);
1498 }
1499 }
1500
1501 for (const SDUse &U : Op->ops())
1502 DAG.ExtractVectorElements(Op: U.get(), Args);
1503
1504 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1505}
1506
1507SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1508 SelectionDAG &DAG) const {
1509 SDLoc SL(Op);
1510 SmallVector<SDValue, 8> Args;
1511 unsigned Start = Op.getConstantOperandVal(i: 1);
1512 EVT VT = Op.getValueType();
1513 EVT SrcVT = Op.getOperand(i: 0).getValueType();
1514
1515 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1516 unsigned NumElt = VT.getVectorNumElements();
1517 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1518 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1519
1520 // Extract 32-bit registers at a time.
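// For example, extracting <4 x i16> at index 4 from <8 x i16> becomes
// extracting <2 x i32> at index 2 from <4 x i32>, followed by a bitcast.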
1521 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1522 EVT NewVT = NumElt == 2
1523 ? MVT::i32
1524 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1525 SDValue Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewSrcVT, Operand: Op.getOperand(i: 0));
1526
1527 DAG.ExtractVectorElements(Op: Tmp, Args, Start: Start / 2, Count: NumElt / 2);
1528 if (NumElt == 2)
1529 Tmp = Args[0];
1530 else
1531 Tmp = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1532
1533 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Tmp);
1534 }
1535
1536 DAG.ExtractVectorElements(Op: Op.getOperand(i: 0), Args, Start,
1537 Count: VT.getVectorNumElements());
1538
1539 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1540}
1541
1542// TODO: Handle fabs too
1543static SDValue peekFNeg(SDValue Val) {
1544 if (Val.getOpcode() == ISD::FNEG)
1545 return Val.getOperand(i: 0);
1546
1547 return Val;
1548}
1549
1550static SDValue peekFPSignOps(SDValue Val) {
1551 if (Val.getOpcode() == ISD::FNEG)
1552 Val = Val.getOperand(i: 0);
1553 if (Val.getOpcode() == ISD::FABS)
1554 Val = Val.getOperand(i: 0);
1555 if (Val.getOpcode() == ISD::FCOPYSIGN)
1556 Val = Val.getOperand(i: 0);
1557 return Val;
1558}
1559
1560SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1561 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1562 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1563 SelectionDAG &DAG = DCI.DAG;
1564 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val&: CC)->get();
1565 switch (CCOpcode) {
1566 case ISD::SETOEQ:
1567 case ISD::SETONE:
1568 case ISD::SETUNE:
1569 case ISD::SETNE:
1570 case ISD::SETUEQ:
1571 case ISD::SETEQ:
1572 case ISD::SETFALSE:
1573 case ISD::SETFALSE2:
1574 case ISD::SETTRUE:
1575 case ISD::SETTRUE2:
1576 case ISD::SETUO:
1577 case ISD::SETO:
1578 break;
1579 case ISD::SETULE:
1580 case ISD::SETULT: {
1581 if (LHS == True)
1582 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
1583 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
1584 }
1585 case ISD::SETOLE:
1586 case ISD::SETOLT:
1587 case ISD::SETLE:
1588 case ISD::SETLT: {
1589 // Ordered. Assume ordered for undefined.
1590
1591 // Only do this after legalization to avoid interfering with other combines
1592 // which might occur.
1593 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1594 !DCI.isCalledByLegalizer())
1595 return SDValue();
1596
1597 // We need to permute the operands to get the correct NaN behavior. When the
1598 // compare fails (as it does for a NaN input), the second operand is the one
1599 // selected, so order the operands to match the compare type the hardware uses.
1600 if (LHS == True)
1601 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
1602 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
1603 }
1604 case ISD::SETUGE:
1605 case ISD::SETUGT: {
1606 if (LHS == True)
1607 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
1608 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
1609 }
1610 case ISD::SETGT:
1611 case ISD::SETGE:
1612 case ISD::SETOGE:
1613 case ISD::SETOGT: {
1614 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1615 !DCI.isCalledByLegalizer())
1616 return SDValue();
1617
1618 if (LHS == True)
1619 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
1620 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
1621 }
1622 case ISD::SETCC_INVALID:
1623 llvm_unreachable("Invalid setcc condcode!");
1624 }
1625 return SDValue();
1626}
1627
1628/// Generate Min/Max node
1629SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1630 SDValue LHS, SDValue RHS,
1631 SDValue True, SDValue False,
1632 SDValue CC,
1633 DAGCombinerInfo &DCI) const {
1634 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1635 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1636
1637 SelectionDAG &DAG = DCI.DAG;
1638
1639 // If we can't directly match this, try to see if we can fold an fneg to
1640 // match.
1641
1642 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
1643 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(Val&: False);
1644 SDValue NegTrue = peekFNeg(Val: True);
1645
1646 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1647 // fmin/fmax.
1648 //
1649 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1650 // -> fneg (fmin_legacy lhs, K)
1651 //
1652 // TODO: Use getNegatedExpression
1653 if (LHS == NegTrue && CFalse && CRHS) {
1654 APFloat NegRHS = neg(X: CRHS->getValueAPF());
1655 if (NegRHS == CFalse->getValueAPF()) {
1656 SDValue Combined =
1657 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True: NegTrue, False, CC, DCI);
1658 if (Combined)
1659 return DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: Combined);
1660 return SDValue();
1661 }
1662 }
1663
1664 return SDValue();
1665}
1666
1667std::pair<SDValue, SDValue>
1668AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1669 SDLoc SL(Op);
1670
1671 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1672
1673 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1674 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1675
1676 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1677 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1678
1679 return std::pair(Lo, Hi);
1680}
1681
1682SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1683 SDLoc SL(Op);
1684
1685 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1686 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1687 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1688}
1689
1690SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1691 SDLoc SL(Op);
1692
1693 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1694 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1695 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1696}
1697
1698// Split a vector type into two parts. The first part is a vector whose element
1699// count is a power of two. The second part is whatever is left over, and is a
1700// scalar if it would otherwise be a 1-vector.
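// For example, <7 x i32> splits into (<4 x i32>, <3 x i32>) and <3 x i32>
// splits into (<2 x i32>, i32).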
1701std::pair<EVT, EVT>
1702AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1703 EVT LoVT, HiVT;
1704 EVT EltVT = VT.getVectorElementType();
1705 unsigned NumElts = VT.getVectorNumElements();
1706 unsigned LoNumElts = PowerOf2Ceil(A: (NumElts + 1) / 2);
1707 LoVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: LoNumElts);
1708 HiVT = NumElts - LoNumElts == 1
1709 ? EltVT
1710 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumElts - LoNumElts);
1711 return std::pair(LoVT, HiVT);
1712}
1713
1714// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1715// scalar.
1716std::pair<SDValue, SDValue>
1717AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1718 const EVT &LoVT, const EVT &HiVT,
1719 SelectionDAG &DAG) const {
1720 assert(LoVT.getVectorNumElements() +
1721 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1722 N.getValueType().getVectorNumElements() &&
1723 "More vector elements requested than available!");
1724 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: LoVT, N1: N,
1725 N2: DAG.getVectorIdxConstant(Val: 0, DL));
1726 SDValue Hi = DAG.getNode(
1727 Opcode: HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1728 VT: HiVT, N1: N, N2: DAG.getVectorIdxConstant(Val: LoVT.getVectorNumElements(), DL));
1729 return std::pair(Lo, Hi);
1730}
1731
1732SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1733 SelectionDAG &DAG) const {
1734 LoadSDNode *Load = cast<LoadSDNode>(Val: Op);
1735 EVT VT = Op.getValueType();
1736 SDLoc SL(Op);
1737
1738
1739 // If this is a 2 element vector, we really want to scalarize and not create
1740 // weird 1 element vectors.
1741 if (VT.getVectorNumElements() == 2) {
1742 SDValue Ops[2];
1743 std::tie(args&: Ops[0], args&: Ops[1]) = scalarizeVectorLoad(LD: Load, DAG);
1744 return DAG.getMergeValues(Ops, dl: SL);
1745 }
1746
1747 SDValue BasePtr = Load->getBasePtr();
1748 EVT MemVT = Load->getMemoryVT();
1749
1750 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1751
1752 EVT LoVT, HiVT;
1753 EVT LoMemVT, HiMemVT;
1754 SDValue Lo, Hi;
1755
1756 std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
1757 std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
1758 std::tie(args&: Lo, args&: Hi) = splitVector(N: Op, DL: SL, LoVT, HiVT, DAG);
1759
1760 unsigned Size = LoMemVT.getStoreSize();
1761 Align BaseAlign = Load->getAlign();
1762 Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);
1763
1764 SDValue LoLoad = DAG.getExtLoad(ExtType: Load->getExtensionType(), dl: SL, VT: LoVT,
1765 Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue, MemVT: LoMemVT,
1766 Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags());
1767 SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Size));
1768 SDValue HiLoad =
1769 DAG.getExtLoad(ExtType: Load->getExtensionType(), dl: SL, VT: HiVT, Chain: Load->getChain(),
1770 Ptr: HiPtr, PtrInfo: SrcValue.getWithOffset(O: LoMemVT.getStoreSize()),
1771 MemVT: HiMemVT, Alignment: HiAlign, MMOFlags: Load->getMemOperand()->getFlags());
1772
1773 SDValue Join;
1774 if (LoVT == HiVT) {
1775 // This is the case where the vector element count is a power of two, so it was split evenly.
1776 Join = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, N1: LoLoad, N2: HiLoad);
1777 } else {
1778 Join = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: SL, VT, N1: DAG.getUNDEF(VT), N2: LoLoad,
1779 N3: DAG.getVectorIdxConstant(Val: 0, DL: SL));
1780 Join = DAG.getNode(
1781 Opcode: HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, DL: SL,
1782 VT, N1: Join, N2: HiLoad,
1783 N3: DAG.getVectorIdxConstant(Val: LoVT.getVectorNumElements(), DL: SL));
1784 }
1785
1786 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1787 LoLoad.getValue(1), HiLoad.getValue(1))};
1788
1789 return DAG.getMergeValues(Ops, SL);
1790}
1791
1792SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1793 SelectionDAG &DAG) const {
1794 LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
1795 EVT VT = Op.getValueType();
1796 SDValue BasePtr = Load->getBasePtr();
1797 EVT MemVT = Load->getMemoryVT();
1798 SDLoc SL(Op);
1799 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1800 Align BaseAlign = Load->getAlign();
1801 unsigned NumElements = MemVT.getVectorNumElements();
1802
1803 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1804 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1805 if (NumElements != 3 ||
1806 (BaseAlign < Align(8) &&
1807 !SrcValue.isDereferenceable(Size: 16, C&: *DAG.getContext(), DL: DAG.getDataLayout())))
1808 return SplitVectorLoad(Op, DAG);
1809
1810 assert(NumElements == 3);
1811
1812 EVT WideVT =
1813 EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
1814 EVT WideMemVT =
1815 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(), NumElements: 4);
1816 SDValue WideLoad = DAG.getExtLoad(
1817 ExtType: Load->getExtensionType(), dl: SL, VT: WideVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
1818 MemVT: WideMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags());
1819 return DAG.getMergeValues(
1820 Ops: {DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT, N1: WideLoad,
1821 N2: DAG.getVectorIdxConstant(Val: 0, DL: SL)),
1822 WideLoad.getValue(R: 1)},
1823 dl: SL);
1824}
1825
1826SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1827 SelectionDAG &DAG) const {
1828 StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
1829 SDValue Val = Store->getValue();
1830 EVT VT = Val.getValueType();
1831
1832 // If this is a 2 element vector, we really want to scalarize and not create
1833 // weird 1 element vectors.
1834 if (VT.getVectorNumElements() == 2)
1835 return scalarizeVectorStore(ST: Store, DAG);
1836
1837 EVT MemVT = Store->getMemoryVT();
1838 SDValue Chain = Store->getChain();
1839 SDValue BasePtr = Store->getBasePtr();
1840 SDLoc SL(Op);
1841
1842 EVT LoVT, HiVT;
1843 EVT LoMemVT, HiMemVT;
1844 SDValue Lo, Hi;
1845
1846 std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
1847 std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
1848 std::tie(args&: Lo, args&: Hi) = splitVector(N: Val, DL: SL, LoVT, HiVT, DAG);
1849
1850 SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: LoMemVT.getStoreSize());
1851
1852 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1853 Align BaseAlign = Store->getAlign();
1854 unsigned Size = LoMemVT.getStoreSize();
1855 Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);
1856
1857 SDValue LoStore =
1858 DAG.getTruncStore(Chain, dl: SL, Val: Lo, Ptr: BasePtr, PtrInfo: SrcValue, SVT: LoMemVT, Alignment: BaseAlign,
1859 MMOFlags: Store->getMemOperand()->getFlags());
1860 SDValue HiStore =
1861 DAG.getTruncStore(Chain, dl: SL, Val: Hi, Ptr: HiPtr, PtrInfo: SrcValue.getWithOffset(O: Size),
1862 SVT: HiMemVT, Alignment: HiAlign, MMOFlags: Store->getMemOperand()->getFlags());
1863
1864 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1865}
1866
1867// This is a shortcut for integer division because we have fast i32<->f32
1868// conversions, and fast f32 reciprocal instructions. The significand of an f32
1869// is wide enough to accurately represent up to a 24-bit signed integer.
1870SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1871 bool Sign) const {
1872 SDLoc DL(Op);
1873 EVT VT = Op.getValueType();
1874 SDValue LHS = Op.getOperand(i: 0);
1875 SDValue RHS = Op.getOperand(i: 1);
1876 MVT IntVT = MVT::i32;
1877 MVT FltVT = MVT::f32;
1878
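// Require at least 9 known sign bits on both operands so they fit in at most
// 24 bits (23 value bits plus a sign bit in the signed case), which an f32
// significand can represent exactly.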
1879 unsigned LHSSignBits = DAG.ComputeNumSignBits(Op: LHS);
1880 if (LHSSignBits < 9)
1881 return SDValue();
1882
1883 unsigned RHSSignBits = DAG.ComputeNumSignBits(Op: RHS);
1884 if (RHSSignBits < 9)
1885 return SDValue();
1886
1887 unsigned BitSize = VT.getSizeInBits();
1888 unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
1889 unsigned DivBits = BitSize - SignBits;
1890 if (Sign)
1891 ++DivBits;
1892
1893 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1894 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1895
1896 SDValue jq = DAG.getConstant(Val: 1, DL, VT: IntVT);
1897
1898 if (Sign) {
1899 // char|short jq = ia ^ ib;
1900 jq = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: RHS);
1901
1902 // jq = jq >> (bitsize - 2)
1903 jq = DAG.getNode(Opcode: ISD::SRA, DL, VT, N1: jq,
1904 N2: DAG.getConstant(Val: BitSize - 2, DL, VT));
1905
1906 // jq = jq | 0x1
1907 jq = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: jq, N2: DAG.getConstant(Val: 1, DL, VT));
1908 }
1909
1910 // int ia = (int)LHS;
1911 SDValue ia = LHS;
1912
1913 // int ib = (int)RHS;
1914 SDValue ib = RHS;
1915
1916 // float fa = (float)ia;
1917 SDValue fa = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ia);
1918
1919 // float fb = (float)ib;
1920 SDValue fb = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ib);
1921
1922 SDValue fq = DAG.getNode(Opcode: ISD::FMUL, DL, VT: FltVT,
1923 N1: fa, N2: DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: FltVT, Operand: fb));
1924
1925 // fq = trunc(fq);
1926 fq = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: FltVT, Operand: fq);
1927
1928 // float fqneg = -fq;
1929 SDValue fqneg = DAG.getNode(Opcode: ISD::FNEG, DL, VT: FltVT, Operand: fq);
1930
1931 MachineFunction &MF = DAG.getMachineFunction();
1932
1933 bool UseFmadFtz = false;
1934 if (Subtarget->isGCN()) {
1935 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1936 UseFmadFtz =
1937 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1938 }
1939
1940 // float fr = mad(fqneg, fb, fa);
1941 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1942 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1943 : (unsigned)ISD::FMAD;
1944 SDValue fr = DAG.getNode(Opcode: OpCode, DL, VT: FltVT, N1: fqneg, N2: fb, N3: fa);
1945
1946 // int iq = (int)fq;
1947 SDValue iq = DAG.getNode(Opcode: ToInt, DL, VT: IntVT, Operand: fq);
1948
1949 // fr = fabs(fr);
1950 fr = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fr);
1951
1952 // fb = fabs(fb);
1953 fb = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fb);
1954
1955 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
1956
1957 // int cv = fr >= fb;
1958 SDValue cv = DAG.getSetCC(DL, VT: SetCCVT, LHS: fr, RHS: fb, Cond: ISD::SETOGE);
1959
1960 // jq = (cv ? jq : 0);
1961 jq = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: cv, N2: jq, N3: DAG.getConstant(Val: 0, DL, VT));
1962
1963 // dst = iq + jq;
1964 SDValue Div = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: iq, N2: jq);
1965
1966 // Rem needs compensation; it's easier to recompute it.
1967 SDValue Rem = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Div, N2: RHS);
1968 Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: LHS, N2: Rem);
1969
1970 // Truncate to number of bits this divide really is.
1971 if (Sign) {
1972 SDValue InRegSize
1973 = DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: DivBits));
1974 Div = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Div, N2: InRegSize);
1975 Rem = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Rem, N2: InRegSize);
1976 } else {
1977 SDValue TruncMask = DAG.getConstant(Val: (UINT64_C(1) << DivBits) - 1, DL, VT);
1978 Div = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Div, N2: TruncMask);
1979 Rem = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Rem, N2: TruncMask);
1980 }
1981
1982 return DAG.getMergeValues(Ops: { Div, Rem }, dl: DL);
1983}
1984
1985void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1986 SelectionDAG &DAG,
1987 SmallVectorImpl<SDValue> &Results) const {
1988 SDLoc DL(Op);
1989 EVT VT = Op.getValueType();
1990
1991 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1992
1993 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
1994
1995 SDValue One = DAG.getConstant(Val: 1, DL, VT: HalfVT);
1996 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: HalfVT);
1997
1998 // HiLo split
1999 SDValue LHS_Lo, LHS_Hi;
2000 SDValue LHS = Op.getOperand(i: 0);
2001 std::tie(args&: LHS_Lo, args&: LHS_Hi) = DAG.SplitScalar(N: LHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2002
2003 SDValue RHS_Lo, RHS_Hi;
2004 SDValue RHS = Op.getOperand(i: 1);
2005 std::tie(args&: RHS_Lo, args&: RHS_Hi) = DAG.SplitScalar(N: RHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2006
2007 if (DAG.MaskedValueIsZero(Op: RHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32)) &&
2008 DAG.MaskedValueIsZero(Op: LHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32))) {
2009
2010 SDValue Res = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2011 N1: LHS_Lo, N2: RHS_Lo);
2012
2013 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2014 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2015
2016 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2017 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2018 return;
2019 }
2020
2021 if (isTypeLegal(MVT::i64)) {
2022 // The algorithm here is based on ideas from "Software Integer Division",
2023 // Tom Rodeheffer, August 2008.
2024
2025 MachineFunction &MF = DAG.getMachineFunction();
2026 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2027
2028 // Compute denominator reciprocal.
2029 unsigned FMAD =
2030 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2031 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2032 ? (unsigned)ISD::FMAD
2033 : (unsigned)AMDGPUISD::FMAD_FTZ;
2034
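// Magic f32 constants used below, given as raw bit patterns:
//   0x4f800000 = 2^32, 0x5f7ffffc = just under 2^64,
//   0x2f800000 = 2^-32, 0xcf800000 = -2^32.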
2035 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2036 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2037 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2038 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2039 Cvt_Lo);
2040 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2041 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2042 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2043 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2044 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2045 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2046 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2047 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2048 Mul1);
2049 SDValue Rcp_Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Mad2);
2050 SDValue Rcp_Hi = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Trunc);
2051 SDValue Rcp64 = DAG.getBitcast(VT,
2052 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2053
2054 SDValue Zero64 = DAG.getConstant(Val: 0, DL, VT);
2055 SDValue One64 = DAG.getConstant(Val: 1, DL, VT);
2056 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2057 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2058
2059 // First round of UNR (Unsigned integer Newton-Raphson).
2060 SDValue Neg_RHS = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero64, N2: RHS);
2061 SDValue Mullo1 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Rcp64);
2062 SDValue Mulhi1 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Rcp64, N2: Mullo1);
2063 SDValue Mulhi1_Lo, Mulhi1_Hi;
2064 std::tie(args&: Mulhi1_Lo, args&: Mulhi1_Hi) =
2065 DAG.SplitScalar(N: Mulhi1, DL, LoVT: HalfVT, HiVT: HalfVT);
2066 SDValue Add1_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Lo,
2067 N2: Mulhi1_Lo, N3: Zero1);
2068 SDValue Add1_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Hi,
2069 N2: Mulhi1_Hi, N3: Add1_Lo.getValue(R: 1));
2070 SDValue Add1 = DAG.getBitcast(VT,
2071 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2072
2073 // Second round of UNR.
2074 SDValue Mullo2 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Add1);
2075 SDValue Mulhi2 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Add1, N2: Mullo2);
2076 SDValue Mulhi2_Lo, Mulhi2_Hi;
2077 std::tie(args&: Mulhi2_Lo, args&: Mulhi2_Hi) =
2078 DAG.SplitScalar(N: Mulhi2, DL, LoVT: HalfVT, HiVT: HalfVT);
2079 SDValue Add2_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Lo,
2080 N2: Mulhi2_Lo, N3: Zero1);
2081 SDValue Add2_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Hi,
2082 N2: Mulhi2_Hi, N3: Add2_Lo.getValue(R: 1));
2083 SDValue Add2 = DAG.getBitcast(VT,
2084 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2085
2086 SDValue Mulhi3 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: LHS, N2: Add2);
2087
2088 SDValue Mul3 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: RHS, N2: Mulhi3);
2089
2090 SDValue Mul3_Lo, Mul3_Hi;
2091 std::tie(args&: Mul3_Lo, args&: Mul3_Hi) = DAG.SplitScalar(N: Mul3, DL, LoVT: HalfVT, HiVT: HalfVT);
2092 SDValue Sub1_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Lo,
2093 N2: Mul3_Lo, N3: Zero1);
2094 SDValue Sub1_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Hi,
2095 N2: Mul3_Hi, N3: Sub1_Lo.getValue(R: 1));
2096 SDValue Sub1_Mi = DAG.getNode(Opcode: ISD::SUB, DL, VT: HalfVT, N1: LHS_Hi, N2: Mul3_Hi);
2097 SDValue Sub1 = DAG.getBitcast(VT,
2098 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2099
2100 SDValue MinusOne = DAG.getConstant(Val: 0xffffffffu, DL, VT: HalfVT);
2101 SDValue C1 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2102 Cond: ISD::SETUGE);
2103 SDValue C2 = DAG.getSelectCC(DL, LHS: Sub1_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2104 Cond: ISD::SETUGE);
2105 SDValue C3 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: C2, False: C1, Cond: ISD::SETEQ);
2106
2107 // TODO: The code here and below could be enclosed in if/endif blocks.
2108 // Currently control flow is unconditional and we have 4 selects after the
2109 // potential endif to substitute for PHIs.
2110
2111 // if C3 != 0 ...
2112 SDValue Sub2_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Lo,
2113 N2: RHS_Lo, N3: Zero1);
2114 SDValue Sub2_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Mi,
2115 N2: RHS_Hi, N3: Sub1_Lo.getValue(R: 1));
2116 SDValue Sub2_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2117 N2: Zero, N3: Sub2_Lo.getValue(R: 1));
2118 SDValue Sub2 = DAG.getBitcast(VT,
2119 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2120
2121 SDValue Add3 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Mulhi3, N2: One64);
2122
2123 SDValue C4 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2124 Cond: ISD::SETUGE);
2125 SDValue C5 = DAG.getSelectCC(DL, LHS: Sub2_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2126 Cond: ISD::SETUGE);
2127 SDValue C6 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: C5, False: C4, Cond: ISD::SETEQ);
2128
2129 // if (C6 != 0)
2130 SDValue Add4 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Add3, N2: One64);
2131
2132 SDValue Sub3_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Lo,
2133 N2: RHS_Lo, N3: Zero1);
2134 SDValue Sub3_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2135 N2: RHS_Hi, N3: Sub2_Lo.getValue(R: 1));
2136 SDValue Sub3_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub3_Mi,
2137 N2: Zero, N3: Sub3_Lo.getValue(R: 1));
2138 SDValue Sub3 = DAG.getBitcast(VT,
2139 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2140
2141 // endif C6
2142 // endif C3
2143
2144 SDValue Sel1 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Add4, False: Add3, Cond: ISD::SETNE);
2145 SDValue Div = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel1, False: Mulhi3, Cond: ISD::SETNE);
2146
2147 SDValue Sel2 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Sub3, False: Sub2, Cond: ISD::SETNE);
2148 SDValue Rem = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel2, False: Sub1, Cond: ISD::SETNE);
2149
2150 Results.push_back(Elt: Div);
2151 Results.push_back(Elt: Rem);
2152
2153 return;
2154 }
2155
2156 // r600 expansion.
2157 // Get speculative values.
2158 SDValue DIV_Part = DAG.getNode(Opcode: ISD::UDIV, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2159 SDValue REM_Part = DAG.getNode(Opcode: ISD::UREM, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2160
2161 SDValue REM_Lo = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: REM_Part, False: LHS_Hi, Cond: ISD::SETEQ);
2162 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2163 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2164
2165 SDValue DIV_Hi = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: DIV_Part, False: Zero, Cond: ISD::SETEQ);
2166 SDValue DIV_Lo = Zero;
2167
2168 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2169
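// Restoring long division over the low half of the dividend: produce one
// quotient bit per iteration by shifting the partial remainder left one bit
// and conditionally subtracting RHS.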
2170 for (unsigned i = 0; i < halfBitWidth; ++i) {
2171 const unsigned bitPos = halfBitWidth - i - 1;
2172 SDValue POS = DAG.getConstant(Val: bitPos, DL, VT: HalfVT);
2173 // Get value of high bit
2174 SDValue HBit = DAG.getNode(Opcode: ISD::SRL, DL, VT: HalfVT, N1: LHS_Lo, N2: POS);
2175 HBit = DAG.getNode(Opcode: ISD::AND, DL, VT: HalfVT, N1: HBit, N2: One);
2176 HBit = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: HBit);
2177
2178 // Shift
2179 REM = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: REM, N2: DAG.getConstant(Val: 1, DL, VT));
2180 // Add LHS high bit
2181 REM = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: REM, N2: HBit);
2182
2183 SDValue BIT = DAG.getConstant(Val: 1ULL << bitPos, DL, VT: HalfVT);
2184 SDValue realBIT = DAG.getSelectCC(DL, LHS: REM, RHS, True: BIT, False: Zero, Cond: ISD::SETUGE);
2185
2186 DIV_Lo = DAG.getNode(Opcode: ISD::OR, DL, VT: HalfVT, N1: DIV_Lo, N2: realBIT);
2187
2188 // Update REM
2189 SDValue REM_sub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: REM, N2: RHS);
2190 REM = DAG.getSelectCC(DL, LHS: REM, RHS, True: REM_sub, False: REM, Cond: ISD::SETUGE);
2191 }
2192
2193 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2194 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2195 Results.push_back(Elt: DIV);
2196 Results.push_back(Elt: REM);
2197}
2198
2199SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2200 SelectionDAG &DAG) const {
2201 SDLoc DL(Op);
2202 EVT VT = Op.getValueType();
2203
2204 if (VT == MVT::i64) {
2205 SmallVector<SDValue, 2> Results;
2206 LowerUDIVREM64(Op, DAG, Results);
2207 return DAG.getMergeValues(Ops: Results, dl: DL);
2208 }
2209
2210 if (VT == MVT::i32) {
2211 if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: false))
2212 return Res;
2213 }
2214
2215 SDValue X = Op.getOperand(i: 0);
2216 SDValue Y = Op.getOperand(i: 1);
2217
2218 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2219 // algorithm used here.
2220
2221 // Initial estimate of inv(y).
2222 SDValue Z = DAG.getNode(Opcode: AMDGPUISD::URECIP, DL, VT, Operand: Y);
2223
2224 // One round of UNR.
2225 SDValue NegY = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Y);
2226 SDValue NegYZ = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: NegY, N2: Z);
2227 Z = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Z,
2228 N2: DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Z, N2: NegYZ));
2229
2230 // Quotient/remainder estimate.
2231 SDValue Q = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: X, N2: Z);
2232 SDValue R =
2233 DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: X, N2: DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Q, N2: Y));
2234
2235 // First quotient/remainder refinement.
2236 EVT CCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2237 SDValue One = DAG.getConstant(Val: 1, DL, VT);
2238 SDValue Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2239 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2240 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2241 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2242 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2243
2244 // Second quotient/remainder refinement.
2245 Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2246 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2247 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2248 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2249 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2250
2251 return DAG.getMergeValues(Ops: {Q, R}, dl: DL);
2252}
2253
2254SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2255 SelectionDAG &DAG) const {
2256 SDLoc DL(Op);
2257 EVT VT = Op.getValueType();
2258
2259 SDValue LHS = Op.getOperand(i: 0);
2260 SDValue RHS = Op.getOperand(i: 1);
2261
2262 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
2263 SDValue NegOne = DAG.getConstant(Val: -1, DL, VT);
2264
2265 if (VT == MVT::i32) {
2266 if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: true))
2267 return Res;
2268 }
2269
2270 if (VT == MVT::i64 &&
2271 DAG.ComputeNumSignBits(LHS) > 32 &&
2272 DAG.ComputeNumSignBits(RHS) > 32) {
2273 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
2274
2275 // HiLo split
2276 SDValue LHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: LHS, N2: Zero);
2277 SDValue RHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: RHS, N2: Zero);
2278 SDValue DIVREM = DAG.getNode(Opcode: ISD::SDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2279 N1: LHS_Lo, N2: RHS_Lo);
2280 SDValue Res[2] = {
2281 DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 0)),
2282 DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 1))
2283 };
2284 return DAG.getMergeValues(Ops: Res, dl: DL);
2285 }
2286
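// Conditionally negate the operands via (x + sign) ^ sign, where sign is
// all-ones for negative x and zero otherwise, perform an unsigned divrem,
// and then restore the result signs via (x ^ sign) - sign.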
2287 SDValue LHSign = DAG.getSelectCC(DL, LHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
2288 SDValue RHSign = DAG.getSelectCC(DL, LHS: RHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
2289 SDValue DSign = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHSign, N2: RHSign);
2290 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2291
2292 LHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: LHS, N2: LHSign);
2293 RHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: RHSign);
2294
2295 LHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: LHSign);
2296 RHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: RHS, N2: RHSign);
2297
2298 SDValue Div = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: LHS, N2: RHS);
2299 SDValue Rem = Div.getValue(R: 1);
2300
2301 Div = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Div, N2: DSign);
2302 Rem = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Rem, N2: RSign);
2303
2304 Div = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Div, N2: DSign);
2305 Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Rem, N2: RSign);
2306
2307 SDValue Res[2] = {
2308 Div,
2309 Rem
2310 };
2311 return DAG.getMergeValues(Ops: Res, dl: DL);
2312}
2313
2314// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2315SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2316 SDLoc SL(Op);
2317 EVT VT = Op.getValueType();
2318 auto Flags = Op->getFlags();
2319 SDValue X = Op.getOperand(i: 0);
2320 SDValue Y = Op.getOperand(i: 1);
2321
2322 SDValue Div = DAG.getNode(Opcode: ISD::FDIV, DL: SL, VT, N1: X, N2: Y, Flags);
2323 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: Div, Flags);
2324 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Trunc, Flags);
2325 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2326 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Neg, N2: Y, N3: X, Flags);
2327}
2328
2329SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2330 SDLoc SL(Op);
2331 SDValue Src = Op.getOperand(i: 0);
2332
2333 // result = trunc(src)
2334 // if (src > 0.0 && src != result)
2335 // result += 1.0
2336
2337 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2338
2339 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2340 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2341
2342 EVT SetCCVT =
2343 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2344
2345 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOGT);
2346 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2347 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2348
2349 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2350 // TODO: Should this propagate fast-math-flags?
2351 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2352}
2353
2354static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2355 SelectionDAG &DAG) {
2356 const unsigned FractBits = 52;
2357 const unsigned ExpBits = 11;
2358
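// The biased exponent occupies bits [62:52] of an f64; given only the high 32
// bits, extract ExpBits starting at bit 52 - 32 = 20 and subtract the bias of
// 1023.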
2359 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2360 Hi,
2361 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2362 DAG.getConstant(ExpBits, SL, MVT::i32));
2363 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2364 DAG.getConstant(1023, SL, MVT::i32));
2365
2366 return Exp;
2367}
2368
2369SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2370 SDLoc SL(Op);
2371 SDValue Src = Op.getOperand(i: 0);
2372
2373 assert(Op.getValueType() == MVT::f64);
2374
2375 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2376
2377 // Extract the upper half, since this is where we will find the sign and
2378 // exponent.
2379 SDValue Hi = getHiHalf64(Op: Src, DAG);
2380
2381 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2382
2383 const unsigned FractBits = 52;
2384
2385 // Extract the sign bit.
2386 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2387 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2388
2389 // Extend back to 64-bits.
2390 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2391 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2392
2393 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2394 const SDValue FractMask
2395 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2396
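// Shifting FractMask right arithmetically by the unbiased exponent leaves a
// mask of the fractional mantissa bits; AND-ing with its complement clears
// them, leaving only the integer part of the value.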
2397 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2398 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2399 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2400
2401 EVT SetCCVT =
2402 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2403
2404 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2405
2406 SDValue ExpLt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: Zero, Cond: ISD::SETLT);
2407 SDValue ExpGt51 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: FiftyOne, Cond: ISD::SETGT);
2408
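// Exp < 0 means |Src| < 1.0, so truncation gives +/-0.0 (just the sign bit).
// Exp > 51 means the value is already an integer and is returned unchanged.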
2409 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2410 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2411
2412 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2413}
2414
2415SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2416 SelectionDAG &DAG) const {
2417 SDLoc SL(Op);
2418 SDValue Src = Op.getOperand(i: 0);
2419
2420 assert(Op.getValueType() == MVT::f64);
2421
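// Adding and then subtracting 2^52 with Src's sign forces rounding to the
// nearest integer (ties to even), since f64 has 52 fraction bits. Values with
// magnitude above 0x1.fffffffffffffp+51 are already integral and are passed
// through unchanged by the final select.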
2422 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2423 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2424 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2425
2426 // TODO: Should this propagate fast-math-flags?
2427
2428 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2429 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2430
2431 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2432
2433 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2434 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2435
2436 EVT SetCCVT =
2437 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2438 SDValue Cond = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Fabs, RHS: C2, Cond: ISD::SETOGT);
2439
2440 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2441}
2442
2443SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2444 SelectionDAG &DAG) const {
2445 // FNEARBYINT and FRINT are the same, except in their handling of FP
2446 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2447 // rint, so just treat them as equivalent.
2448 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT: Op.getValueType(),
2449 Operand: Op.getOperand(i: 0));
2450}
2451
2452SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2453 auto VT = Op.getValueType();
2454 auto Arg = Op.getOperand(i: 0u);
2455 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT, Operand: Arg);
2456}
2457
2458// XXX - May require not supporting f32 denormals?
2459
2460// Don't handle v2f16. The extra instructions to scalarize and repack around the
2461// compare and vselect end up producing worse code than scalarizing the whole
2462// operation.
2463SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2464 SDLoc SL(Op);
2465 SDValue X = Op.getOperand(i: 0);
2466 EVT VT = Op.getValueType();
2467
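// round(x) == trunc(x) + copysign(|x - trunc(x)| >= 0.5 ? 1.0 : 0.0, x)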
2468 SDValue T = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: X);
2469
2470 // TODO: Should this propagate fast-math-flags?
2471
2472 SDValue Diff = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: T);
2473
2474 SDValue AbsDiff = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Diff);
2475
2476 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2477 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2478
2479 EVT SetCCVT =
2480 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2481
2482 const SDValue Half = DAG.getConstantFP(Val: 0.5, DL: SL, VT);
2483 SDValue Cmp = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsDiff, RHS: Half, Cond: ISD::SETOGE);
2484 SDValue OneOrZeroFP = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cmp, N2: One, N3: Zero);
2485
2486 SDValue SignedOffset = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT, N1: OneOrZeroFP, N2: X);
2487 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: T, N2: SignedOffset);
2488}
2489
2490SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2491 SDLoc SL(Op);
2492 SDValue Src = Op.getOperand(i: 0);
2493
2494 // result = trunc(src);
2495 // if (src < 0.0 && src != result)
2496 // result += -1.0.
2497
2498 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2499
2500 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2501 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2502
2503 EVT SetCCVT =
2504 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2505
2506 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOLT);
2507 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2508 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2509
2510 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2511 // TODO: Should this propagate fast-math-flags?
2512 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2513}
2514
2515/// Return true if it's known that \p Src can never be an f32 denormal value.
2516static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2517 switch (Src.getOpcode()) {
2518 case ISD::FP_EXTEND:
2519 return Src.getOperand(0).getValueType() == MVT::f16;
2520 case ISD::FP16_TO_FP:
2521 case ISD::FFREXP:
2522 return true;
2523 case ISD::INTRINSIC_WO_CHAIN: {
2524 unsigned IntrinsicID = Src.getConstantOperandVal(i: 0);
2525 switch (IntrinsicID) {
2526 case Intrinsic::amdgcn_frexp_mant:
2527 return true;
2528 default:
2529 return false;
2530 }
2531 }
2532 default:
2533 return false;
2534 }
2535
2536 llvm_unreachable("covered opcode switch");
2537}
2538
2539bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
2540 SDNodeFlags Flags) {
2541 if (Flags.hasApproximateFuncs())
2542 return true;
2543 auto &Options = DAG.getTarget().Options;
2544 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2545}
2546
2547bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
2548 SDValue Src,
2549 SDNodeFlags Flags) {
2550 return !valueIsKnownNeverF32Denorm(Src) &&
2551 DAG.getMachineFunction()
2552 .getDenormalMode(FPType: APFloat::IEEEsingle())
2553 .Input != DenormalMode::PreserveSign;
2554}
2555
2556SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2557 SDValue Src,
2558 SDNodeFlags Flags) const {
2559 SDLoc SL(Src);
2560 EVT VT = Src.getValueType();
2561 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2562 SDValue SmallestNormal =
2563 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2564
2565 // Want to scale denormals up, but negatives and 0 work just as well on the
2566 // scaled path.
2567 SDValue IsLtSmallestNormal = DAG.getSetCC(
2568 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2569 RHS: SmallestNormal, Cond: ISD::SETOLT);
2570
2571 return IsLtSmallestNormal;
2572}
2573
2574SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2575 SDNodeFlags Flags) const {
2576 SDLoc SL(Src);
2577 EVT VT = Src.getValueType();
2578 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2579 SDValue Inf = DAG.getConstantFP(Val: APFloat::getInf(Sem: Semantics), DL: SL, VT);
2580
2581 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Src, Flags);
2582 SDValue IsFinite = DAG.getSetCC(
2583 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Fabs,
2584 RHS: Inf, Cond: ISD::SETOLT);
2585 return IsFinite;
2586}
2587
2588/// If denormal handling is required, return the scaled input to FLOG2 and the
2589/// check for the denormal range. Otherwise, return null values.
2590std::pair<SDValue, SDValue>
2591AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2592 SDValue Src, SDNodeFlags Flags) const {
2593 if (!needsDenormHandlingF32(DAG, Src, Flags))
2594 return {};
2595
2596 MVT VT = MVT::f32;
2597 const fltSemantics &Semantics = APFloat::IEEEsingle();
2598 SDValue SmallestNormal =
2599 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2600
2601 SDValue IsLtSmallestNormal = DAG.getSetCC(
2602 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2603 RHS: SmallestNormal, Cond: ISD::SETOLT);
2604
2605 SDValue Scale32 = DAG.getConstantFP(Val: 0x1.0p+32, DL: SL, VT);
2606 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2607 SDValue ScaleFactor =
2608 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: Scale32, N3: One, Flags);
2609
2610 SDValue ScaledInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Src, N2: ScaleFactor, Flags);
2611 return {ScaledInput, IsLtSmallestNormal};
2612}
2613
2614SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2615 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2616 // If we have to handle denormals, scale up the input and adjust the result.
2617
2618 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2619 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2620
2621 SDLoc SL(Op);
2622 EVT VT = Op.getValueType();
2623 SDValue Src = Op.getOperand(i: 0);
2624 SDNodeFlags Flags = Op->getFlags();
2625
2626 if (VT == MVT::f16) {
2627 // Nothing in half is a denormal when promoted to f32.
2628 assert(!Subtarget->has16BitInsts());
2629 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2630 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2631 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2632 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2633 }
2634
2635 auto [ScaledInput, IsLtSmallestNormal] =
2636 getScaledLogInput(DAG, SL, Src, Flags);
2637 if (!ScaledInput)
2638 return DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: Src, Flags);
2639
2640 SDValue Log2 = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
2641
2642 SDValue ThirtyTwo = DAG.getConstantFP(Val: 32.0, DL: SL, VT);
2643 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2644 SDValue ResultOffset =
2645 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: ThirtyTwo, N3: Zero);
2646 return DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: Log2, N2: ResultOffset, Flags);
2647}
2648
2649static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2650 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2651 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Y, Flags);
2652 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: C, Flags);
2653}
2654
2655SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2656 SelectionDAG &DAG) const {
2657 SDValue X = Op.getOperand(i: 0);
2658 EVT VT = Op.getValueType();
2659 SDNodeFlags Flags = Op->getFlags();
2660 SDLoc DL(Op);
2661
2662 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2663 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2664
2665 const auto &Options = getTargetMachine().Options;
2666 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2667 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2668
2669 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2670 // Log and multiply in f32 is good enough for f16.
2671 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2672 }
2673
2674 SDValue Lowered = LowerFLOGUnsafe(Op: X, SL: DL, DAG, IsLog10, Flags);
2675 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2676 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2677 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2678 }
2679
2680 return Lowered;
2681 }
2682
2683 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL: DL, Src: X, Flags);
2684 if (ScaledInput)
2685 X = ScaledInput;
2686
2687 SDValue Y = DAG.getNode(Opcode: AMDGPUISD::LOG, DL, VT, Operand: X, Flags);
2688
2689 SDValue R;
2690 if (Subtarget->hasFastFMAF32()) {
2691 // c + cc is ln(2)/ln(10) to more than 49 bits
2692 const float c_log10 = 0x1.344134p-2f;
2693 const float cc_log10 = 0x1.09f79ep-26f;
2694
2695 // c + cc is ln(2) to more than 49 bits
2696 const float c_log = 0x1.62e42ep-1f;
2697 const float cc_log = 0x1.efa39ep-25f;
2698
2699 SDValue C = DAG.getConstantFP(Val: IsLog10 ? c_log10 : c_log, DL, VT);
2700 SDValue CC = DAG.getConstantFP(Val: IsLog10 ? cc_log10 : cc_log, DL, VT);
2701
2702 R = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Y, N2: C, Flags);
2703 SDValue NegR = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: R, Flags);
2704 SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: C, N3: NegR, Flags);
2705 SDValue FMA1 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: CC, N3: FMA0, Flags);
2706 R = DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: R, N2: FMA1, Flags);
2707 } else {
2708 // ch+ct is ln(2)/ln(10) to more than 36 bits
2709 const float ch_log10 = 0x1.344000p-2f;
2710 const float ct_log10 = 0x1.3509f6p-18f;
2711
2712 // ch + ct is ln(2) to more than 36 bits
2713 const float ch_log = 0x1.62e000p-1f;
2714 const float ct_log = 0x1.0bfbe8p-15f;
2715
2716 SDValue CH = DAG.getConstantFP(Val: IsLog10 ? ch_log10 : ch_log, DL, VT);
2717 SDValue CT = DAG.getConstantFP(Val: IsLog10 ? ct_log10 : ct_log, DL, VT);
2718
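// Split Y into a high part with only the top mantissa bits and a low-order
// remainder, so that the YH * CH product is exact in f32; the smaller
// partial products are then folded in with mads.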
2719 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2720 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2721 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2722 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2723 SDValue YT = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: Y, N2: YH, Flags);
2724
2725 SDValue YTCT = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: YT, N2: CT, Flags);
2726 SDValue Mad0 = getMad(DAG, SL: DL, VT, X: YH, Y: CT, C: YTCT, Flags);
2727 SDValue Mad1 = getMad(DAG, SL: DL, VT, X: YT, Y: CH, C: Mad0, Flags);
2728 R = getMad(DAG, SL: DL, VT, X: YH, Y: CH, C: Mad1);
2729 }
2730
2731 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2732 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2733
2734 // TODO: Check if known finite from source value.
2735 if (!IsFiniteOnly) {
2736 SDValue IsFinite = getIsFinite(DAG, Src: Y, Flags);
2737 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsFinite, N2: R, N3: Y, Flags);
2738 }
2739
2740 if (IsScaled) {
2741 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
2742 SDValue ShiftK =
2743 DAG.getConstantFP(Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2744 SDValue Shift =
2745 DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsScaled, N2: ShiftK, N3: Zero, Flags);
2746 R = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: R, N2: Shift, Flags);
2747 }
2748
2749 return R;
2750}
2751
2752SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2753 return LowerFLOGCommon(Op, DAG);
2754}
2755
2756// Do f32 fast math expansion for flog or flog10. This is accurate enough for a
2757// promoted f16 operation.
2758SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2759 SelectionDAG &DAG, bool IsLog10,
2760 SDNodeFlags Flags) const {
2761 EVT VT = Src.getValueType();
2762 unsigned LogOp =
2763 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2764
2765 double Log2BaseInverted =
2766 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2767
2768 if (VT == MVT::f32) {
2769 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2770 if (ScaledInput) {
2771 SDValue LogSrc = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
2772 SDValue ScaledResultOffset =
2773 DAG.getConstantFP(Val: -32.0 * Log2BaseInverted, DL: SL, VT);
2774
2775 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL: SL, VT);
2776
2777 SDValue ResultOffset = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsScaled,
2778 N2: ScaledResultOffset, N3: Zero, Flags);
2779
2780 SDValue Log2Inv = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);
2781
2782 if (Subtarget->hasFastFMAF32())
2783 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: LogSrc, N2: Log2Inv, N3: ResultOffset,
2784 Flags);
2785 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LogSrc, N2: Log2Inv, Flags);
2786 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: ResultOffset);
2787 }
2788 }
2789
2790 SDValue Log2Operand = DAG.getNode(Opcode: LogOp, DL: SL, VT, Operand: Src, Flags);
2791 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);
2792
2793 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Log2Operand, N2: Log2BaseInvertedOperand,
2794 Flags);
2795}
2796
2797SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2798 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2799 // If we have to handle denormals, scale up the input and adjust the result.
2800
2801 SDLoc SL(Op);
2802 EVT VT = Op.getValueType();
2803 SDValue Src = Op.getOperand(i: 0);
2804 SDNodeFlags Flags = Op->getFlags();
2805
2806 if (VT == MVT::f16) {
2807 // Nothing in half is a denormal when promoted to f32.
2808 assert(!Subtarget->has16BitInsts());
2809 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2810 SDValue Exp = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2811 return DAG.getNode(ISD::FP_ROUND, SL, VT, Exp,
2812 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2813 }
2814
2815 assert(VT == MVT::f32);
2816
2817 if (!needsDenormHandlingF32(DAG, Src, Flags))
2818 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2819
2820 // bool needs_scaling = x < -0x1.f80000p+6f;
2821 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2822
2823 // -126.0; for inputs below this, v_exp_f32 would produce a denormal result.
2824 SDValue RangeCheckConst = DAG.getConstantFP(Val: -0x1.f80000p+6f, DL: SL, VT);
2825
2826 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2827
2828 SDValue NeedsScaling =
2829 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: RangeCheckConst, Cond: ISD::SETOLT);
2830
2831 SDValue SixtyFour = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
2832 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2833
2834 SDValue AddOffset =
2835 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: SixtyFour, N3: Zero);
2836
2837 SDValue AddInput = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Src, N2: AddOffset, Flags);
2838 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: AddInput, Flags);
2839
2840 SDValue TwoExpNeg64 = DAG.getConstantFP(Val: 0x1.0p-64f, DL: SL, VT);
2841 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2842 SDValue ResultScale =
2843 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: TwoExpNeg64, N3: One);
2844
2845 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScale, Flags);
2846}
2847
2848SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2849 SelectionDAG &DAG,
2850 SDNodeFlags Flags) const {
2851 EVT VT = X.getValueType();
2852 const SDValue Log2E = DAG.getConstantFP(Val: numbers::log2e, DL: SL, VT);
2853
2854 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2855 // exp2(M_LOG2E_F * f);
2856 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Log2E, Flags);
2857 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2858 : (unsigned)ISD::FEXP2,
2859 SL, VT, Mul, Flags);
2860 }
2861
2862 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2863
2864 SDValue Threshold = DAG.getConstantFP(Val: -0x1.5d58a0p+6f, DL: SL, VT);
2865 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
2866
2867 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
2868
2869 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
2870
2871 SDValue AdjustedX =
2872 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
2873
2874 SDValue ExpInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: Log2E, Flags);
2875
2876 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: ExpInput, Flags);
2877
2878 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.969d48p-93f, DL: SL, VT);
2879 SDValue AdjustedResult =
2880 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScaleFactor, Flags);
2881
2882 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: Exp2,
2883 Flags);
2884}
2885
2886/// Emit an approx-funcs (afn) appropriate lowering for exp10. Inf/NaN inputs
2887/// should still be handled correctly.
2888SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2889 SelectionDAG &DAG,
2890 SDNodeFlags Flags) const {
2891 const EVT VT = X.getValueType();
2892 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2893
2894 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2895 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2896 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
2897 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
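    // For reference: K0 + K1 together approximate log2(10) ~= 3.3219281; K0
    // carries the high-order bits and K1 the low-order remainder, so
    // exp2(x*K0) * exp2(x*K1) ~= exp2(x*log2(10)) = exp10(x).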
2898
2899 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K0, Flags);
2900 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
2901 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K1, Flags);
2902 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
2903 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1);
2904 }
2905
2906 // bool s = x < -0x1.2f7030p+5f;
2907 // x += s ? 0x1.0p+5f : 0.0f;
2908 // exp10 = exp2(x * 0x1.a92000p+1f) *
2909 // exp2(x * 0x1.4f0978p-11f) *
2910 // (s ? 0x1.9f623ep-107f : 1.0f);
2911
2912 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2913
2914 SDValue Threshold = DAG.getConstantFP(Val: -0x1.2f7030p+5f, DL: SL, VT);
2915 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
2916
2917 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+5f, DL: SL, VT);
2918 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
2919 SDValue AdjustedX =
2920 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
2921
2922 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
2923 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
2924
2925 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K0, Flags);
2926 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
2927 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K1, Flags);
2928 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
2929
2930 SDValue MulExps = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1, Flags);
2931
2932 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.9f623ep-107f, DL: SL, VT);
2933 SDValue AdjustedResult =
2934 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: MulExps, N2: ResultScaleFactor, Flags);
2935
2936 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: MulExps,
2937 Flags);
2938}
2939
2940SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2941 EVT VT = Op.getValueType();
2942 SDLoc SL(Op);
2943 SDValue X = Op.getOperand(i: 0);
2944 SDNodeFlags Flags = Op->getFlags();
2945 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2946
2947 if (VT.getScalarType() == MVT::f16) {
2948 // v_exp_f16 (fmul x, log2e)
2949 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2950 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2951
2952 if (VT.isVector())
2953 return SDValue();
2954
2955 // exp(f16 x) ->
2956 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2957
2958 // Nothing in half is a denormal when promoted to f32.
2959 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2960 SDValue Lowered = lowerFEXPUnsafe(X: Ext, SL, DAG, Flags);
2961 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
2962 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2963 }
2964
2965 assert(VT == MVT::f32);
2966
2967 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
2968 // library behavior. Also, is known-not-daz source sufficient?
2969 if (allowApproxFunc(DAG, Flags)) {
2970 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
2971 : lowerFEXPUnsafe(X, SL, DAG, Flags);
2972 }
2973
2974 // Algorithm:
2975 //
2976 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
2977 //
2978 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
2979 // n = 64*m + j, 0 <= j < 64
2980 //
2981 // e^x = 2^((64*m + j + f)/64)
2982 // = (2^m) * (2^(j/64)) * 2^(f/64)
2983 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
2984 //
2985 // f = x*(64/ln(2)) - n
2986 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
2987 //
2988 // e^x = (2^m) * (2^(j/64)) * e^r
2989 //
2990 // (2^(j/64)) is precomputed
2991 //
2992 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2993 // e^r = 1 + q
2994 //
2995 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2996 //
2997 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
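  //
  // For illustration only: the DAG built below uses a simplified variant of
  // this reduction (no 2^(j/64) table; v_exp_f32 handles the fractional
  // part), roughly equivalent to the following scalar sketch of the fast-FMA
  // path (names are purely expository):
  //
  //   float ph = x * c;                      // c ~= log2(e) (or log2(10))
  //   float pl = fma(x, cc, fma(x, c, -ph)); // low-order correction term
  //   float e  = roundeven(ph);              // integer part n
  //   float r  = (ph - e) + pl;              // reduced argument
  //   return ldexp(exp2(r), (int)e);         // 2^r scaled back by 2^n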
2998 SDNodeFlags FlagsNoContract = Flags;
2999 FlagsNoContract.setAllowContract(false);
3000
3001 SDValue PH, PL;
3002 if (Subtarget->hasFastFMAF32()) {
3003 const float c_exp = numbers::log2ef;
3004 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3005 const float c_exp10 = 0x1.a934f0p+1f;
3006 const float cc_exp10 = 0x1.2f346ep-24f;
3007
3008 SDValue C = DAG.getConstantFP(Val: IsExp10 ? c_exp10 : c_exp, DL: SL, VT);
3009 SDValue CC = DAG.getConstantFP(Val: IsExp10 ? cc_exp10 : cc_exp, DL: SL, VT);
3010
3011 PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: C, Flags);
3012 SDValue NegPH = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: PH, Flags);
3013 SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: C, N3: NegPH, Flags);
3014 PL = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: CC, N3: FMA0, Flags);
3015 } else {
3016 const float ch_exp = 0x1.714000p+0f;
3017 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3018
3019 const float ch_exp10 = 0x1.a92000p+1f;
3020 const float cl_exp10 = 0x1.4f0978p-11f;
3021
3022 SDValue CH = DAG.getConstantFP(Val: IsExp10 ? ch_exp10 : ch_exp, DL: SL, VT);
3023 SDValue CL = DAG.getConstantFP(Val: IsExp10 ? cl_exp10 : cl_exp, DL: SL, VT);
3024
3025 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3026 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3027 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3028 SDValue XH = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: XHAsInt);
3029 SDValue XL = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: XH, Flags);
3030
3031 PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XH, N2: CH, Flags);
3032
3033 SDValue XLCL = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XL, N2: CL, Flags);
3034 SDValue Mad0 = getMad(DAG, SL, VT, X: XL, Y: CH, C: XLCL, Flags);
3035 PL = getMad(DAG, SL, VT, X: XH, Y: CL, C: Mad0, Flags);
3036 }
3037
3038 SDValue E = DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SL, VT, Operand: PH, Flags);
3039
3040 // It is unsafe to contract this fsub into the PH multiply.
3041 SDValue PHSubE = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: PH, N2: E, Flags: FlagsNoContract);
3042
3043 SDValue A = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: PHSubE, N2: PL, Flags);
3044 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3045 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: A, Flags);
3046
3047 SDValue R = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: Exp2, N2: IntE, Flags);
3048
3049 SDValue UnderflowCheckConst =
3050 DAG.getConstantFP(Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, DL: SL, VT);
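  // For reference, these are roughly log10(0x1.0p-149) ~= -44.85 and
  // ln(0x1.0p-149) ~= -103.28; below these inputs the result is too small to
  // represent even as a denormal and flushes to zero.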
3051
3052 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3053 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
3054 SDValue Underflow =
3055 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: UnderflowCheckConst, Cond: ISD::SETOLT);
3056
3057 R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Underflow, N2: Zero, N3: R);
3058 const auto &Options = getTargetMachine().Options;
3059
3060 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3061 SDValue OverflowCheckConst =
3062 DAG.getConstantFP(Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, DL: SL, VT);
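    // For reference, these are roughly log10(0x1.0p+128) ~= 38.53 and
    // ln(0x1.0p+128) ~= 88.72; above these inputs the result exceeds the
    // largest finite f32.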
3063 SDValue Overflow =
3064 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: OverflowCheckConst, Cond: ISD::SETOGT);
3065 SDValue Inf =
3066 DAG.getConstantFP(Val: APFloat::getInf(Sem: APFloat::IEEEsingle()), DL: SL, VT);
3067 R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Overflow, N2: Inf, N3: R);
3068 }
3069
3070 return R;
3071}
3072
3073static bool isCtlzOpc(unsigned Opc) {
3074 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3075}
3076
3077static bool isCttzOpc(unsigned Opc) {
3078 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3079}
3080
3081SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3082 SelectionDAG &DAG) const {
3083 auto SL = SDLoc(Op);
3084 auto Arg = Op.getOperand(i: 0u);
3085 auto ResultVT = Op.getValueType();
3086
3087 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3088 return {};
3089
3090 assert(isCtlzOpc(Op.getOpcode()));
3091 assert(ResultVT == Arg.getValueType());
3092
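  // For example, for an i16 input 0x00f0: the zero-extended i32 value
  // 0x000000f0 has ctlz = 24, and subtracting the 16 leading zeroes
  // introduced by the extension gives 8, which is ctlz of the original i16.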
3093 auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
3094 auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
3095 auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3096 NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
3097 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
3098 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3099}
3100
3101SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3102 SDLoc SL(Op);
3103 SDValue Src = Op.getOperand(i: 0);
3104
3105 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3106 bool Ctlz = isCtlzOpc(Opc: Op.getOpcode());
3107 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3108
3109 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3110 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3111 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3112
3113 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3114 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3115 // (cttz hi:lo) -> (umin (ffbl src), 32)
3116 // (ctlz_zero_undef src) -> (ffbh src)
3117 // (cttz_zero_undef src) -> (ffbl src)
3118
3119 // The 64-bit scalar version produces a 32-bit result:
3120 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3121 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3122 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3123 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3124 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3125 if (!ZeroUndef) {
3126 const SDValue ConstVal = DAG.getConstant(
3127 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3128 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3129 }
3130 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: Src.getValueType(), Operand: NewOpr);
3131 }
3132
3133 SDValue Lo, Hi;
3134 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3135
3136 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3137 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3138
3139 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3140 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3141 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3142 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
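  // For example, (ctlz i64 0x0000000100000000): ffbh(hi) = 31, and since lo
  // is zero ffbh(lo) = -1, so uaddsat(-1, 32) saturates to 0xffffffff and
  // umin3(31, 0xffffffff, 64) = 31.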
3143
3144 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3145 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3146 if (Ctlz)
3147 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3148 else
3149 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3150
3151 SDValue NewOpr;
3152 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3153 if (!ZeroUndef) {
3154 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3155 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3156 }
3157
3158 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3159}
3160
3161SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3162 bool Signed) const {
3163 // The regular method of converting a 64-bit integer to a float roughly
3164 // consists of 2 steps: normalization and rounding. In fact, after
3165 // normalization, the conversion from a 64-bit integer to a float is
3166 // essentially the same as the one from a 32-bit integer. The only difference
3167 // is that it has more trailing bits to be rounded. To leverage the native
3168 // 32-bit conversion, a 64-bit integer is preprocessed to fit into a 32-bit
3169 // integer and then converted into the correct float number. The basic steps
3170 // for the unsigned conversion are illustrated in the following pseudo code:
3171 //
3172 // f32 uitofp(i64 u) {
3173 // i32 hi, lo = split(u);
3174 // // Only count the leading zeros in hi as we have native support of the
3175 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3176 // // reduced to a 32-bit one automatically.
3177 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3178 // u <<= shamt;
3179 // hi, lo = split(u);
3180 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3181 // // convert it as a 32-bit integer and scale the result back.
3182 // return uitofp(hi) * 2^(32 - shamt);
3183 // }
3184 //
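  // For example, u = 2^40: hi = 0x100, lo = 0, so shamt = clz(hi) = 23; after
  // the shift hi = 0x80000000, and uitofp(hi) = 2^31 scaled back by
  // 2^(32 - 23) = 2^9 gives 2^40 exactly.
  //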
3185 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3186 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3187 // converted instead, followed by negation based on the original sign bit.
3188
3189 SDLoc SL(Op);
3190 SDValue Src = Op.getOperand(i: 0);
3191
3192 SDValue Lo, Hi;
3193 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3194 SDValue Sign;
3195 SDValue ShAmt;
3196 if (Signed && Subtarget->isGCN()) {
3197 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3198 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3199 // account. That is, the maximal shift is
3200 // - 32 if Lo and Hi have opposite signs;
3201 // - 33 if Lo and Hi have the same sign.
3202 //
3203 // Or, MaxShAmt = 33 + OppositeSign, where
3204 //
3205 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3206 // - -1 if Lo and Hi have opposite signs; and
3207 // - 0 otherwise.
3208 //
3209 // All in all, ShAmt is calculated as
3210 //
3211 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3212 //
3213 // or
3214 //
3215 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3216 //
3217 // to reduce the critical path.
3218 SDValue OppositeSign = DAG.getNode(
3219 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3220 DAG.getConstant(31, SL, MVT::i32));
3221 SDValue MaxShAmt =
3222 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3223 OppositeSign);
3224 // Count the leading sign bits.
3225 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3226 // Unlike the unsigned conversion, the shift should be one bit less to
3227 // preserve the sign bit.
3228 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3229 DAG.getConstant(1, SL, MVT::i32));
3230 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3231 } else {
3232 if (Signed) {
3233 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3234 // absolute value first.
3235 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3236 DAG.getConstant(63, SL, MVT::i64));
3237 SDValue Abs =
3238 DAG.getNode(ISD::XOR, SL, MVT::i64,
3239 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3240 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Abs, DAG);
3241 }
3242 // Count the leading zeros.
3243 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3244 // The shift amount for signed integers is [0, 32].
3245 }
3246 // Normalize the given 64-bit integer.
3247 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3248 // Split it again.
3249 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Norm, DAG);
3250 // Calculate the adjust bit for rounding.
3251 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3252 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3253 DAG.getConstant(1, SL, MVT::i32), Lo);
3254 // Get the 32-bit normalized integer.
3255 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3256 // Convert the normalized 32-bit integer into f32.
3257 unsigned Opc =
3258 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3259 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3260
3261 // Finally, need to scale back the converted floating number as the original
3262 // 64-bit integer is converted as a 32-bit one.
3263 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3264 ShAmt);
3265 // On GCN, use LDEXP directly.
3266 if (Subtarget->isGCN())
3267 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3268
3269 // Otherwise, shift 'ShAmt' into the exponent position and add it to the
3270 // exponent field directly to emulate the multiplication by 2^ShAmt. The
3271 // 8-bit exponent field has enough headroom to avoid overflowing into the sign bit.
3272 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3273 DAG.getConstant(23, SL, MVT::i32));
3274 SDValue IVal =
3275 DAG.getNode(ISD::ADD, SL, MVT::i32,
3276 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3277 if (Signed) {
3278 // Set the sign bit.
3279 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3280 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3281 DAG.getConstant(31, SL, MVT::i32));
3282 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3283 }
3284 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3285}
3286
3287SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3288 bool Signed) const {
3289 SDLoc SL(Op);
3290 SDValue Src = Op.getOperand(i: 0);
3291
3292 SDValue Lo, Hi;
3293 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3294
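  // Conceptually the result is ldexp((f64)hi, 32) + (f64)(u32)lo, where hi is
  // treated as signed or unsigned depending on the conversion; both partial
  // conversions are exact in f64, so only the final add rounds.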
3295 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3296 SL, MVT::f64, Hi);
3297
3298 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3299
3300 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3301 DAG.getConstant(32, SL, MVT::i32));
3302 // TODO: Should this propagate fast-math-flags?
3303 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3304}
3305
3306SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3307 SelectionDAG &DAG) const {
3308 // TODO: Factor out code common with LowerSINT_TO_FP.
3309 EVT DestVT = Op.getValueType();
3310 SDValue Src = Op.getOperand(i: 0);
3311 EVT SrcVT = Src.getValueType();
3312
3313 if (SrcVT == MVT::i16) {
3314 if (DestVT == MVT::f16)
3315 return Op;
3316 SDLoc DL(Op);
3317
3318 // Promote src to i32
3319 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3320 return DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3321 }
3322
3323 if (DestVT == MVT::bf16) {
3324 SDLoc SL(Op);
3325 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3326 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3327 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3328 }
3329
3330 if (SrcVT != MVT::i64)
3331 return Op;
3332
3333 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3334 SDLoc DL(Op);
3335
3336 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3337 SDValue FPRoundFlag =
3338 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3339 SDValue FPRound =
3340 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3341
3342 return FPRound;
3343 }
3344
3345 if (DestVT == MVT::f32)
3346 return LowerINT_TO_FP32(Op, DAG, Signed: false);
3347
3348 assert(DestVT == MVT::f64);
3349 return LowerINT_TO_FP64(Op, DAG, Signed: false);
3350}
3351
3352SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3353 SelectionDAG &DAG) const {
3354 EVT DestVT = Op.getValueType();
3355
3356 SDValue Src = Op.getOperand(i: 0);
3357 EVT SrcVT = Src.getValueType();
3358
3359 if (SrcVT == MVT::i16) {
3360 if (DestVT == MVT::f16)
3361 return Op;
3362
3363 SDLoc DL(Op);
3364 // Promote src to i32
3365 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3366 return DAG.getNode(Opcode: ISD::SINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3367 }
3368
3369 if (DestVT == MVT::bf16) {
3370 SDLoc SL(Op);
3371 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3372 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3373 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3374 }
3375
3376 if (SrcVT != MVT::i64)
3377 return Op;
3378
3379 // TODO: Factor out code common with LowerUINT_TO_FP.
3380
3381 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3382 SDLoc DL(Op);
3383 SDValue Src = Op.getOperand(i: 0);
3384
3385 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3386 SDValue FPRoundFlag =
3387 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3388 SDValue FPRound =
3389 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3390
3391 return FPRound;
3392 }
3393
3394 if (DestVT == MVT::f32)
3395 return LowerINT_TO_FP32(Op, DAG, Signed: true);
3396
3397 assert(DestVT == MVT::f64);
3398 return LowerINT_TO_FP64(Op, DAG, Signed: true);
3399}
3400
3401SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3402 bool Signed) const {
3403 SDLoc SL(Op);
3404
3405 SDValue Src = Op.getOperand(i: 0);
3406 EVT SrcVT = Src.getValueType();
3407
3408 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3409
3410 // The basic idea of converting a floating point number into a pair of 32-bit
3411 // integers is illustrated as follows:
3412 //
3413 // tf := trunc(val);
3414 // hif := floor(tf * 2^-32);
3415 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3416 // hi := fptoi(hif);
3417 // lo := fptoi(lof);
3418 //
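  // For example, val = 0x1.8p+32 (6442450944.0): hif = 1.0 and lof = 0x1.0p+31,
  // so hi = 1 and lo = 0x80000000, reassembling to 0x0000000180000000.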
3419 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: SrcVT, Operand: Src);
3420 SDValue Sign;
3421 if (Signed && SrcVT == MVT::f32) {
3422 // However, a 32-bit floating point number has only a 23-bit mantissa, which
3423 // is not enough to hold all the significant bits of `lof` if val is
3424 // negative. To avoid the loss of precision, we take the absolute value
3425 // after truncating and flip the result back based on the original
3426 // signedness.
3427 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3428 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3429 DAG.getConstant(31, SL, MVT::i32));
3430 Trunc = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: SrcVT, Operand: Trunc);
3431 }
3432
3433 SDValue K0, K1;
3434 if (SrcVT == MVT::f64) {
3435 K0 = DAG.getConstantFP(
3436 Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), DL: SL,
3437 VT: SrcVT);
3438 K1 = DAG.getConstantFP(
3439 Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), DL: SL,
3440 VT: SrcVT);
3441 } else {
3442 K0 = DAG.getConstantFP(
3443 Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), DL: SL, VT: SrcVT);
3444 K1 = DAG.getConstantFP(
3445 Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), DL: SL, VT: SrcVT);
3446 }
3447 // TODO: Should this propagate fast-math-flags?
3448 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: SrcVT, N1: Trunc, N2: K0);
3449
3450 SDValue FloorMul = DAG.getNode(Opcode: ISD::FFLOOR, DL: SL, VT: SrcVT, Operand: Mul);
3451
3452 SDValue Fma = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: SrcVT, N1: FloorMul, N2: K1, N3: Trunc);
3453
3454 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3455 : ISD::FP_TO_UINT,
3456 SL, MVT::i32, FloorMul);
3457 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3458
3459 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3460 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3461
3462 if (Signed && SrcVT == MVT::f32) {
3463 assert(Sign);
3464 // Flip the result based on the signedness, which is either all 0s or 1s.
3465 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3466 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3467 // r := xor(r, sign) - sign;
3468 Result =
3469 DAG.getNode(ISD::SUB, SL, MVT::i64,
3470 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3471 }
3472
3473 return Result;
3474}
3475
3476SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3477 SDLoc DL(Op);
3478 SDValue N0 = Op.getOperand(i: 0);
3479
3480 // Convert to target node to get known bits
3481 if (N0.getValueType() == MVT::f32)
3482 return DAG.getNode(Opcode: AMDGPUISD::FP_TO_FP16, DL, VT: Op.getValueType(), Operand: N0);
3483
3484 if (getTargetMachine().Options.UnsafeFPMath) {
3485 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3486 return SDValue();
3487 }
3488
3489 assert(N0.getSimpleValueType() == MVT::f64);
3490
3491 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3492 const unsigned ExpMask = 0x7ff;
3493 const unsigned ExpBiasf64 = 1023;
3494 const unsigned ExpBiasf16 = 15;
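  // For reference: f64 is 1 sign, 11 exponent (bias 1023) and 52 mantissa
  // bits; f16 is 1 sign, 5 exponent (bias 15) and 10 mantissa bits. UH below
  // holds the upper 32 bits of the f64 value (sign, exponent and the top 20
  // mantissa bits).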
3495 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3496 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3497 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3498 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3499 DAG.getConstant(32, DL, MVT::i64));
3500 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3501 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3502 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3503 DAG.getConstant(20, DL, MVT::i64));
3504 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3505 DAG.getConstant(ExpMask, DL, MVT::i32));
3506 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3507 // add the f16 bias (15) to get the biased exponent for the f16 format.
3508 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3509 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3510
3511 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3512 DAG.getConstant(8, DL, MVT::i32));
3513 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3514 DAG.getConstant(0xffe, DL, MVT::i32));
3515
3516 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3517 DAG.getConstant(0x1ff, DL, MVT::i32));
3518 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3519
3520 SDValue Lo40Set = DAG.getSelectCC(DL, LHS: MaskedSig, RHS: Zero, True: Zero, False: One, Cond: ISD::SETEQ);
3521 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3522
3523 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3524 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3525 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3526 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3527
3528 // N = M | (E << 12);
3529 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3530 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3531 DAG.getConstant(12, DL, MVT::i32)));
3532
3533 // B = clamp(1-E, 0, 13);
3534 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3535 One, E);
3536 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3537 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3538 DAG.getConstant(13, DL, MVT::i32));
3539
3540 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3541 DAG.getConstant(0x1000, DL, MVT::i32));
3542
3543 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3544 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3545 SDValue D1 = DAG.getSelectCC(DL, LHS: D0, RHS: SigSetHigh, True: One, False: Zero, Cond: ISD::SETNE);
3546 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3547
3548 SDValue V = DAG.getSelectCC(DL, LHS: E, RHS: One, True: D, False: N, Cond: ISD::SETLT);
3549 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3550 DAG.getConstant(0x7, DL, MVT::i32));
3551 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3552 DAG.getConstant(2, DL, MVT::i32));
3553 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3554 One, Zero, ISD::SETEQ);
3555 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3556 One, Zero, ISD::SETGT);
3557 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3558 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3559
3560 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3561 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3562 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3563 I, V, ISD::SETEQ);
3564
3565 // Extract the sign bit.
3566 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3567 DAG.getConstant(16, DL, MVT::i32));
3568 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3569 DAG.getConstant(0x8000, DL, MVT::i32));
3570
3571 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3572 return DAG.getZExtOrTrunc(Op: V, DL, VT: Op.getValueType());
3573}
3574
3575SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3576 SelectionDAG &DAG) const {
3577 SDValue Src = Op.getOperand(i: 0);
3578 unsigned OpOpcode = Op.getOpcode();
3579 EVT SrcVT = Src.getValueType();
3580 EVT DestVT = Op.getValueType();
3581
3582 // Will be selected natively
3583 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3584 return Op;
3585
3586 if (SrcVT == MVT::bf16) {
3587 SDLoc DL(Op);
3588 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3589 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DestVT, Operand: PromotedSrc);
3590 }
3591
3592 // Promote i16 to i32
3593 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3594 SDLoc DL(Op);
3595
3596 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3597 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3598 }
3599
3600 if (DestVT != MVT::i64)
3601 return Op;
3602
3603 if (SrcVT == MVT::f16 ||
3604 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3605 SDLoc DL(Op);
3606
3607 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3608 unsigned Ext =
3609 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3610 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3611 }
3612
3613 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3614 return LowerFP_TO_INT64(Op, DAG, Signed: OpOpcode == ISD::FP_TO_SINT);
3615
3616 return SDValue();
3617}
3618
3619SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3620 SelectionDAG &DAG) const {
3621 EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
3622 MVT VT = Op.getSimpleValueType();
3623 MVT ScalarVT = VT.getScalarType();
3624
3625 assert(VT.isVector());
3626
3627 SDValue Src = Op.getOperand(i: 0);
3628 SDLoc DL(Op);
3629
3630 // TODO: Don't scalarize on Evergreen?
3631 unsigned NElts = VT.getVectorNumElements();
3632 SmallVector<SDValue, 8> Args;
3633 DAG.ExtractVectorElements(Op: Src, Args, Start: 0, Count: NElts);
3634
3635 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3636 for (unsigned I = 0; I < NElts; ++I)
3637 Args[I] = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ScalarVT, N1: Args[I], N2: VTOp);
3638
3639 return DAG.getBuildVector(VT, DL, Ops: Args);
3640}
3641
3642//===----------------------------------------------------------------------===//
3643// Custom DAG optimizations
3644//===----------------------------------------------------------------------===//
3645
3646static bool isU24(SDValue Op, SelectionDAG &DAG) {
3647 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3648}
3649
3650static bool isI24(SDValue Op, SelectionDAG &DAG) {
3651 EVT VT = Op.getValueType();
3652 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3653 // as unsigned 24-bit values.
3654 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3655}
3656
3657static SDValue simplifyMul24(SDNode *Node24,
3658 TargetLowering::DAGCombinerInfo &DCI) {
3659 SelectionDAG &DAG = DCI.DAG;
3660 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3661 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3662
3663 SDValue LHS = IsIntrin ? Node24->getOperand(Num: 1) : Node24->getOperand(Num: 0);
3664 SDValue RHS = IsIntrin ? Node24->getOperand(Num: 2) : Node24->getOperand(Num: 1);
3665 unsigned NewOpcode = Node24->getOpcode();
3666 if (IsIntrin) {
3667 unsigned IID = Node24->getConstantOperandVal(Num: 0);
3668 switch (IID) {
3669 case Intrinsic::amdgcn_mul_i24:
3670 NewOpcode = AMDGPUISD::MUL_I24;
3671 break;
3672 case Intrinsic::amdgcn_mul_u24:
3673 NewOpcode = AMDGPUISD::MUL_U24;
3674 break;
3675 case Intrinsic::amdgcn_mulhi_i24:
3676 NewOpcode = AMDGPUISD::MULHI_I24;
3677 break;
3678 case Intrinsic::amdgcn_mulhi_u24:
3679 NewOpcode = AMDGPUISD::MULHI_U24;
3680 break;
3681 default:
3682 llvm_unreachable("Expected 24-bit mul intrinsic");
3683 }
3684 }
3685
3686 APInt Demanded = APInt::getLowBitsSet(numBits: LHS.getValueSizeInBits(), loBitsSet: 24);
3687
3688 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3689 // the operands to have other uses, but will only perform simplifications that
3690 // involve bypassing some nodes for this user.
3691 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(Op: LHS, DemandedBits: Demanded, DAG);
3692 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(Op: RHS, DemandedBits: Demanded, DAG);
3693 if (DemandedLHS || DemandedRHS)
3694 return DAG.getNode(Opcode: NewOpcode, DL: SDLoc(Node24), VTList: Node24->getVTList(),
3695 N1: DemandedLHS ? DemandedLHS : LHS,
3696 N2: DemandedRHS ? DemandedRHS : RHS);
3697
3698 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3699 // operands if this node is the only user.
3700 if (TLI.SimplifyDemandedBits(Op: LHS, DemandedBits: Demanded, DCI))
3701 return SDValue(Node24, 0);
3702 if (TLI.SimplifyDemandedBits(Op: RHS, DemandedBits: Demanded, DCI))
3703 return SDValue(Node24, 0);
3704
3705 return SDValue();
3706}
3707
3708template <typename IntTy>
3709static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3710 uint32_t Width, const SDLoc &DL) {
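  // For example, a signed BFE of width 8 at offset 8 from 0x0000ff00 shifts
  // left by 16 (giving 0xff000000) and arithmetic-shifts right by 24,
  // yielding -1; the unsigned variant yields 0xff.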
3711 if (Width + Offset < 32) {
3712 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3713 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3714 return DAG.getConstant(Result, DL, MVT::i32);
3715 }
3716
3717 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3718}
3719
3720static bool hasVolatileUser(SDNode *Val) {
3721 for (SDNode *U : Val->uses()) {
3722 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: U)) {
3723 if (M->isVolatile())
3724 return true;
3725 }
3726 }
3727
3728 return false;
3729}
3730
3731bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3732 // i32 vectors are the canonical memory type.
3733 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3734 return false;
3735
3736 if (!VT.isByteSized())
3737 return false;
3738
3739 unsigned Size = VT.getStoreSize();
3740
3741 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3742 return false;
3743
3744 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3745 return false;
3746
3747 return true;
3748}
3749
3750// Replace a load of an illegal type with a load of a friendlier type, plus a
3751// bitcast back to the original type.
3752SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3753 DAGCombinerInfo &DCI) const {
3754 if (!DCI.isBeforeLegalize())
3755 return SDValue();
3756
3757 LoadSDNode *LN = cast<LoadSDNode>(Val: N);
3758 if (!LN->isSimple() || !ISD::isNormalLoad(N: LN) || hasVolatileUser(Val: LN))
3759 return SDValue();
3760
3761 SDLoc SL(N);
3762 SelectionDAG &DAG = DCI.DAG;
3763 EVT VT = LN->getMemoryVT();
3764
3765 unsigned Size = VT.getStoreSize();
3766 Align Alignment = LN->getAlign();
3767 if (Alignment < Size && isTypeLegal(VT)) {
3768 unsigned IsFast;
3769 unsigned AS = LN->getAddressSpace();
3770
3771 // Expand unaligned loads earlier than legalization. Due to visitation order
3772 // problems during legalization, the emitted instructions to pack and unpack
3773 // the bytes again are not eliminated in the case of an unaligned copy.
3774 if (!allowsMisalignedMemoryAccesses(
3775 VT, AddrSpace: AS, Alignment, Flags: LN->getMemOperand()->getFlags(), &IsFast)) {
3776 if (VT.isVector())
3777 return SplitVectorLoad(Op: SDValue(LN, 0), DAG);
3778
3779 SDValue Ops[2];
3780 std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: LN, DAG);
3781
3782 return DAG.getMergeValues(Ops, dl: SDLoc(N));
3783 }
3784
3785 if (!IsFast)
3786 return SDValue();
3787 }
3788
3789 if (!shouldCombineMemoryType(VT))
3790 return SDValue();
3791
3792 EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
3793
3794 SDValue NewLoad
3795 = DAG.getLoad(VT: NewVT, dl: SL, Chain: LN->getChain(),
3796 Ptr: LN->getBasePtr(), MMO: LN->getMemOperand());
3797
3798 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewLoad);
3799 DCI.CombineTo(N, Res0: BC, Res1: NewLoad.getValue(R: 1));
3800 return SDValue(N, 0);
3801}
3802
3803// Replace store of an illegal type with a store of a bitcast to a friendlier
3804// type.
3805SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3806 DAGCombinerInfo &DCI) const {
3807 if (!DCI.isBeforeLegalize())
3808 return SDValue();
3809
3810 StoreSDNode *SN = cast<StoreSDNode>(Val: N);
3811 if (!SN->isSimple() || !ISD::isNormalStore(N: SN))
3812 return SDValue();
3813
3814 EVT VT = SN->getMemoryVT();
3815 unsigned Size = VT.getStoreSize();
3816
3817 SDLoc SL(N);
3818 SelectionDAG &DAG = DCI.DAG;
3819 Align Alignment = SN->getAlign();
3820 if (Alignment < Size && isTypeLegal(VT)) {
3821 unsigned IsFast;
3822 unsigned AS = SN->getAddressSpace();
3823
3824 // Expand unaligned stores earlier than legalization. Due to visitation
3825 // order problems during legalization, the emitted instructions to pack and
3826 // unpack the bytes again are not eliminated in the case of an unaligned
3827 // copy.
3828 if (!allowsMisalignedMemoryAccesses(
3829 VT, AddrSpace: AS, Alignment, Flags: SN->getMemOperand()->getFlags(), &IsFast)) {
3830 if (VT.isVector())
3831 return SplitVectorStore(Op: SDValue(SN, 0), DAG);
3832
3833 return expandUnalignedStore(ST: SN, DAG);
3834 }
3835
3836 if (!IsFast)
3837 return SDValue();
3838 }
3839
3840 if (!shouldCombineMemoryType(VT))
3841 return SDValue();
3842
3843 EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
3844 SDValue Val = SN->getValue();
3845
3846 //DCI.AddToWorklist(Val.getNode());
3847
3848 bool OtherUses = !Val.hasOneUse();
3849 SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Val);
3850 if (OtherUses) {
3851 SDValue CastBack = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: CastVal);
3852 DAG.ReplaceAllUsesOfValueWith(From: Val, To: CastBack);
3853 }
3854
3855 return DAG.getStore(Chain: SN->getChain(), dl: SL, Val: CastVal,
3856 Ptr: SN->getBasePtr(), MMO: SN->getMemOperand());
3857}
3858
3859// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3860// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3861// issues.
3862SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3863 DAGCombinerInfo &DCI) const {
3864 SelectionDAG &DAG = DCI.DAG;
3865 SDValue N0 = N->getOperand(Num: 0);
3866
3867 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3868 // (vt2 (truncate (assertzext vt0:x, vt1)))
3869 if (N0.getOpcode() == ISD::TRUNCATE) {
3870 SDValue N1 = N->getOperand(Num: 1);
3871 EVT ExtVT = cast<VTSDNode>(Val&: N1)->getVT();
3872 SDLoc SL(N);
3873
3874 SDValue Src = N0.getOperand(i: 0);
3875 EVT SrcVT = Src.getValueType();
3876 if (SrcVT.bitsGE(VT: ExtVT)) {
3877 SDValue NewInReg = DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: SrcVT, N1: Src, N2: N1);
3878 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: N->getValueType(ResNo: 0), Operand: NewInReg);
3879 }
3880 }
3881
3882 return SDValue();
3883}
3884
3885SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3886 SDNode *N, DAGCombinerInfo &DCI) const {
3887 unsigned IID = N->getConstantOperandVal(Num: 0);
3888 switch (IID) {
3889 case Intrinsic::amdgcn_mul_i24:
3890 case Intrinsic::amdgcn_mul_u24:
3891 case Intrinsic::amdgcn_mulhi_i24:
3892 case Intrinsic::amdgcn_mulhi_u24:
3893 return simplifyMul24(Node24: N, DCI);
3894 case Intrinsic::amdgcn_fract:
3895 case Intrinsic::amdgcn_rsq:
3896 case Intrinsic::amdgcn_rcp_legacy:
3897 case Intrinsic::amdgcn_rsq_legacy:
3898 case Intrinsic::amdgcn_rsq_clamp: {
3899 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3900 SDValue Src = N->getOperand(Num: 1);
3901 return Src.isUndef() ? Src : SDValue();
3902 }
3903 case Intrinsic::amdgcn_frexp_exp: {
3904 // frexp_exp (fneg x) -> frexp_exp x
3905 // frexp_exp (fabs x) -> frexp_exp x
3906 // frexp_exp (fneg (fabs x)) -> frexp_exp x
3907 SDValue Src = N->getOperand(Num: 1);
3908 SDValue PeekSign = peekFPSignOps(Val: Src);
3909 if (PeekSign == Src)
3910 return SDValue();
3911 return SDValue(DCI.DAG.UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: PeekSign),
3912 0);
3913 }
3914 default:
3915 return SDValue();
3916 }
3917}
3918
3919/// Split the 64-bit value \p LHS into two 32-bit halves and apply the binary
3920/// operation \p Opc to each half with the constants \p ValLo and \p ValHi.
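/// For example, with \p Opc = ISD::AND, \p ValLo = 0 and \p ValHi = 0xffff, the
/// result is the i64 bitcast of (build_vector (and lo, 0), (and hi, 0xffff)).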
3921SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3922 DAGCombinerInfo &DCI, const SDLoc &SL,
3923 unsigned Opc, SDValue LHS,
3924 uint32_t ValLo, uint32_t ValHi) const {
3925 SelectionDAG &DAG = DCI.DAG;
3926 SDValue Lo, Hi;
3927 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: LHS, DAG);
3928
3929 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3930 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3931
3932 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3933 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3934
3935 // Re-visit the ands. It's possible we eliminated one of them and it could
3936 // simplify the vector.
3937 DCI.AddToWorklist(N: Lo.getNode());
3938 DCI.AddToWorklist(N: Hi.getNode());
3939
3940 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3941 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3942}
3943
3944SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3945 DAGCombinerInfo &DCI) const {
3946 EVT VT = N->getValueType(ResNo: 0);
3947
3948 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
3949 if (!RHS)
3950 return SDValue();
3951
3952 SDValue LHS = N->getOperand(Num: 0);
3953 unsigned RHSVal = RHS->getZExtValue();
3954 if (!RHSVal)
3955 return LHS;
3956
3957 SDLoc SL(N);
3958 SelectionDAG &DAG = DCI.DAG;
3959
3960 switch (LHS->getOpcode()) {
3961 default:
3962 break;
3963 case ISD::ZERO_EXTEND:
3964 case ISD::SIGN_EXTEND:
3965 case ISD::ANY_EXTEND: {
3966 SDValue X = LHS->getOperand(Num: 0);
3967
3968 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3969 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3970 // Prefer build_vector as the canonical form if packed types are legal.
3971 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3972 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3973 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3974 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3975 }
3976
3977 // shl (ext x) => zext (shl x), if shift does not overflow int
3978 if (VT != MVT::i64)
3979 break;
3980 KnownBits Known = DAG.computeKnownBits(Op: X);
3981 unsigned LZ = Known.countMinLeadingZeros();
3982 if (LZ < RHSVal)
3983 break;
3984 EVT XVT = X.getValueType();
3985 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: XVT, N1: X, N2: SDValue(RHS, 0));
3986 return DAG.getZExtOrTrunc(Op: Shl, DL: SL, VT);
3987 }
3988 }
3989
3990 if (VT != MVT::i64)
3991 return SDValue();
3992
3993 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3994
3995 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3996 // common case, splitting this into a move and a 32-bit shift is faster and
3997 // the same code size.
3998 if (RHSVal < 32)
3999 return SDValue();
4000
4001 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
4002
4003 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4004 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
4005
4006 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4007
4008 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4009 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4010}
4011
4012SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4013 DAGCombinerInfo &DCI) const {
4014 if (N->getValueType(0) != MVT::i64)
4015 return SDValue();
4016
4017 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
4018 if (!RHS)
4019 return SDValue();
4020
4021 SelectionDAG &DAG = DCI.DAG;
4022 SDLoc SL(N);
4023 unsigned RHSVal = RHS->getZExtValue();
4024
4025 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4026 if (RHSVal == 32) {
4027 SDValue Hi = getHiHalf64(Op: N->getOperand(Num: 0), DAG);
4028 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4029 DAG.getConstant(31, SL, MVT::i32));
4030
4031 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4032 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4033 }
4034
4035 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4036 if (RHSVal == 63) {
4037 SDValue Hi = getHiHalf64(Op: N->getOperand(Num: 0), DAG);
4038 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4039 DAG.getConstant(31, SL, MVT::i32));
4040 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4041 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4042 }
4043
4044 return SDValue();
4045}
4046
4047SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4048 DAGCombinerInfo &DCI) const {
4049 auto *RHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
4050 if (!RHS)
4051 return SDValue();
4052
4053 EVT VT = N->getValueType(ResNo: 0);
4054 SDValue LHS = N->getOperand(Num: 0);
4055 unsigned ShiftAmt = RHS->getZExtValue();
4056 SelectionDAG &DAG = DCI.DAG;
4057 SDLoc SL(N);
4058
4059 // Fold (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1).
4060 // This improves the ability to match BFE patterns in isel.
4061 if (LHS.getOpcode() == ISD::AND) {
4062 if (auto *Mask = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1))) {
4063 unsigned MaskIdx, MaskLen;
4064 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4065 MaskIdx == ShiftAmt) {
4066 return DAG.getNode(
4067 Opcode: ISD::AND, DL: SL, VT,
4068 N1: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 0), N2: N->getOperand(Num: 1)),
4069 N2: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 1), N2: N->getOperand(Num: 1)));
4070 }
4071 }
4072 }
4073
4074 if (VT != MVT::i64)
4075 return SDValue();
4076
4077 if (ShiftAmt < 32)
4078 return SDValue();
4079
4080 // srl i64:x, C for C >= 32
4081 // =>
4082 // build_pair (srl hi_32(x), C - 32), 0
4083 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4084
4085 SDValue Hi = getHiHalf64(Op: LHS, DAG);
4086
4087 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
4088 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
4089
4090 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
4091
4092 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
4093}
4094
4095SDValue AMDGPUTargetLowering::performTruncateCombine(
4096 SDNode *N, DAGCombinerInfo &DCI) const {
4097 SDLoc SL(N);
4098 SelectionDAG &DAG = DCI.DAG;
4099 EVT VT = N->getValueType(ResNo: 0);
4100 SDValue Src = N->getOperand(Num: 0);
4101
4102 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4103 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4104 SDValue Vec = Src.getOperand(i: 0);
4105 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4106 SDValue Elt0 = Vec.getOperand(i: 0);
4107 EVT EltVT = Elt0.getValueType();
4108 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4109 if (EltVT.isFloatingPoint()) {
4110 Elt0 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL,
4111 VT: EltVT.changeTypeToInteger(), Operand: Elt0);
4112 }
4113
4114 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Elt0);
4115 }
4116 }
4117 }
4118
4119 // Equivalent of above for accessing the high element of a vector as an
4120 // integer operation.
4121 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4122 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4123 if (auto K = isConstOrConstSplat(N: Src.getOperand(i: 1))) {
4124 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4125 SDValue BV = stripBitcast(Val: Src.getOperand(i: 0));
4126 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4127 BV.getValueType().getVectorNumElements() == 2) {
4128 SDValue SrcElt = BV.getOperand(i: 1);
4129 EVT SrcEltVT = SrcElt.getValueType();
4130 if (SrcEltVT.isFloatingPoint()) {
4131 SrcElt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL,
4132 VT: SrcEltVT.changeTypeToInteger(), Operand: SrcElt);
4133 }
4134
4135 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: SrcElt);
4136 }
4137 }
4138 }
4139 }
4140
4141 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4142 //
4143 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4144 // i16 (trunc (srl (i32 (trunc x), K)))
4145 if (VT.getScalarSizeInBits() < 32) {
4146 EVT SrcVT = Src.getValueType();
4147 if (SrcVT.getScalarSizeInBits() > 32 &&
4148 (Src.getOpcode() == ISD::SRL ||
4149 Src.getOpcode() == ISD::SRA ||
4150 Src.getOpcode() == ISD::SHL)) {
4151 SDValue Amt = Src.getOperand(i: 1);
4152 KnownBits Known = DAG.computeKnownBits(Op: Amt);
4153
4154 // - For left shifts, do the transform as long as the shift
4155 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4156 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4157 // losing information stored in the high bits when truncating.
4158 const unsigned MaxCstSize =
4159 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4160 if (Known.getMaxValue().ule(RHS: MaxCstSize)) {
4161 EVT MidVT = VT.isVector() ?
4162 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4163 VT.getVectorNumElements()) : MVT::i32;
4164
4165 EVT NewShiftVT = getShiftAmountTy(LHSTy: MidVT, DL: DAG.getDataLayout());
4166 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MidVT,
4167 Operand: Src.getOperand(i: 0));
4168 DCI.AddToWorklist(N: Trunc.getNode());
4169
4170 if (Amt.getValueType() != NewShiftVT) {
4171 Amt = DAG.getZExtOrTrunc(Op: Amt, DL: SL, VT: NewShiftVT);
4172 DCI.AddToWorklist(N: Amt.getNode());
4173 }
4174
4175 SDValue ShrunkShift = DAG.getNode(Opcode: Src.getOpcode(), DL: SL, VT: MidVT,
4176 N1: Trunc, N2: Amt);
4177 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: ShrunkShift);
4178 }
4179 }
4180 }
4181
4182 return SDValue();
4183}
4184
4185// We need to specifically handle i64 mul here to avoid unnecessary conversion
4186// instructions. If we only match on the legalized i64 mul expansion,
4187// SimplifyDemandedBits will be unable to remove them because there will be
4188// multiple uses due to the separate mul + mulh[su].
4189static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4190 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4191 if (Size <= 32) {
4192 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4193 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4194 }
4195
4196 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4197 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4198
4199 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4200 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4201
4202 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4203}
4204
4205/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4206/// return SDValue().
4207static SDValue getAddOneOp(const SDNode *V) {
4208 if (V->getOpcode() != ISD::ADD)
4209 return SDValue();
4210
4211 return isOneConstant(V: V->getOperand(Num: 1)) ? V->getOperand(Num: 0) : SDValue();
4212}
4213
4214SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4215 DAGCombinerInfo &DCI) const {
4216 assert(N->getOpcode() == ISD::MUL);
4217 EVT VT = N->getValueType(ResNo: 0);
4218
4219 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4220 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4221 // unnecessarily). isDivergent() is used as an approximation of whether the
4222 // value is in an SGPR.
4223 if (!N->isDivergent())
4224 return SDValue();
4225
4226 unsigned Size = VT.getSizeInBits();
4227 if (VT.isVector() || Size > 64)
4228 return SDValue();
4229
4230 SelectionDAG &DAG = DCI.DAG;
4231 SDLoc DL(N);
4232
4233 SDValue N0 = N->getOperand(Num: 0);
4234 SDValue N1 = N->getOperand(Num: 1);
4235
4236 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4237 // matching.
4238
4239 // mul x, (add y, 1) -> add (mul x, y), x
4240 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4241 SDValue AddOp = getAddOneOp(V: V.getNode());
4242 if (!AddOp)
4243 return SDValue();
4244
4245 if (V.hasOneUse() || all_of(Range: V->uses(), P: [](const SDNode *U) -> bool {
4246 return U->getOpcode() == ISD::MUL;
4247 }))
4248 return AddOp;
4249
4250 return SDValue();
4251 };
4252
  // FIXME: The selection pattern is not properly checking for commuted
  // operands, so we have to place the mul on the LHS.
4255 if (SDValue MulOper = IsFoldableAdd(N0)) {
4256 SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1, N2: MulOper);
4257 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N1);
4258 }
4259
4260 if (SDValue MulOper = IsFoldableAdd(N1)) {
4261 SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: N0, N2: MulOper);
4262 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N0);
4263 }
4264
4265 // There are i16 integer mul/mad.
4266 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4267 return SDValue();
4268
4269 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4270 // in the source into any_extends if the result of the mul is truncated. Since
4271 // we can assume the high bits are whatever we want, use the underlying value
  // to prevent the unknown high bits from interfering.
4273 if (N0.getOpcode() == ISD::ANY_EXTEND)
4274 N0 = N0.getOperand(i: 0);
4275
4276 if (N1.getOpcode() == ISD::ANY_EXTEND)
4277 N1 = N1.getOperand(i: 0);
4278
4279 SDValue Mul;
4280
4281 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4282 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4283 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4284 Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: false);
4285 } else if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4286 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4287 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4288 Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: true);
4289 } else {
4290 return SDValue();
4291 }
4292
  // We need to use sext even for MUL_U24, because MUL_U24 is used
  // for signed multiplies of 8- and 16-bit types.
4295 return DAG.getSExtOrTrunc(Op: Mul, DL, VT);
4296}
4297
4298SDValue
4299AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4300 DAGCombinerInfo &DCI) const {
4301 if (N->getValueType(0) != MVT::i32)
4302 return SDValue();
4303
4304 SelectionDAG &DAG = DCI.DAG;
4305 SDLoc DL(N);
4306
4307 SDValue N0 = N->getOperand(Num: 0);
4308 SDValue N1 = N->getOperand(Num: 1);
4309
4310 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4311 // in the source into any_extends if the result of the mul is truncated. Since
4312 // we can assume the high bits are whatever we want, use the underlying value
  // to prevent the unknown high bits from interfering.
4314 if (N0.getOpcode() == ISD::ANY_EXTEND)
4315 N0 = N0.getOperand(i: 0);
4316 if (N1.getOpcode() == ISD::ANY_EXTEND)
4317 N1 = N1.getOperand(i: 0);
4318
4319 // Try to use two fast 24-bit multiplies (one for each half of the result)
4320 // instead of one slow extending multiply.
4321 unsigned LoOpcode, HiOpcode;
4322 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4323 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4324 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4325 LoOpcode = AMDGPUISD::MUL_U24;
4326 HiOpcode = AMDGPUISD::MULHI_U24;
4327 } else if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4328 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4329 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4330 LoOpcode = AMDGPUISD::MUL_I24;
4331 HiOpcode = AMDGPUISD::MULHI_I24;
4332 } else {
4333 return SDValue();
4334 }
4335
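  // MUL_x24 gives the low 32 bits of the product and MULHI_x24 the high 32
  // bits; replace both results of the [SU]MUL_LOHI node at once.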
4336 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4337 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4338 DCI.CombineTo(N, Res0: Lo, Res1: Hi);
4339 return SDValue(N, 0);
4340}
4341
4342SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4343 DAGCombinerInfo &DCI) const {
4344 EVT VT = N->getValueType(ResNo: 0);
4345
4346 if (!Subtarget->hasMulI24() || VT.isVector())
4347 return SDValue();
4348
4349 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4350 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4351 // unnecessarily). isDivergent() is used as an approximation of whether the
4352 // value is in an SGPR.
4353 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4354 // valu op anyway)
4355 if (Subtarget->hasSMulHi() && !N->isDivergent())
4356 return SDValue();
4357
4358 SelectionDAG &DAG = DCI.DAG;
4359 SDLoc DL(N);
4360
4361 SDValue N0 = N->getOperand(Num: 0);
4362 SDValue N1 = N->getOperand(Num: 1);
4363
4364 if (!isI24(Op: N0, DAG) || !isI24(Op: N1, DAG))
4365 return SDValue();
4366
4367 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4368 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4369
4370 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4371 DCI.AddToWorklist(N: Mulhi.getNode());
4372 return DAG.getSExtOrTrunc(Op: Mulhi, DL, VT);
4373}
4374
4375SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4376 DAGCombinerInfo &DCI) const {
4377 EVT VT = N->getValueType(ResNo: 0);
4378
4379 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4380 return SDValue();
4381
4382 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4383 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4384 // unnecessarily). isDivergent() is used as an approximation of whether the
4385 // value is in an SGPR.
4386 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4387 // valu op anyway)
4388 if (Subtarget->hasSMulHi() && !N->isDivergent())
4389 return SDValue();
4390
4391 SelectionDAG &DAG = DCI.DAG;
4392 SDLoc DL(N);
4393
4394 SDValue N0 = N->getOperand(Num: 0);
4395 SDValue N1 = N->getOperand(Num: 1);
4396
4397 if (!isU24(Op: N0, DAG) || !isU24(Op: N1, DAG))
4398 return SDValue();
4399
4400 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4401 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4402
4403 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4404 DCI.AddToWorklist(N: Mulhi.getNode());
4405 return DAG.getZExtOrTrunc(Op: Mulhi, DL, VT);
4406}
4407
4408SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4409 SDValue Op,
4410 const SDLoc &DL,
4411 unsigned Opc) const {
4412 EVT VT = Op.getValueType();
4413 EVT LegalVT = getTypeToTransformTo(Context&: *DAG.getContext(), VT);
4414 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4415 LegalVT != MVT::i16))
4416 return SDValue();
4417
4418 if (VT != MVT::i32)
4419 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4420
4421 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4422 if (VT != MVT::i32)
4423 FFBX = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: FFBX);
4424
4425 return FFBX;
4426}
4427
4428// The native instructions return -1 on 0 input. Optimize out a select that
4429// produces -1 on 0.
4430//
4431// TODO: If zero is not undef, we could also do this if the output is compared
4432// against the bitwidth.
4433//
4434// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4435SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4436 SDValue LHS, SDValue RHS,
4437 DAGCombinerInfo &DCI) const {
4438 if (!isNullConstant(V: Cond.getOperand(i: 1)))
4439 return SDValue();
4440
4441 SelectionDAG &DAG = DCI.DAG;
4442 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
4443 SDValue CmpLHS = Cond.getOperand(i: 0);
4444
4445 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4446 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4447 if (CCOpcode == ISD::SETEQ &&
4448 (isCtlzOpc(Opc: RHS.getOpcode()) || isCttzOpc(Opc: RHS.getOpcode())) &&
4449 RHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: LHS)) {
4450 unsigned Opc =
4451 isCttzOpc(Opc: RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4452 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4453 }
4454
4455 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4456 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4457 if (CCOpcode == ISD::SETNE &&
4458 (isCtlzOpc(Opc: LHS.getOpcode()) || isCttzOpc(Opc: LHS.getOpcode())) &&
4459 LHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: RHS)) {
4460 unsigned Opc =
4461 isCttzOpc(Opc: LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4462
4463 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4464 }
4465
4466 return SDValue();
4467}
4468
4469static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4470 unsigned Op,
4471 const SDLoc &SL,
4472 SDValue Cond,
4473 SDValue N1,
4474 SDValue N2) {
4475 SelectionDAG &DAG = DCI.DAG;
4476 EVT VT = N1.getValueType();
4477
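  // N1 and N2 are both the same unary FP op (fneg or fabs, checked by the
  // caller); select between their sources and re-apply the op on the result.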
4478 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cond,
4479 N2: N1.getOperand(i: 0), N3: N2.getOperand(i: 0));
4480 DCI.AddToWorklist(N: NewSelect.getNode());
4481 return DAG.getNode(Opcode: Op, DL: SL, VT, Operand: NewSelect);
4482}
4483
4484// Pull a free FP operation out of a select so it may fold into uses.
4485//
4486// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4487// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4488//
4489// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4490// select c, (fabs x), +k -> fabs (select c, x, k)
4491SDValue
4492AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4493 SDValue N) const {
4494 SelectionDAG &DAG = DCI.DAG;
4495 SDValue Cond = N.getOperand(i: 0);
4496 SDValue LHS = N.getOperand(i: 1);
4497 SDValue RHS = N.getOperand(i: 2);
4498
4499 EVT VT = N.getValueType();
4500 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4501 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4502 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
4503 return SDValue();
4504
4505 return distributeOpThroughSelect(DCI, Op: LHS.getOpcode(),
4506 SL: SDLoc(N), Cond, N1: LHS, N2: RHS);
4507 }
4508
4509 bool Inv = false;
4510 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4511 std::swap(a&: LHS, b&: RHS);
4512 Inv = true;
4513 }
4514
4515 // TODO: Support vector constants.
4516 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
4517 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4518 !selectSupportsSourceMods(N: N.getNode())) {
4519 SDLoc SL(N);
4520 // If one side is an fneg/fabs and the other is a constant, we can push the
4521 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4522 SDValue NewLHS = LHS.getOperand(i: 0);
4523 SDValue NewRHS = RHS;
4524
4525 // Careful: if the neg can be folded up, don't try to pull it back down.
4526 bool ShouldFoldNeg = true;
4527
4528 if (NewLHS.hasOneUse()) {
4529 unsigned Opc = NewLHS.getOpcode();
4530 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(N: NewLHS.getNode()))
4531 ShouldFoldNeg = false;
4532 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4533 ShouldFoldNeg = false;
4534 }
4535
4536 if (ShouldFoldNeg) {
4537 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4538 return SDValue();
4539
      // We're going to be forced to use a source modifier anyway, so there's
      // no point in pulling the negate out unless we can get a size reduction
      // by negating the constant.
4543 //
4544 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4545 // about cheaper constants.
4546 if (NewLHS.getOpcode() == ISD::FABS &&
4547 getConstantNegateCost(C: CRHS) != NegatibleCost::Cheaper)
4548 return SDValue();
4549
4550 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
4551 return SDValue();
4552
4553 if (LHS.getOpcode() == ISD::FNEG)
4554 NewRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4555
4556 if (Inv)
4557 std::swap(a&: NewLHS, b&: NewRHS);
4558
4559 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT,
4560 N1: Cond, N2: NewLHS, N3: NewRHS);
4561 DCI.AddToWorklist(N: NewSelect.getNode());
4562 return DAG.getNode(Opcode: LHS.getOpcode(), DL: SL, VT, Operand: NewSelect);
4563 }
4564 }
4565
4566 return SDValue();
4567}
4568
4569SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4570 DAGCombinerInfo &DCI) const {
4571 if (SDValue Folded = foldFreeOpFromSelect(DCI, N: SDValue(N, 0)))
4572 return Folded;
4573
4574 SDValue Cond = N->getOperand(Num: 0);
4575 if (Cond.getOpcode() != ISD::SETCC)
4576 return SDValue();
4577
4578 EVT VT = N->getValueType(ResNo: 0);
4579 SDValue LHS = Cond.getOperand(i: 0);
4580 SDValue RHS = Cond.getOperand(i: 1);
4581 SDValue CC = Cond.getOperand(i: 2);
4582
4583 SDValue True = N->getOperand(Num: 1);
4584 SDValue False = N->getOperand(Num: 2);
4585
4586 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4587 SelectionDAG &DAG = DCI.DAG;
4588 if (DAG.isConstantValueOfAnyType(N: True) &&
4589 !DAG.isConstantValueOfAnyType(N: False)) {
4590 // Swap cmp + select pair to move constant to false input.
4591 // This will allow using VOPC cndmasks more often.
4592 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4593
4594 SDLoc SL(N);
4595 ISD::CondCode NewCC =
4596 getSetCCInverse(Operation: cast<CondCodeSDNode>(Val&: CC)->get(), Type: LHS.getValueType());
4597
4598 SDValue NewCond = DAG.getSetCC(DL: SL, VT: Cond.getValueType(), LHS, RHS, Cond: NewCC);
4599 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NewCond, N2: False, N3: True);
4600 }
4601
4602 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4603 SDValue MinMax
4604 = combineFMinMaxLegacy(DL: SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4605 // Revisit this node so we can catch min3/max3/med3 patterns.
4606 //DCI.AddToWorklist(MinMax.getNode());
4607 return MinMax;
4608 }
4609 }
4610
4611 // There's no reason to not do this if the condition has other uses.
4612 return performCtlz_CttzCombine(SL: SDLoc(N), Cond, LHS: True, RHS: False, DCI);
4613}
4614
4615static bool isInv2Pi(const APFloat &APF) {
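  // Bit patterns of 1/(2*pi) in half, single, and double precision.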
4616 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4617 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4618 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4619
4620 return APF.bitwiseIsEqual(RHS: KF16) ||
4621 APF.bitwiseIsEqual(RHS: KF32) ||
4622 APF.bitwiseIsEqual(RHS: KF64);
4623}
4624
// 0 and 1.0 / (2.0 * pi) do not have inline immediates, so there is an
// additional cost to negate them.
4627TargetLowering::NegatibleCost
4628AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4629 if (C->isZero())
4630 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4631
4632 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(APF: C->getValueAPF()))
4633 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4634
4635 return NegatibleCost::Neutral;
4636}
4637
4638bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4639 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4640 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4641 return false;
4642}
4643
4644bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4645 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4646 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4647 return false;
4648}
4649
4650static unsigned inverseMinMax(unsigned Opc) {
4651 switch (Opc) {
4652 case ISD::FMAXNUM:
4653 return ISD::FMINNUM;
4654 case ISD::FMINNUM:
4655 return ISD::FMAXNUM;
4656 case ISD::FMAXNUM_IEEE:
4657 return ISD::FMINNUM_IEEE;
4658 case ISD::FMINNUM_IEEE:
4659 return ISD::FMAXNUM_IEEE;
4660 case ISD::FMAXIMUM:
4661 return ISD::FMINIMUM;
4662 case ISD::FMINIMUM:
4663 return ISD::FMAXIMUM;
4664 case AMDGPUISD::FMAX_LEGACY:
4665 return AMDGPUISD::FMIN_LEGACY;
4666 case AMDGPUISD::FMIN_LEGACY:
4667 return AMDGPUISD::FMAX_LEGACY;
4668 default:
4669 llvm_unreachable("invalid min/max opcode");
4670 }
4671}
4672
4673/// \return true if it's profitable to try to push an fneg into its source
4674/// instruction.
4675bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4676 // If the input has multiple uses and we can either fold the negate down, or
4677 // the other uses cannot, give up. This both prevents unprofitable
4678 // transformations and infinite loops: we won't repeatedly try to fold around
4679 // a negate that has no 'good' form.
4680 if (N0.hasOneUse()) {
4681 // This may be able to fold into the source, but at a code size cost. Don't
4682 // fold if the fold into the user is free.
4683 if (allUsesHaveSourceMods(N, CostThreshold: 0))
4684 return false;
4685 } else {
4686 if (fnegFoldsIntoOp(N: N0.getNode()) &&
4687 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N: N0.getNode())))
4688 return false;
4689 }
4690
4691 return true;
4692}
4693
4694SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4695 DAGCombinerInfo &DCI) const {
4696 SelectionDAG &DAG = DCI.DAG;
4697 SDValue N0 = N->getOperand(Num: 0);
4698 EVT VT = N->getValueType(ResNo: 0);
4699
4700 unsigned Opc = N0.getOpcode();
4701
4702 if (!shouldFoldFNegIntoSrc(N, N0))
4703 return SDValue();
4704
4705 SDLoc SL(N);
4706 switch (Opc) {
4707 case ISD::FADD: {
4708 if (!mayIgnoreSignedZero(Op: N0))
4709 return SDValue();
4710
4711 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4712 SDValue LHS = N0.getOperand(i: 0);
4713 SDValue RHS = N0.getOperand(i: 1);
4714
4715 if (LHS.getOpcode() != ISD::FNEG)
4716 LHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
4717 else
4718 LHS = LHS.getOperand(i: 0);
4719
4720 if (RHS.getOpcode() != ISD::FNEG)
4721 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4722 else
4723 RHS = RHS.getOperand(i: 0);
4724
4725 SDValue Res = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
4726 if (Res.getOpcode() != ISD::FADD)
4727 return SDValue(); // Op got folded away.
4728 if (!N0.hasOneUse())
4729 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
4730 return Res;
4731 }
4732 case ISD::FMUL:
4733 case AMDGPUISD::FMUL_LEGACY: {
4734 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4735 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4736 SDValue LHS = N0.getOperand(i: 0);
4737 SDValue RHS = N0.getOperand(i: 1);
4738
4739 if (LHS.getOpcode() == ISD::FNEG)
4740 LHS = LHS.getOperand(i: 0);
4741 else if (RHS.getOpcode() == ISD::FNEG)
4742 RHS = RHS.getOperand(i: 0);
4743 else
4744 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4745
4746 SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
4747 if (Res.getOpcode() != Opc)
4748 return SDValue(); // Op got folded away.
4749 if (!N0.hasOneUse())
4750 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
4751 return Res;
4752 }
4753 case ISD::FMA:
4754 case ISD::FMAD: {
4755 // TODO: handle llvm.amdgcn.fma.legacy
4756 if (!mayIgnoreSignedZero(Op: N0))
4757 return SDValue();
4758
4759 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4760 SDValue LHS = N0.getOperand(i: 0);
4761 SDValue MHS = N0.getOperand(i: 1);
4762 SDValue RHS = N0.getOperand(i: 2);
4763
4764 if (LHS.getOpcode() == ISD::FNEG)
4765 LHS = LHS.getOperand(i: 0);
4766 else if (MHS.getOpcode() == ISD::FNEG)
4767 MHS = MHS.getOperand(i: 0);
4768 else
4769 MHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: MHS);
4770
4771 if (RHS.getOpcode() != ISD::FNEG)
4772 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4773 else
4774 RHS = RHS.getOperand(i: 0);
4775
4776 SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: MHS, N3: RHS);
4777 if (Res.getOpcode() != Opc)
4778 return SDValue(); // Op got folded away.
4779 if (!N0.hasOneUse())
4780 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
4781 return Res;
4782 }
4783 case ISD::FMAXNUM:
4784 case ISD::FMINNUM:
4785 case ISD::FMAXNUM_IEEE:
4786 case ISD::FMINNUM_IEEE:
4787 case ISD::FMINIMUM:
4788 case ISD::FMAXIMUM:
4789 case AMDGPUISD::FMAX_LEGACY:
4790 case AMDGPUISD::FMIN_LEGACY: {
4791 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4792 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4793 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4794 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4795
4796 SDValue LHS = N0.getOperand(i: 0);
4797 SDValue RHS = N0.getOperand(i: 1);
4798
4799 // 0 doesn't have a negated inline immediate.
4800 // TODO: This constant check should be generalized to other operations.
4801 if (isConstantCostlierToNegate(N: RHS))
4802 return SDValue();
4803
4804 SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
4805 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4806 unsigned Opposite = inverseMinMax(Opc);
4807
4808 SDValue Res = DAG.getNode(Opcode: Opposite, DL: SL, VT, N1: NegLHS, N2: NegRHS, Flags: N0->getFlags());
4809 if (Res.getOpcode() != Opposite)
4810 return SDValue(); // Op got folded away.
4811 if (!N0.hasOneUse())
4812 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
4813 return Res;
4814 }
4815 case AMDGPUISD::FMED3: {
4816 SDValue Ops[3];
4817 for (unsigned I = 0; I < 3; ++I)
4818 Ops[I] = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: N0->getOperand(Num: I), Flags: N0->getFlags());
4819
4820 SDValue Res = DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT, Ops, Flags: N0->getFlags());
4821 if (Res.getOpcode() != AMDGPUISD::FMED3)
4822 return SDValue(); // Op got folded away.
4823
4824 if (!N0.hasOneUse()) {
4825 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res);
4826 DAG.ReplaceAllUsesWith(From: N0, To: Neg);
4827
4828 for (SDNode *U : Neg->uses())
4829 DCI.AddToWorklist(N: U);
4830 }
4831
4832 return Res;
4833 }
4834 case ISD::FP_EXTEND:
4835 case ISD::FTRUNC:
4836 case ISD::FRINT:
4837 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4838 case ISD::FROUNDEVEN:
4839 case ISD::FSIN:
4840 case ISD::FCANONICALIZE:
4841 case AMDGPUISD::RCP:
4842 case AMDGPUISD::RCP_LEGACY:
4843 case AMDGPUISD::RCP_IFLAG:
4844 case AMDGPUISD::SIN_HW: {
4845 SDValue CvtSrc = N0.getOperand(i: 0);
4846 if (CvtSrc.getOpcode() == ISD::FNEG) {
4847 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4848 // (fneg (rcp (fneg x))) -> (rcp x)
4849 return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: CvtSrc.getOperand(i: 0));
4850 }
4851
4852 if (!N0.hasOneUse())
4853 return SDValue();
4854
4855 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4856 // (fneg (rcp x)) -> (rcp (fneg x))
4857 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
4858 return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: Neg, Flags: N0->getFlags());
4859 }
4860 case ISD::FP_ROUND: {
4861 SDValue CvtSrc = N0.getOperand(i: 0);
4862
4863 if (CvtSrc.getOpcode() == ISD::FNEG) {
4864 // (fneg (fp_round (fneg x))) -> (fp_round x)
4865 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT,
4866 N1: CvtSrc.getOperand(i: 0), N2: N0.getOperand(i: 1));
4867 }
4868
4869 if (!N0.hasOneUse())
4870 return SDValue();
4871
4872 // (fneg (fp_round x)) -> (fp_round (fneg x))
4873 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
4874 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Neg, N2: N0.getOperand(i: 1));
4875 }
4876 case ISD::FP16_TO_FP: {
4877 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4878 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4879 // Put the fneg back as a legal source operation that can be matched later.
4880 SDLoc SL(N);
4881
4882 SDValue Src = N0.getOperand(i: 0);
4883 EVT SrcVT = Src.getValueType();
4884
4885 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4886 SDValue IntFNeg = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: SrcVT, N1: Src,
4887 N2: DAG.getConstant(Val: 0x8000, DL: SL, VT: SrcVT));
4888 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFNeg);
4889 }
4890 case ISD::SELECT: {
4891 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4892 // TODO: Invert conditions of foldFreeOpFromSelect
4893 return SDValue();
4894 }
4895 case ISD::BITCAST: {
4896 SDLoc SL(N);
4897 SDValue BCSrc = N0.getOperand(i: 0);
4898 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4899 SDValue HighBits = BCSrc.getOperand(i: BCSrc.getNumOperands() - 1);
4900 if (HighBits.getValueType().getSizeInBits() != 32 ||
4901 !fnegFoldsIntoOp(N: HighBits.getNode()))
4902 return SDValue();
4903
      // f64 fneg only really needs to operate on the high half of the
      // register, so try to force it to an f32 operation to help make use of
      // source modifiers.
      //
4909 // fneg (f64 (bitcast (build_vector x, y))) ->
4910 // f64 (bitcast (build_vector (bitcast i32:x to f32),
4911 // (fneg (bitcast i32:y to f32)))
4912
4913 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4914 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4915 SDValue CastBack =
4916 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HighBits.getValueType(), Operand: NegHi);
4917
4918 SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
4919 Ops.back() = CastBack;
4920 DCI.AddToWorklist(N: NegHi.getNode());
4921 SDValue Build =
4922 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: BCSrc.getValueType(), Ops);
4923 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Build);
4924
4925 if (!N0.hasOneUse())
4926 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Result));
4927 return Result;
4928 }
4929
4930 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4931 BCSrc.hasOneUse()) {
4932 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4933 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
4934
4935 // TODO: Cast back result for multiple uses is beneficial in some cases.
4936
4937 SDValue LHS =
4938 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4939 SDValue RHS =
4940 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4941
4942 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4943 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4944
4945 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4946 NegRHS);
4947 }
4948
4949 return SDValue();
4950 }
4951 default:
4952 return SDValue();
4953 }
4954}
4955
4956SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4957 DAGCombinerInfo &DCI) const {
4958 SelectionDAG &DAG = DCI.DAG;
4959 SDValue N0 = N->getOperand(Num: 0);
4960
4961 if (!N0.hasOneUse())
4962 return SDValue();
4963
4964 switch (N0.getOpcode()) {
4965 case ISD::FP16_TO_FP: {
4966 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
4967 SDLoc SL(N);
4968 SDValue Src = N0.getOperand(i: 0);
4969 EVT SrcVT = Src.getValueType();
4970
4971 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
4972 SDValue IntFAbs = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SrcVT, N1: Src,
4973 N2: DAG.getConstant(Val: 0x7fff, DL: SL, VT: SrcVT));
4974 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFAbs);
4975 }
4976 default:
4977 return SDValue();
4978 }
4979}
4980
4981SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
4982 DAGCombinerInfo &DCI) const {
4983 const auto *CFP = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
4984 if (!CFP)
4985 return SDValue();
4986
4987 // XXX - Should this flush denormals?
4988 const APFloat &Val = CFP->getValueAPF();
4989 APFloat One(Val.getSemantics(), "1.0");
4990 return DCI.DAG.getConstantFP(Val: One / Val, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
4991}
4992
4993SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
4994 DAGCombinerInfo &DCI) const {
4995 SelectionDAG &DAG = DCI.DAG;
4996 SDLoc DL(N);
4997
4998 switch(N->getOpcode()) {
4999 default:
5000 break;
5001 case ISD::BITCAST: {
5002 EVT DestVT = N->getValueType(ResNo: 0);
5003
5004 // Push casts through vector builds. This helps avoid emitting a large
5005 // number of copies when materializing floating point vector constants.
5006 //
5007 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5009 if (DestVT.isVector()) {
5010 SDValue Src = N->getOperand(Num: 0);
5011 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5012 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5013 isOperationLegal(Op: ISD::BUILD_VECTOR, VT: DestVT))) {
5014 EVT SrcVT = Src.getValueType();
5015 unsigned NElts = DestVT.getVectorNumElements();
5016
5017 if (SrcVT.getVectorNumElements() == NElts) {
5018 EVT DestEltVT = DestVT.getVectorElementType();
5019
5020 SmallVector<SDValue, 8> CastedElts;
5021 SDLoc SL(N);
5022 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5023 SDValue Elt = Src.getOperand(i: I);
5024 CastedElts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DestEltVT, Operand: Elt));
5025 }
5026
5027 return DAG.getBuildVector(VT: DestVT, DL: SL, Ops: CastedElts);
5028 }
5029 }
5030 }
5031
5032 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5033 break;
5034
5035 // Fold bitcasts of constants.
5036 //
5037 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5038 // TODO: Generalize and move to DAGCombiner
5039 SDValue Src = N->getOperand(Num: 0);
5040 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Src)) {
5041 SDLoc SL(N);
5042 uint64_t CVal = C->getZExtValue();
5043 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5044 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5045 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5046 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: BV);
5047 }
5048
5049 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
5050 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5051 SDLoc SL(N);
5052 uint64_t CVal = Val.getZExtValue();
5053 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5054 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5055 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5056
5057 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: Vec);
5058 }
5059
5060 break;
5061 }
5062 case ISD::SHL: {
5063 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5064 break;
5065
5066 return performShlCombine(N, DCI);
5067 }
5068 case ISD::SRL: {
5069 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5070 break;
5071
5072 return performSrlCombine(N, DCI);
5073 }
5074 case ISD::SRA: {
5075 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5076 break;
5077
5078 return performSraCombine(N, DCI);
5079 }
5080 case ISD::TRUNCATE:
5081 return performTruncateCombine(N, DCI);
5082 case ISD::MUL:
5083 return performMulCombine(N, DCI);
5084 case AMDGPUISD::MUL_U24:
5085 case AMDGPUISD::MUL_I24: {
5086 if (SDValue Simplified = simplifyMul24(Node24: N, DCI))
5087 return Simplified;
5088 break;
5089 }
5090 case AMDGPUISD::MULHI_I24:
5091 case AMDGPUISD::MULHI_U24:
5092 return simplifyMul24(Node24: N, DCI);
5093 case ISD::SMUL_LOHI:
5094 case ISD::UMUL_LOHI:
5095 return performMulLoHiCombine(N, DCI);
5096 case ISD::MULHS:
5097 return performMulhsCombine(N, DCI);
5098 case ISD::MULHU:
5099 return performMulhuCombine(N, DCI);
5100 case ISD::SELECT:
5101 return performSelectCombine(N, DCI);
5102 case ISD::FNEG:
5103 return performFNegCombine(N, DCI);
5104 case ISD::FABS:
5105 return performFAbsCombine(N, DCI);
5106 case AMDGPUISD::BFE_I32:
5107 case AMDGPUISD::BFE_U32: {
5108 assert(!N->getValueType(0).isVector() &&
5109 "Vector handling of BFE not implemented");
5110 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2));
5111 if (!Width)
5112 break;
5113
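    // The hardware only honors the low 5 bits of the width and offset
    // operands.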
5114 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5115 if (WidthVal == 0)
5116 return DAG.getConstant(0, DL, MVT::i32);
5117
5118 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
5119 if (!Offset)
5120 break;
5121
5122 SDValue BitsFrom = N->getOperand(Num: 0);
5123 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5124
5125 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5126
5127 if (OffsetVal == 0) {
5128 // This is already sign / zero extended, so try to fold away extra BFEs.
5129 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5130
5131 unsigned OpSignBits = DAG.ComputeNumSignBits(Op: BitsFrom);
5132 if (OpSignBits >= SignBits)
5133 return BitsFrom;
5134
5135 EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WidthVal);
5136 if (Signed) {
5137 // This is a sign_extend_inreg. Replace it to take advantage of existing
5138 // DAG Combines. If not eliminated, we will match back to BFE during
5139 // selection.
5140
        // TODO: The sext_inreg of extended types ends up as multiple
        // operations, although we could handle them in a single BFE.
5143 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5144 DAG.getValueType(SmallVT));
5145 }
5146
5147 return DAG.getZeroExtendInReg(Op: BitsFrom, DL, VT: SmallVT);
5148 }
5149
5150 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(Val&: BitsFrom)) {
5151 if (Signed) {
5152 return constantFoldBFE<int32_t>(DAG,
5153 Src0: CVal->getSExtValue(),
5154 Offset: OffsetVal,
5155 Width: WidthVal,
5156 DL);
5157 }
5158
5159 return constantFoldBFE<uint32_t>(DAG,
5160 Src0: CVal->getZExtValue(),
5161 Offset: OffsetVal,
5162 Width: WidthVal,
5163 DL);
5164 }
5165
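    // A field that reaches bit 31 reduces to a single shift; the 16-bit
    // high-half extract is kept as a BFE when SDWA is available.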
5166 if ((OffsetVal + WidthVal) >= 32 &&
5167 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5168 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5169 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5170 BitsFrom, ShiftVal);
5171 }
5172
5173 if (BitsFrom.hasOneUse()) {
5174 APInt Demanded = APInt::getBitsSet(numBits: 32,
5175 loBit: OffsetVal,
5176 hiBit: OffsetVal + WidthVal);
5177
5178 KnownBits Known;
5179 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5180 !DCI.isBeforeLegalizeOps());
5181 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5182 if (TLI.ShrinkDemandedConstant(Op: BitsFrom, DemandedBits: Demanded, TLO) ||
5183 TLI.SimplifyDemandedBits(Op: BitsFrom, DemandedBits: Demanded, Known, TLO)) {
5184 DCI.CommitTargetLoweringOpt(TLO);
5185 }
5186 }
5187
5188 break;
5189 }
5190 case ISD::LOAD:
5191 return performLoadCombine(N, DCI);
5192 case ISD::STORE:
5193 return performStoreCombine(N, DCI);
5194 case AMDGPUISD::RCP:
5195 case AMDGPUISD::RCP_IFLAG:
5196 return performRcpCombine(N, DCI);
5197 case ISD::AssertZext:
5198 case ISD::AssertSext:
5199 return performAssertSZExtCombine(N, DCI);
5200 case ISD::INTRINSIC_WO_CHAIN:
5201 return performIntrinsicWOChainCombine(N, DCI);
5202 case AMDGPUISD::FMAD_FTZ: {
5203 SDValue N0 = N->getOperand(Num: 0);
5204 SDValue N1 = N->getOperand(Num: 1);
5205 SDValue N2 = N->getOperand(Num: 2);
5206 EVT VT = N->getValueType(ResNo: 0);
5207
    // FMAD_FTZ is an FMAD + flush denormals to zero.
5209 // We flush the inputs, the intermediate step, and the output.
5210 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Val&: N0);
5211 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(Val&: N1);
5212 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(Val&: N2);
5213 if (N0CFP && N1CFP && N2CFP) {
5214 const auto FTZ = [](const APFloat &V) {
5215 if (V.isDenormal()) {
5216 APFloat Zero(V.getSemantics(), 0);
5217 return V.isNegative() ? -Zero : Zero;
5218 }
5219 return V;
5220 };
5221
5222 APFloat V0 = FTZ(N0CFP->getValueAPF());
5223 APFloat V1 = FTZ(N1CFP->getValueAPF());
5224 APFloat V2 = FTZ(N2CFP->getValueAPF());
5225 V0.multiply(RHS: V1, RM: APFloat::rmNearestTiesToEven);
5226 V0 = FTZ(V0);
5227 V0.add(RHS: V2, RM: APFloat::rmNearestTiesToEven);
5228 return DAG.getConstantFP(Val: FTZ(V0), DL, VT);
5229 }
5230 break;
5231 }
5232 }
5233 return SDValue();
5234}
5235
5236//===----------------------------------------------------------------------===//
5237// Helper functions
5238//===----------------------------------------------------------------------===//
5239
5240SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5241 const TargetRegisterClass *RC,
5242 Register Reg, EVT VT,
5243 const SDLoc &SL,
5244 bool RawReg) const {
5245 MachineFunction &MF = DAG.getMachineFunction();
5246 MachineRegisterInfo &MRI = MF.getRegInfo();
5247 Register VReg;
5248
5249 if (!MRI.isLiveIn(Reg)) {
5250 VReg = MRI.createVirtualRegister(RegClass: RC);
5251 MRI.addLiveIn(Reg, vreg: VReg);
5252 } else {
5253 VReg = MRI.getLiveInVirtReg(PReg: Reg);
5254 }
5255
5256 if (RawReg)
5257 return DAG.getRegister(Reg: VReg, VT);
5258
5259 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: VReg, VT);
5260}
5261
5262// This may be called multiple times, and nothing prevents creating multiple
5263// objects at the same offset. See if we already defined this object.
5264static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5265 int64_t Offset) {
5266 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5267 if (MFI.getObjectOffset(ObjectIdx: I) == Offset) {
5268 assert(MFI.getObjectSize(I) == Size);
5269 return I;
5270 }
5271 }
5272
5273 return MFI.CreateFixedObject(Size, SPOffset: Offset, IsImmutable: true);
5274}
5275
5276SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5277 EVT VT,
5278 const SDLoc &SL,
5279 int64_t Offset) const {
5280 MachineFunction &MF = DAG.getMachineFunction();
5281 MachineFrameInfo &MFI = MF.getFrameInfo();
5282 int FI = getOrCreateFixedStackObject(MFI, Size: VT.getStoreSize(), Offset);
5283
5284 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5285 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5286
5287 return DAG.getLoad(VT, dl: SL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: SrcPtrInfo, Alignment: Align(4),
5288 MMOFlags: MachineMemOperand::MODereferenceable |
5289 MachineMemOperand::MOInvariant);
5290}
5291
5292SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5293 const SDLoc &SL,
5294 SDValue Chain,
5295 SDValue ArgVal,
5296 int64_t Offset) const {
5297 MachineFunction &MF = DAG.getMachineFunction();
5298 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5299 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5300
5301 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5302 // Stores to the argument stack area are relative to the stack pointer.
5303 SDValue SP =
5304 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5305 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5306 SDValue Store = DAG.getStore(Chain, dl: SL, Val: ArgVal, Ptr, PtrInfo: DstInfo, Alignment: Align(4),
5307 MMOFlags: MachineMemOperand::MODereferenceable);
5308 return Store;
5309}
5310
5311SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5312 const TargetRegisterClass *RC,
5313 EVT VT, const SDLoc &SL,
5314 const ArgDescriptor &Arg) const {
5315 assert(Arg && "Attempting to load missing argument");
5316
5317 SDValue V = Arg.isRegister() ?
5318 CreateLiveInRegister(DAG, RC, Reg: Arg.getRegister(), VT, SL) :
5319 loadStackInputValue(DAG, VT, SL, Offset: Arg.getStackOffset());
5320
5321 if (!Arg.isMasked())
5322 return V;
5323
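  // Extract the bit-field described by the mask: shift it down to bit 0 and
  // clear the bits above it.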
5324 unsigned Mask = Arg.getMask();
5325 unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
5326 V = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: V,
5327 N2: DAG.getShiftAmountConstant(Val: Shift, VT, DL: SL));
5328 return DAG.getNode(Opcode: ISD::AND, DL: SL, VT, N1: V,
5329 N2: DAG.getConstant(Val: Mask >> Shift, DL: SL, VT));
5330}
5331
5332uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5333 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5334 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5335 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5336 uint64_t ArgOffset =
5337 alignTo(Size: ExplicitKernArgSize, A: Alignment) + ExplicitArgOffset;
5338 switch (Param) {
5339 case FIRST_IMPLICIT:
5340 return ArgOffset;
5341 case PRIVATE_BASE:
5342 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5343 case SHARED_BASE:
5344 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5345 case QUEUE_PTR:
5346 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5347 }
5348 llvm_unreachable("unexpected implicit parameter type");
5349}
5350
5351uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5352 const MachineFunction &MF, const ImplicitParameter Param) const {
5353 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5354 return getImplicitParameterOffset(ExplicitKernArgSize: MFI->getExplicitKernArgSize(), Param);
5355}
5356
5357#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5358
5359const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5360 switch ((AMDGPUISD::NodeType)Opcode) {
5361 case AMDGPUISD::FIRST_NUMBER: break;
5362 // AMDIL DAG nodes
5363 NODE_NAME_CASE(UMUL);
5364 NODE_NAME_CASE(BRANCH_COND);
5365
5366 // AMDGPU DAG nodes
5367 NODE_NAME_CASE(IF)
5368 NODE_NAME_CASE(ELSE)
5369 NODE_NAME_CASE(LOOP)
5370 NODE_NAME_CASE(CALL)
5371 NODE_NAME_CASE(TC_RETURN)
5372 NODE_NAME_CASE(TC_RETURN_GFX)
5373 NODE_NAME_CASE(TC_RETURN_CHAIN)
5374 NODE_NAME_CASE(TRAP)
5375 NODE_NAME_CASE(RET_GLUE)
5376 NODE_NAME_CASE(WAVE_ADDRESS)
5377 NODE_NAME_CASE(RETURN_TO_EPILOG)
5378 NODE_NAME_CASE(ENDPGM)
5379 NODE_NAME_CASE(ENDPGM_TRAP)
5380 NODE_NAME_CASE(SIMULATED_TRAP)
5381 NODE_NAME_CASE(DWORDADDR)
5382 NODE_NAME_CASE(FRACT)
5383 NODE_NAME_CASE(SETCC)
5384 NODE_NAME_CASE(SETREG)
5385 NODE_NAME_CASE(DENORM_MODE)
5386 NODE_NAME_CASE(FMA_W_CHAIN)
5387 NODE_NAME_CASE(FMUL_W_CHAIN)
5388 NODE_NAME_CASE(CLAMP)
5389 NODE_NAME_CASE(COS_HW)
5390 NODE_NAME_CASE(SIN_HW)
5391 NODE_NAME_CASE(FMAX_LEGACY)
5392 NODE_NAME_CASE(FMIN_LEGACY)
5393 NODE_NAME_CASE(FMAX3)
5394 NODE_NAME_CASE(SMAX3)
5395 NODE_NAME_CASE(UMAX3)
5396 NODE_NAME_CASE(FMIN3)
5397 NODE_NAME_CASE(SMIN3)
5398 NODE_NAME_CASE(UMIN3)
5399 NODE_NAME_CASE(FMED3)
5400 NODE_NAME_CASE(SMED3)
5401 NODE_NAME_CASE(UMED3)
5402 NODE_NAME_CASE(FMAXIMUM3)
5403 NODE_NAME_CASE(FMINIMUM3)
5404 NODE_NAME_CASE(FDOT2)
5405 NODE_NAME_CASE(URECIP)
5406 NODE_NAME_CASE(DIV_SCALE)
5407 NODE_NAME_CASE(DIV_FMAS)
5408 NODE_NAME_CASE(DIV_FIXUP)
5409 NODE_NAME_CASE(FMAD_FTZ)
5410 NODE_NAME_CASE(RCP)
5411 NODE_NAME_CASE(RSQ)
5412 NODE_NAME_CASE(RCP_LEGACY)
5413 NODE_NAME_CASE(RCP_IFLAG)
5414 NODE_NAME_CASE(LOG)
5415 NODE_NAME_CASE(EXP)
5416 NODE_NAME_CASE(FMUL_LEGACY)
5417 NODE_NAME_CASE(RSQ_CLAMP)
5418 NODE_NAME_CASE(FP_CLASS)
5419 NODE_NAME_CASE(DOT4)
5420 NODE_NAME_CASE(CARRY)
5421 NODE_NAME_CASE(BORROW)
5422 NODE_NAME_CASE(BFE_U32)
5423 NODE_NAME_CASE(BFE_I32)
5424 NODE_NAME_CASE(BFI)
5425 NODE_NAME_CASE(BFM)
5426 NODE_NAME_CASE(FFBH_U32)
5427 NODE_NAME_CASE(FFBH_I32)
5428 NODE_NAME_CASE(FFBL_B32)
5429 NODE_NAME_CASE(MUL_U24)
5430 NODE_NAME_CASE(MUL_I24)
5431 NODE_NAME_CASE(MULHI_U24)
5432 NODE_NAME_CASE(MULHI_I24)
5433 NODE_NAME_CASE(MAD_U24)
5434 NODE_NAME_CASE(MAD_I24)
5435 NODE_NAME_CASE(MAD_I64_I32)
5436 NODE_NAME_CASE(MAD_U64_U32)
5437 NODE_NAME_CASE(PERM)
5438 NODE_NAME_CASE(TEXTURE_FETCH)
5439 NODE_NAME_CASE(R600_EXPORT)
5440 NODE_NAME_CASE(CONST_ADDRESS)
5441 NODE_NAME_CASE(REGISTER_LOAD)
5442 NODE_NAME_CASE(REGISTER_STORE)
5443 NODE_NAME_CASE(SAMPLE)
5444 NODE_NAME_CASE(SAMPLEB)
5445 NODE_NAME_CASE(SAMPLED)
5446 NODE_NAME_CASE(SAMPLEL)
5447 NODE_NAME_CASE(CVT_F32_UBYTE0)
5448 NODE_NAME_CASE(CVT_F32_UBYTE1)
5449 NODE_NAME_CASE(CVT_F32_UBYTE2)
5450 NODE_NAME_CASE(CVT_F32_UBYTE3)
5451 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5452 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5453 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5454 NODE_NAME_CASE(CVT_PK_I16_I32)
5455 NODE_NAME_CASE(CVT_PK_U16_U32)
5456 NODE_NAME_CASE(FP_TO_FP16)
5457 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5458 NODE_NAME_CASE(CONST_DATA_PTR)
5459 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5460 NODE_NAME_CASE(LDS)
5461 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
5462 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
5463 NODE_NAME_CASE(DUMMY_CHAIN)
5464 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
5465 NODE_NAME_CASE(LOAD_D16_HI)
5466 NODE_NAME_CASE(LOAD_D16_LO)
5467 NODE_NAME_CASE(LOAD_D16_HI_I8)
5468 NODE_NAME_CASE(LOAD_D16_HI_U8)
5469 NODE_NAME_CASE(LOAD_D16_LO_I8)
5470 NODE_NAME_CASE(LOAD_D16_LO_U8)
5471 NODE_NAME_CASE(STORE_MSKOR)
5472 NODE_NAME_CASE(LOAD_CONSTANT)
5473 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5474 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5475 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5476 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5477 NODE_NAME_CASE(DS_ORDERED_COUNT)
5478 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5479 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
5480 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
5481 NODE_NAME_CASE(BUFFER_LOAD)
5482 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5483 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5484 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5485 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5486 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5487 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5488 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5489 NODE_NAME_CASE(SBUFFER_LOAD)
5490 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5491 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5492 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5493 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5494 NODE_NAME_CASE(BUFFER_STORE)
5495 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5496 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5497 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5498 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5499 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5500 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5501 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5502 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5503 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5504 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5505 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5506 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5507 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5508 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5509 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5510 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5511 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5512 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5513 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5514 NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
5515 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5516 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5517 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5518
5519 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
5520 }
5521 return nullptr;
5522}
5523
5524SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5525 SelectionDAG &DAG, int Enabled,
5526 int &RefinementSteps,
5527 bool &UseOneConstNR,
5528 bool Reciprocal) const {
5529 EVT VT = Operand.getValueType();
5530
5531 if (VT == MVT::f32) {
5532 RefinementSteps = 0;
5533 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(Operand), VT, Operand);
5534 }
5535
  // TODO: There is also an f64 rsq instruction, but the documentation is less
5537 // clear on its precision.
5538
5539 return SDValue();
5540}
5541
5542SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5543 SelectionDAG &DAG, int Enabled,
5544 int &RefinementSteps) const {
5545 EVT VT = Operand.getValueType();
5546
5547 if (VT == MVT::f32) {
5548 // Reciprocal, < 1 ulp error.
5549 //
    // This reciprocal approximation converges to < 0.5 ulp error with one
    // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5552
5553 RefinementSteps = 0;
5554 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SDLoc(Operand), VT, Operand);
5555 }
5556
  // TODO: There is also an f64 rcp instruction, but the documentation is less
5558 // clear on its precision.
5559
5560 return SDValue();
5561}
5562
5563static unsigned workitemIntrinsicDim(unsigned ID) {
5564 switch (ID) {
5565 case Intrinsic::amdgcn_workitem_id_x:
5566 return 0;
5567 case Intrinsic::amdgcn_workitem_id_y:
5568 return 1;
5569 case Intrinsic::amdgcn_workitem_id_z:
5570 return 2;
5571 default:
5572 llvm_unreachable("not a workitem intrinsic");
5573 }
5574}
5575
5576void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5577 const SDValue Op, KnownBits &Known,
5578 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5579
5580 Known.resetAll(); // Don't know anything.
5581
5582 unsigned Opc = Op.getOpcode();
5583
5584 switch (Opc) {
5585 default:
5586 break;
5587 case AMDGPUISD::CARRY:
5588 case AMDGPUISD::BORROW: {
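    // The carry/borrow result is 0 or 1, so all bits above the low bit are
    // known zero.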
5589 Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 31);
5590 break;
5591 }
5592
5593 case AMDGPUISD::BFE_I32:
5594 case AMDGPUISD::BFE_U32: {
5595 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5596 if (!CWidth)
5597 return;
5598
5599 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5600
5601 if (Opc == AMDGPUISD::BFE_U32)
5602 Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 32 - Width);
5603
5604 break;
5605 }
5606 case AMDGPUISD::FP_TO_FP16: {
5607 unsigned BitWidth = Known.getBitWidth();
5608
5609 // High bits are zero.
5610 Known.Zero = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 16);
5611 break;
5612 }
5613 case AMDGPUISD::MUL_U24:
5614 case AMDGPUISD::MUL_I24: {
5615 KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5616 KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
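    // The product has at least as many trailing zeros as both factors
    // combined.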
5617 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5618 RHSKnown.countMinTrailingZeros();
5619 Known.Zero.setLowBits(std::min(a: TrailZ, b: 32u));
5620 // Skip extra check if all bits are known zeros.
5621 if (TrailZ >= 32)
5622 break;
5623
5624 // Truncate to 24 bits.
5625 LHSKnown = LHSKnown.trunc(BitWidth: 24);
5626 RHSKnown = RHSKnown.trunc(BitWidth: 24);
5627
5628 if (Opc == AMDGPUISD::MUL_I24) {
5629 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5630 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5631 unsigned MaxValBits = LHSValBits + RHSValBits;
5632 if (MaxValBits > 32)
5633 break;
5634 unsigned SignBits = 32 - MaxValBits + 1;
5635 bool LHSNegative = LHSKnown.isNegative();
5636 bool LHSNonNegative = LHSKnown.isNonNegative();
5637 bool LHSPositive = LHSKnown.isStrictlyPositive();
5638 bool RHSNegative = RHSKnown.isNegative();
5639 bool RHSNonNegative = RHSKnown.isNonNegative();
5640 bool RHSPositive = RHSKnown.isStrictlyPositive();
5641
5642 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5643 Known.Zero.setHighBits(SignBits);
5644 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5645 Known.One.setHighBits(SignBits);
5646 } else {
5647 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5648 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5649 unsigned MaxValBits = LHSValBits + RHSValBits;
5650 if (MaxValBits >= 32)
5651 break;
5652 Known.Zero.setBitsFrom(MaxValBits);
5653 }
5654 break;
5655 }
5656 case AMDGPUISD::PERM: {
5657 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5658 if (!CMask)
5659 return;
5660
5661 KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5662 KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5663 unsigned Sel = CMask->getZExtValue();
5664
5665 for (unsigned I = 0; I < 32; I += 8) {
5666 unsigned SelBits = Sel & 0xff;
5667 if (SelBits < 4) {
5668 SelBits *= 8;
5669 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5670 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5671 } else if (SelBits < 7) {
5672 SelBits = (SelBits & 3) * 8;
5673 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5674 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5675 } else if (SelBits == 0x0c) {
5676 Known.Zero |= 0xFFull << I;
5677 } else if (SelBits > 0x0c) {
5678 Known.One |= 0xFFull << I;
5679 }
5680 Sel >>= 8;
5681 }
5682 break;
5683 }
5684 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5685 Known.Zero.setHighBits(24);
5686 break;
5687 }
5688 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5689 Known.Zero.setHighBits(16);
5690 break;
5691 }
5692 case AMDGPUISD::LDS: {
5693 auto GA = cast<GlobalAddressSDNode>(Val: Op.getOperand(i: 0).getNode());
5694 Align Alignment = GA->getGlobal()->getPointerAlignment(DL: DAG.getDataLayout());
5695
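    // The LDS address fits in the low 16 bits, and its low bits follow from
    // the global's alignment.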
5696 Known.Zero.setHighBits(16);
5697 Known.Zero.setLowBits(Log2(A: Alignment));
5698 break;
5699 }
5700 case AMDGPUISD::SMIN3:
5701 case AMDGPUISD::SMAX3:
5702 case AMDGPUISD::SMED3:
5703 case AMDGPUISD::UMIN3:
5704 case AMDGPUISD::UMAX3:
5705 case AMDGPUISD::UMED3: {
5706 KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
5707 if (Known2.isUnknown())
5708 break;
5709
5710 KnownBits Known1 = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5711 if (Known1.isUnknown())
5712 break;
5713
5714 KnownBits Known0 = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5715 if (Known0.isUnknown())
5716 break;
5717
5718 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5719 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5720 Known.One = Known0.One & Known1.One & Known2.One;
5721 break;
5722 }
5723 case ISD::INTRINSIC_WO_CHAIN: {
5724 unsigned IID = Op.getConstantOperandVal(i: 0);
5725 switch (IID) {
5726 case Intrinsic::amdgcn_workitem_id_x:
5727 case Intrinsic::amdgcn_workitem_id_y:
5728 case Intrinsic::amdgcn_workitem_id_z: {
5729 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5730 Kernel: DAG.getMachineFunction().getFunction(), Dimension: workitemIntrinsicDim(ID: IID));
5731 Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
5732 break;
5733 }
5734 default:
5735 break;
5736 }
5737 }
5738 }
5739}
5740
5741unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5742 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5743 unsigned Depth) const {
5744 switch (Op.getOpcode()) {
5745 case AMDGPUISD::BFE_I32: {
5746 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5747 if (!Width)
5748 return 1;
5749
5750 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5751 if (!isNullConstant(V: Op.getOperand(i: 1)))
5752 return SignBits;
5753
5754 // TODO: Could probably figure something out with non-0 offsets.
5755 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5756 return std::max(a: SignBits, b: Op0SignBits);
5757 }
5758
5759 case AMDGPUISD::BFE_U32: {
5760 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5761 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5762 }
5763
5764 case AMDGPUISD::CARRY:
5765 case AMDGPUISD::BORROW:
5766 return 31;
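  // Extending buffer loads: an i8 sign-extended to i32 has 32 - 8 + 1 = 25
  // sign bits and an i16 has 17; zero-extended loads have 24 and 16
  // known-zero high bits respectively.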
5767 case AMDGPUISD::BUFFER_LOAD_BYTE:
5768 return 25;
5769 case AMDGPUISD::BUFFER_LOAD_SHORT:
5770 return 17;
5771 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5772 return 24;
5773 case AMDGPUISD::BUFFER_LOAD_USHORT:
5774 return 16;
5775 case AMDGPUISD::FP_TO_FP16:
5776 return 16;
5777 case AMDGPUISD::SMIN3:
5778 case AMDGPUISD::SMAX3:
5779 case AMDGPUISD::SMED3:
5780 case AMDGPUISD::UMIN3:
5781 case AMDGPUISD::UMAX3:
5782 case AMDGPUISD::UMED3: {
5783 unsigned Tmp2 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
5784 if (Tmp2 == 1)
5785 return 1; // Early out.
5786
5787 unsigned Tmp1 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5788 if (Tmp1 == 1)
5789 return 1; // Early out.
5790
5791 unsigned Tmp0 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5792 if (Tmp0 == 1)
5793 return 1; // Early out.
5794
5795 return std::min(a: Tmp0, b: std::min(a: Tmp1, b: Tmp2));
5796 }
5797 default:
5798 return 1;
5799 }
5800}
5801
unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
    GISelKnownBits &Analysis, Register R,
    const APInt &DemandedElts, const MachineRegisterInfo &MRI,
    unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  if (!MI)
    return 1;

  // TODO: Check range metadata on MMO.
  switch (MI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
    return 25;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
    return 17;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
    unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
    if (Tmp2 == 1)
      return 1;
    unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
    if (Tmp1 == 1)
      return 1;
    unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
    if (Tmp0 == 1)
      return 1;
    return std::min(Tmp0, std::min(Tmp1, Tmp2));
  }
  default:
    return 1;
  }
}

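// Conservatively decide whether an AMDGPU-specific node can produce a NaN.
// When the query is only about signaling NaNs (SNaN), most of these
// operations qualify since they are not expected to produce an sNaN;
// otherwise the result is known NaN-free only when the relevant operands are.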
bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                        const SelectionDAG &DAG,
                                                        bool SNaN,
                                                        unsigned Depth) const {
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    if (SNaN)
      return true;

    // TODO: Can check no nans on one of the operands for each one, but which
    // one?
    return false;
  }
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::CVT_PKRTZ_F16_F32: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
  }
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMINIMUM3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMAD_FTZ: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
  }
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return true;

  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_CLAMP: {
    if (SNaN)
      return true;

    // TODO: Need is known positive check.
    return false;
  }
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
    // TODO: Refine on operands.
    return SNaN;
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW: {
    // TODO: Need check for infinity
    return SNaN;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cubeid:
      return true;

    case Intrinsic::amdgcn_frexp_mant: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_cvt_pkrtz: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp: {
      if (SNaN)
        return true;

      // TODO: Need is known positive check.
      return false;
    }
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_fdot2:
      // TODO: Refine on operand
      return SNaN;
    case Intrinsic::amdgcn_fma_legacy:
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

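// GlobalISel heuristic: only treat reassociation as profitable when the
// intermediate value N0 has a single non-debug use (see the FIXME below about
// register banks).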
bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                               Register N0,
                                               Register N1) const {
  return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}

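// Decide how an atomicrmw is legalized: nand and the floating-point
// read-modify-write operations listed here, as well as integer widths other
// than 32 and 64 bits, are expanded to a compare-exchange loop; 32- and
// 64-bit integer operations are left for instruction selection.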
TargetLowering::AtomicExpansionKind
AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    return AtomicExpansionKind::CmpXChg;
  default: {
    if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
      unsigned Size = IntTy->getBitWidth();
      if (Size == 32 || Size == 64)
        return AtomicExpansionKind::None;
    }

    return AtomicExpansionKind::CmpXChg;
  }
  }
}

/// Whether it is profitable to sink the operands of an
/// Instruction I to the basic block of I.
/// This helps using several modifiers (like abs and neg) more often.
bool AMDGPUTargetLowering::shouldSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace PatternMatch;

  for (auto &Op : I->operands()) {
    // Ensure we are not already sinking this operand.
    if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
      continue;

    if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
      Ops.push_back(&Op);
  }

  return !Ops.empty();
}