//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

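// Calling convention handling generated by TableGen.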
#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

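// Returns the maximum number of bits that might be needed to represent Op as
// an unsigned value, based on known-bits analysis.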
unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return DAG.ComputeMaxSignificantBits(Op);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i128, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }

  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v3f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v5f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::STORE, MVT::v6f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v7f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v9f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::STORE, MVT::v10f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::STORE, MVT::v11f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::STORE, MVT::v12f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v3i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v3f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v4i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v4f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v8i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v8f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v16i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::v16f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i128, Promote);
  AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);

  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);

  // For R600, this is totally unsupported, just custom lower to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
                     MVT::f32, Legal);

  setOperationAction(ISD::FLOG2, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);

  setOperationAction(
      {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
      Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  if (Subtarget->has16BitInsts())
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
  else {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
  }

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
                     Custom);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(
      ISD::IS_FPCLASS,
      {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
       MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
       MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
      Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // FIXME: Why is v8f16/v8bf16 missing?
  setOperationAction(
      ISD::EXTRACT_SUBVECTOR,
      {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
       MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
       MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
       MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
       MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
       MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
       MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
       MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
       MVT::v32i16, MVT::v32f16, MVT::v32bf16},
      Custom);

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand.
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // The GPU does not have a divrem instruction for signed or unsigned
    // operands.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // The GPU does not have [SU]MUL_LOHI as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE.
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);

  setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
  setOperationAction(
      {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
      MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  setOperationAction(
      {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
      MVT::i64, Custom);

  for (auto VT : {MVT::i8, MVT::i16})
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
                        ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
                        ISD::MULHS, ISD::OR, ISD::SHL,
                        ISD::SRA, ISD::SRL, ISD::ROTL,
                        ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
                        ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
                        ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
                        ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
                        ISD::XOR, ISD::BSWAP, ISD::CTPOP,
                        ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
                        ISD::SETCC},
                       VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(
        {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
         ISD::FADD, ISD::FCEIL, ISD::FCOS,
         ISD::FDIV, ISD::FEXP2, ISD::FEXP,
         ISD::FEXP10, ISD::FLOG2, ISD::FREM,
         ISD::FLOG, ISD::FLOG10, ISD::FPOW,
         ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
         ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
         ISD::FSQRT, ISD::FSIN, ISD::FSUB,
         ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
         ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
         ISD::FCANONICALIZE, ISD::FROUNDEVEN},
        VT, Expand);
  }

  // This causes an unrolled select operation to be used rather than expansion
  // with bit operations. This is in general better, but the alternative using
  // BFI instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);

  // Disable most libcalls.
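  // The exceptions are the atomic libcalls (RTLIB::ATOMIC_LOAD through
  // RTLIB::ATOMIC_FETCH_NAND_16), whose names are left intact below.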
  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
    if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
      setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
  }

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For now,
  // we don't have a way of knowing during instruction selection if a condition
  // will be uniform and we always use vector compares. Assume we are using
  // vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
                       ISD::SRA, ISD::SRL,
                       ISD::TRUNCATE, ISD::MUL,
                       ISD::SMUL_LOHI, ISD::UMUL_LOHI,
                       ISD::MULHU, ISD::MULHS,
                       ISD::SELECT, ISD::SELECT_CC,
                       ISD::STORE, ISD::FADD,
                       ISD::FSUB, ISD::FNEG,
                       ISD::FABS, ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});

  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
  setMaxLargeFPConvertBitWidthSupported(64);
}

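// Signed zeros may be ignored if either the whole module is compiled with
// no-signed-zeros FP math or the individual node carries the nsz flag.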
bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}

static bool fnegFoldsIntoOp(const SDNode *N) {
  unsigned Opc = N->getOpcode();
  if (Opc == ISD::BITCAST) {
    // TODO: Is there a benefit to checking the conditions performFNegCombine
    // does? We don't for the other cases.
    SDValue BCSrc = N->getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      return BCSrc.getNumOperands() == 2 &&
             BCSrc.getOperand(1).getValueSizeInBits() == 32;
    }

    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
  }

  return fnegFoldsIntoOpcode(Opc);
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the result type of an ISD::SELECT.
LLVM_READONLY
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, and each of them requires a VOP3 encoding, there will be a
  // code size increase. Try to avoid increasing code size unless we know it
  // will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit the number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32 bits.
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(0);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating an fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth + 1);
    if (NegSrc)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && VT == MVT::f16);
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit ones, which is
  // always good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode* N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");
  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
      (N->use_begin()->getOpcode() == ISD::SRA ||
       N->use_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments().

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getParent()->getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting over
    // to get accurate in-memory offsets. The "PartOffset" is completely useless
    // to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different element size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one-element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        assert(MemVT.getVectorNumElements() == 3 ||
               MemVT.getVectorNumElements() == 5 ||
               (MemVT.getVectorNumElements() >= 9 &&
                MemVT.getVectorNumElements() <= 12));
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

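// Build a token that combines the incoming chain with any loads from the
// clobbered fixed stack object, so stores into that slot stay ordered after
// those loads.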
SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument load that overlaps the
  // clobbered frame index.
  for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FLOG:
  case ISD::FLOG10:
    if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP2:
    if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP:
  case ISD::FEXP10:
    if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
      Results.push_back(Lowered);
    return;
  default:
    return;
  }
}
1420
1421SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1422 SDValue Op,
1423 SelectionDAG &DAG) const {
1424
1425 const DataLayout &DL = DAG.getDataLayout();
1426 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Val&: Op);
1427 const GlobalValue *GV = G->getGlobal();
1428
1429 if (!MFI->isModuleEntryFunction()) {
1430 if (std::optional<uint32_t> Address =
1431 AMDGPUMachineFunction::getLDSAbsoluteAddress(GV: *GV)) {
1432 return DAG.getConstant(Val: *Address, DL: SDLoc(Op), VT: Op.getValueType());
1433 }
1434 }
1435
1436 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1437 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1438 if (!MFI->isModuleEntryFunction() &&
1439 !GV->getName().equals(RHS: "llvm.amdgcn.module.lds")) {
1440 SDLoc DL(Op);
1441 const Function &Fn = DAG.getMachineFunction().getFunction();
1442 DiagnosticInfoUnsupported BadLDSDecl(
1443 Fn, "local memory global used by non-kernel function",
1444 DL.getDebugLoc(), DS_Warning);
1445 DAG.getContext()->diagnose(DI: BadLDSDecl);
1446
1447 // We currently don't have a way to correctly allocate LDS objects that
1448 // aren't directly associated with a kernel. We do force inlining of
1449 // functions that use local objects. However, if these dead functions are
1450 // not eliminated, we don't want a compile time error. Just emit a warning
1451 // and a trap, since there should be no callable path here.
1452 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1453 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1454 Trap, DAG.getRoot());
1455 DAG.setRoot(OutputChain);
1456 return DAG.getUNDEF(VT: Op.getValueType());
1457 }
1458
1459 // XXX: What does the value of G->getOffset() mean?
1460 assert(G->getOffset() == 0 &&
1461 "Do not know what to do with a non-zero offset");
1462
1463 // TODO: We could emit code to handle the initialization somewhere.
1464 // We ignore the initializer for now and legalize it to allow selection.
1465 // The initializer will be diagnosed during assembly emission anyway.
1466 unsigned Offset = MFI->allocateLDSGlobal(DL, GV: *cast<GlobalVariable>(Val: GV));
1467 return DAG.getConstant(Val: Offset, DL: SDLoc(Op), VT: Op.getValueType());
1468 }
1469 return SDValue();
1470}
1471
1472SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1473 SelectionDAG &DAG) const {
1474 SmallVector<SDValue, 8> Args;
1475 SDLoc SL(Op);
1476
1477 EVT VT = Op.getValueType();
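// For sub-32-bit element types, build the result out of 32-bit pieces: each
// operand is bitcast to i32 (or a vector of i32), the pieces are collected
// into an i32 build_vector, and the result is bitcast back to VT. For
// example, concatenating two v2i16 operands becomes a v2i32 build_vector of
// two i32 bitcasts, bitcast to v4i16.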
1478 if (VT.getVectorElementType().getSizeInBits() < 32) {
1479 unsigned OpBitSize = Op.getOperand(i: 0).getValueType().getSizeInBits();
1480 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1481 unsigned NewNumElt = OpBitSize / 32;
1482 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1483 : EVT::getVectorVT(*DAG.getContext(),
1484 MVT::i32, NewNumElt);
1485 for (const SDUse &U : Op->ops()) {
1486 SDValue In = U.get();
1487 SDValue NewIn = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewEltVT, Operand: In);
1488 if (NewNumElt > 1)
1489 DAG.ExtractVectorElements(Op: NewIn, Args);
1490 else
1491 Args.push_back(Elt: NewIn);
1492 }
1493
1494 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1495 NewNumElt * Op.getNumOperands());
1496 SDValue BV = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1497 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: BV);
1498 }
1499 }
1500
1501 for (const SDUse &U : Op->ops())
1502 DAG.ExtractVectorElements(Op: U.get(), Args);
1503
1504 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1505}
1506
1507SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1508 SelectionDAG &DAG) const {
1509 SDLoc SL(Op);
1510 SmallVector<SDValue, 8> Args;
1511 unsigned Start = Op.getConstantOperandVal(i: 1);
1512 EVT VT = Op.getValueType();
1513 EVT SrcVT = Op.getOperand(i: 0).getValueType();
1514
1515 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1516 unsigned NumElt = VT.getVectorNumElements();
1517 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1518 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1519
1520 // Extract 32-bit registers at a time.
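// For example, extracting <4 x i16> at index 4 from <8 x i16> becomes
// extracting <2 x i32> at index 2 from <4 x i32>, followed by a bitcast.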
1521 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1522 EVT NewVT = NumElt == 2
1523 ? MVT::i32
1524 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1525 SDValue Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewSrcVT, Operand: Op.getOperand(i: 0));
1526
1527 DAG.ExtractVectorElements(Op: Tmp, Args, Start: Start / 2, Count: NumElt / 2);
1528 if (NumElt == 2)
1529 Tmp = Args[0];
1530 else
1531 Tmp = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1532
1533 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Tmp);
1534 }
1535
1536 DAG.ExtractVectorElements(Op: Op.getOperand(i: 0), Args, Start,
1537 Count: VT.getVectorNumElements());
1538
1539 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1540}
1541
1542// TODO: Handle fabs too
1543static SDValue peekFNeg(SDValue Val) {
1544 if (Val.getOpcode() == ISD::FNEG)
1545 return Val.getOperand(i: 0);
1546
1547 return Val;
1548}
1549
1550static SDValue peekFPSignOps(SDValue Val) {
1551 if (Val.getOpcode() == ISD::FNEG)
1552 Val = Val.getOperand(i: 0);
1553 if (Val.getOpcode() == ISD::FABS)
1554 Val = Val.getOperand(i: 0);
1555 if (Val.getOpcode() == ISD::FCOPYSIGN)
1556 Val = Val.getOperand(i: 0);
1557 return Val;
1558}
1559
1560SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1561 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1562 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1563 SelectionDAG &DAG = DCI.DAG;
1564 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val&: CC)->get();
1565 switch (CCOpcode) {
1566 case ISD::SETOEQ:
1567 case ISD::SETONE:
1568 case ISD::SETUNE:
1569 case ISD::SETNE:
1570 case ISD::SETUEQ:
1571 case ISD::SETEQ:
1572 case ISD::SETFALSE:
1573 case ISD::SETFALSE2:
1574 case ISD::SETTRUE:
1575 case ISD::SETTRUE2:
1576 case ISD::SETUO:
1577 case ISD::SETO:
1578 break;
1579 case ISD::SETULE:
1580 case ISD::SETULT: {
1581 if (LHS == True)
1582 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
1583 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
1584 }
1585 case ISD::SETOLE:
1586 case ISD::SETOLT:
1587 case ISD::SETLE:
1588 case ISD::SETLT: {
1589 // Ordered. Assume ordered for undefined.
1590
1591 // Only do this after legalization to avoid interfering with other combines
1592 // which might occur.
1593 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1594 !DCI.isCalledByLegalizer())
1595 return SDValue();
1596
1597 // We need to permute the operands to get the correct NaN behavior. When the
1598 // compare fails (as it does for a NaN input), the second operand is the one
1599 // selected, so order the operands to match the compare type the hardware uses.
1600 if (LHS == True)
1601 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
1602 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
1603 }
1604 case ISD::SETUGE:
1605 case ISD::SETUGT: {
1606 if (LHS == True)
1607 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
1608 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
1609 }
1610 case ISD::SETGT:
1611 case ISD::SETGE:
1612 case ISD::SETOGE:
1613 case ISD::SETOGT: {
1614 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1615 !DCI.isCalledByLegalizer())
1616 return SDValue();
1617
1618 if (LHS == True)
1619 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
1620 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
1621 }
1622 case ISD::SETCC_INVALID:
1623 llvm_unreachable("Invalid setcc condcode!");
1624 }
1625 return SDValue();
1626}
1627
1628/// Generate Min/Max node
1629SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1630 SDValue LHS, SDValue RHS,
1631 SDValue True, SDValue False,
1632 SDValue CC,
1633 DAGCombinerInfo &DCI) const {
1634 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1635 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1636
1637 SelectionDAG &DAG = DCI.DAG;
1638
1639 // If we can't directly match this, try to see if we can fold an fneg to
1640 // match.
1641
1642 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
1643 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(Val&: False);
1644 SDValue NegTrue = peekFNeg(Val: True);
1645
1646 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1647 // fmin/fmax.
1648 //
1649 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1650 // -> fneg (fmin_legacy lhs, K)
1651 //
1652 // TODO: Use getNegatedExpression
1653 if (LHS == NegTrue && CFalse && CRHS) {
1654 APFloat NegRHS = neg(X: CRHS->getValueAPF());
1655 if (NegRHS == CFalse->getValueAPF()) {
1656 SDValue Combined =
1657 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True: NegTrue, False, CC, DCI);
1658 if (Combined)
1659 return DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: Combined);
1660 return SDValue();
1661 }
1662 }
1663
1664 return SDValue();
1665}
1666
1667std::pair<SDValue, SDValue>
1668AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1669 SDLoc SL(Op);
1670
1671 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1672
1673 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1674 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1675
1676 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1677 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1678
1679 return std::pair(Lo, Hi);
1680}
1681
1682SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1683 SDLoc SL(Op);
1684
1685 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1686 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1687 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1688}
1689
1690SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1691 SDLoc SL(Op);
1692
1693 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1694 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1695 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1696}
1697
1698// Split a vector type into two parts. The first part is a vector whose element
1699// count is a power of two. The second part is whatever is left over, and is a
1700// scalar if it would otherwise be a 1-vector.
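// For example, <7 x i32> splits into (<4 x i32>, <3 x i32>) and <3 x i32>
// splits into (<2 x i32>, i32).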
1701std::pair<EVT, EVT>
1702AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1703 EVT LoVT, HiVT;
1704 EVT EltVT = VT.getVectorElementType();
1705 unsigned NumElts = VT.getVectorNumElements();
1706 unsigned LoNumElts = PowerOf2Ceil(A: (NumElts + 1) / 2);
1707 LoVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: LoNumElts);
1708 HiVT = NumElts - LoNumElts == 1
1709 ? EltVT
1710 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumElts - LoNumElts);
1711 return std::pair(LoVT, HiVT);
1712}
1713
1714// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1715// scalar.
1716std::pair<SDValue, SDValue>
1717AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1718 const EVT &LoVT, const EVT &HiVT,
1719 SelectionDAG &DAG) const {
1720 assert(LoVT.getVectorNumElements() +
1721 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1722 N.getValueType().getVectorNumElements() &&
1723 "More vector elements requested than available!");
1724 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: LoVT, N1: N,
1725 N2: DAG.getVectorIdxConstant(Val: 0, DL));
1726 SDValue Hi = DAG.getNode(
1727 Opcode: HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1728 VT: HiVT, N1: N, N2: DAG.getVectorIdxConstant(Val: LoVT.getVectorNumElements(), DL));
1729 return std::pair(Lo, Hi);
1730}
1731
1732SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1733 SelectionDAG &DAG) const {
1734 LoadSDNode *Load = cast<LoadSDNode>(Val: Op);
1735 EVT VT = Op.getValueType();
1736 SDLoc SL(Op);
1737
1738
1739 // If this is a 2 element vector, we really want to scalarize and not create
1740 // weird 1 element vectors.
1741 if (VT.getVectorNumElements() == 2) {
1742 SDValue Ops[2];
1743 std::tie(args&: Ops[0], args&: Ops[1]) = scalarizeVectorLoad(LD: Load, DAG);
1744 return DAG.getMergeValues(Ops, dl: SL);
1745 }
1746
1747 SDValue BasePtr = Load->getBasePtr();
1748 EVT MemVT = Load->getMemoryVT();
1749
1750 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1751
1752 EVT LoVT, HiVT;
1753 EVT LoMemVT, HiMemVT;
1754 SDValue Lo, Hi;
1755
1756 std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
1757 std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
1758 std::tie(args&: Lo, args&: Hi) = splitVector(N: Op, DL: SL, LoVT, HiVT, DAG);
1759
1760 unsigned Size = LoMemVT.getStoreSize();
1761 Align BaseAlign = Load->getAlign();
1762 Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);
1763
1764 SDValue LoLoad = DAG.getExtLoad(ExtType: Load->getExtensionType(), dl: SL, VT: LoVT,
1765 Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue, MemVT: LoMemVT,
1766 Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags());
1767 SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Size));
1768 SDValue HiLoad =
1769 DAG.getExtLoad(ExtType: Load->getExtensionType(), dl: SL, VT: HiVT, Chain: Load->getChain(),
1770 Ptr: HiPtr, PtrInfo: SrcValue.getWithOffset(O: LoMemVT.getStoreSize()),
1771 MemVT: HiMemVT, Alignment: HiAlign, MMOFlags: Load->getMemOperand()->getFlags());
1772
1773 SDValue Join;
1774 if (LoVT == HiVT) {
1775 // This is the case where the vector element count is a power of two, so it was split evenly.
1776 Join = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, N1: LoLoad, N2: HiLoad);
1777 } else {
1778 Join = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: SL, VT, N1: DAG.getUNDEF(VT), N2: LoLoad,
1779 N3: DAG.getVectorIdxConstant(Val: 0, DL: SL));
1780 Join = DAG.getNode(
1781 Opcode: HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, DL: SL,
1782 VT, N1: Join, N2: HiLoad,
1783 N3: DAG.getVectorIdxConstant(Val: LoVT.getVectorNumElements(), DL: SL));
1784 }
1785
1786 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1787 LoLoad.getValue(1), HiLoad.getValue(1))};
1788
1789 return DAG.getMergeValues(Ops, SL);
1790}
1791
1792SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1793 SelectionDAG &DAG) const {
1794 LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
1795 EVT VT = Op.getValueType();
1796 SDValue BasePtr = Load->getBasePtr();
1797 EVT MemVT = Load->getMemoryVT();
1798 SDLoc SL(Op);
1799 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1800 Align BaseAlign = Load->getAlign();
1801 unsigned NumElements = MemVT.getVectorNumElements();
1802
1803 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1804 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1805 if (NumElements != 3 ||
1806 (BaseAlign < Align(8) &&
1807 !SrcValue.isDereferenceable(Size: 16, C&: *DAG.getContext(), DL: DAG.getDataLayout())))
1808 return SplitVectorLoad(Op, DAG);
1809
1810 assert(NumElements == 3);
1811
1812 EVT WideVT =
1813 EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
1814 EVT WideMemVT =
1815 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(), NumElements: 4);
1816 SDValue WideLoad = DAG.getExtLoad(
1817 ExtType: Load->getExtensionType(), dl: SL, VT: WideVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
1818 MemVT: WideMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags());
1819 return DAG.getMergeValues(
1820 Ops: {DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT, N1: WideLoad,
1821 N2: DAG.getVectorIdxConstant(Val: 0, DL: SL)),
1822 WideLoad.getValue(R: 1)},
1823 dl: SL);
1824}
1825
1826SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1827 SelectionDAG &DAG) const {
1828 StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
1829 SDValue Val = Store->getValue();
1830 EVT VT = Val.getValueType();
1831
1832 // If this is a 2 element vector, we really want to scalarize and not create
1833 // weird 1 element vectors.
1834 if (VT.getVectorNumElements() == 2)
1835 return scalarizeVectorStore(ST: Store, DAG);
1836
1837 EVT MemVT = Store->getMemoryVT();
1838 SDValue Chain = Store->getChain();
1839 SDValue BasePtr = Store->getBasePtr();
1840 SDLoc SL(Op);
1841
1842 EVT LoVT, HiVT;
1843 EVT LoMemVT, HiMemVT;
1844 SDValue Lo, Hi;
1845
1846 std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
1847 std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
1848 std::tie(args&: Lo, args&: Hi) = splitVector(N: Val, DL: SL, LoVT, HiVT, DAG);
1849
1850 SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: LoMemVT.getStoreSize());
1851
1852 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1853 Align BaseAlign = Store->getAlign();
1854 unsigned Size = LoMemVT.getStoreSize();
1855 Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);
1856
1857 SDValue LoStore =
1858 DAG.getTruncStore(Chain, dl: SL, Val: Lo, Ptr: BasePtr, PtrInfo: SrcValue, SVT: LoMemVT, Alignment: BaseAlign,
1859 MMOFlags: Store->getMemOperand()->getFlags());
1860 SDValue HiStore =
1861 DAG.getTruncStore(Chain, dl: SL, Val: Hi, Ptr: HiPtr, PtrInfo: SrcValue.getWithOffset(O: Size),
1862 SVT: HiMemVT, Alignment: HiAlign, MMOFlags: Store->getMemOperand()->getFlags());
1863
1864 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1865}
1866
1867// This is a shortcut for integer division because we have fast i32<->f32
1868// conversions, and fast f32 reciprocal instructions. The significand of an f32
1869// is wide enough to accurately represent up to a 24-bit signed integer.
1870SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1871 bool Sign) const {
1872 SDLoc DL(Op);
1873 EVT VT = Op.getValueType();
1874 SDValue LHS = Op.getOperand(i: 0);
1875 SDValue RHS = Op.getOperand(i: 1);
1876 MVT IntVT = MVT::i32;
1877 MVT FltVT = MVT::f32;
1878
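// Require at least 9 known sign bits on both operands so they fit in at most
// 24 bits (23 value bits plus a sign bit in the signed case), which an f32
// significand can represent exactly.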
1879 unsigned LHSSignBits = DAG.ComputeNumSignBits(Op: LHS);
1880 if (LHSSignBits < 9)
1881 return SDValue();
1882
1883 unsigned RHSSignBits = DAG.ComputeNumSignBits(Op: RHS);
1884 if (RHSSignBits < 9)
1885 return SDValue();
1886
1887 unsigned BitSize = VT.getSizeInBits();
1888 unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
1889 unsigned DivBits = BitSize - SignBits;
1890 if (Sign)
1891 ++DivBits;
1892
1893 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1894 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1895
1896 SDValue jq = DAG.getConstant(Val: 1, DL, VT: IntVT);
1897
1898 if (Sign) {
1899 // char|short jq = ia ^ ib;
1900 jq = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: RHS);
1901
1902 // jq = jq >> (bitsize - 2)
1903 jq = DAG.getNode(Opcode: ISD::SRA, DL, VT, N1: jq,
1904 N2: DAG.getConstant(Val: BitSize - 2, DL, VT));
1905
1906 // jq = jq | 0x1
1907 jq = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: jq, N2: DAG.getConstant(Val: 1, DL, VT));
1908 }
1909
1910 // int ia = (int)LHS;
1911 SDValue ia = LHS;
1912
1913 // int ib = (int)RHS;
1914 SDValue ib = RHS;
1915
1916 // float fa = (float)ia;
1917 SDValue fa = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ia);
1918
1919 // float fb = (float)ib;
1920 SDValue fb = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ib);
1921
1922 SDValue fq = DAG.getNode(Opcode: ISD::FMUL, DL, VT: FltVT,
1923 N1: fa, N2: DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: FltVT, Operand: fb));
1924
1925 // fq = trunc(fq);
1926 fq = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: FltVT, Operand: fq);
1927
1928 // float fqneg = -fq;
1929 SDValue fqneg = DAG.getNode(Opcode: ISD::FNEG, DL, VT: FltVT, Operand: fq);
1930
1931 MachineFunction &MF = DAG.getMachineFunction();
1932
1933 bool UseFmadFtz = false;
1934 if (Subtarget->isGCN()) {
1935 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1936 UseFmadFtz =
1937 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1938 }
1939
1940 // float fr = mad(fqneg, fb, fa);
1941 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1942 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1943 : (unsigned)ISD::FMAD;
1944 SDValue fr = DAG.getNode(Opcode: OpCode, DL, VT: FltVT, N1: fqneg, N2: fb, N3: fa);
1945
1946 // int iq = (int)fq;
1947 SDValue iq = DAG.getNode(Opcode: ToInt, DL, VT: IntVT, Operand: fq);
1948
1949 // fr = fabs(fr);
1950 fr = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fr);
1951
1952 // fb = fabs(fb);
1953 fb = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fb);
1954
1955 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
1956
1957 // int cv = fr >= fb;
1958 SDValue cv = DAG.getSetCC(DL, VT: SetCCVT, LHS: fr, RHS: fb, Cond: ISD::SETOGE);
1959
1960 // jq = (cv ? jq : 0);
1961 jq = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: cv, N2: jq, N3: DAG.getConstant(Val: 0, DL, VT));
1962
1963 // dst = iq + jq;
1964 SDValue Div = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: iq, N2: jq);
1965
1966 // Rem needs compensation; it's easier to recompute it.
1967 SDValue Rem = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Div, N2: RHS);
1968 Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: LHS, N2: Rem);
1969
1970 // Truncate to number of bits this divide really is.
1971 if (Sign) {
1972 SDValue InRegSize
1973 = DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: DivBits));
1974 Div = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Div, N2: InRegSize);
1975 Rem = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Rem, N2: InRegSize);
1976 } else {
1977 SDValue TruncMask = DAG.getConstant(Val: (UINT64_C(1) << DivBits) - 1, DL, VT);
1978 Div = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Div, N2: TruncMask);
1979 Rem = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Rem, N2: TruncMask);
1980 }
1981
1982 return DAG.getMergeValues(Ops: { Div, Rem }, dl: DL);
1983}
1984
1985void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1986 SelectionDAG &DAG,
1987 SmallVectorImpl<SDValue> &Results) const {
1988 SDLoc DL(Op);
1989 EVT VT = Op.getValueType();
1990
1991 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1992
1993 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
1994
1995 SDValue One = DAG.getConstant(Val: 1, DL, VT: HalfVT);
1996 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: HalfVT);
1997
1998 // HiLo split
1999 SDValue LHS_Lo, LHS_Hi;
2000 SDValue LHS = Op.getOperand(i: 0);
2001 std::tie(args&: LHS_Lo, args&: LHS_Hi) = DAG.SplitScalar(N: LHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2002
2003 SDValue RHS_Lo, RHS_Hi;
2004 SDValue RHS = Op.getOperand(i: 1);
2005 std::tie(args&: RHS_Lo, args&: RHS_Hi) = DAG.SplitScalar(N: RHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2006
2007 if (DAG.MaskedValueIsZero(Op: RHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32)) &&
2008 DAG.MaskedValueIsZero(Op: LHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32))) {
2009
2010 SDValue Res = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2011 N1: LHS_Lo, N2: RHS_Lo);
2012
2013 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2014 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2015
2016 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2017 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2018 return;
2019 }
2020
2021 if (isTypeLegal(MVT::i64)) {
2022 // The algorithm here is based on ideas from "Software Integer Division",
2023 // Tom Rodeheffer, August 2008.
2024
2025 MachineFunction &MF = DAG.getMachineFunction();
2026 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2027
2028 // Compute denominator reciprocal.
2029 unsigned FMAD =
2030 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2031 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2032 ? (unsigned)ISD::FMAD
2033 : (unsigned)AMDGPUISD::FMAD_FTZ;
2034
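// Magic f32 constants used below, given as raw bit patterns:
//   0x4f800000 = 2^32, 0x5f7ffffc = just under 2^64,
//   0x2f800000 = 2^-32, 0xcf800000 = -2^32.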
2035 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2036 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2037 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2038 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2039 Cvt_Lo);
2040 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2041 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2042 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2043 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2044 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2045 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2046 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2047 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2048 Mul1);
2049 SDValue Rcp_Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Mad2);
2050 SDValue Rcp_Hi = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Trunc);
2051 SDValue Rcp64 = DAG.getBitcast(VT,
2052 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2053
2054 SDValue Zero64 = DAG.getConstant(Val: 0, DL, VT);
2055 SDValue One64 = DAG.getConstant(Val: 1, DL, VT);
2056 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2057 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2058
2059 // First round of UNR (Unsigned integer Newton-Raphson).
2060 SDValue Neg_RHS = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero64, N2: RHS);
2061 SDValue Mullo1 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Rcp64);
2062 SDValue Mulhi1 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Rcp64, N2: Mullo1);
2063 SDValue Mulhi1_Lo, Mulhi1_Hi;
2064 std::tie(args&: Mulhi1_Lo, args&: Mulhi1_Hi) =
2065 DAG.SplitScalar(N: Mulhi1, DL, LoVT: HalfVT, HiVT: HalfVT);
2066 SDValue Add1_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Lo,
2067 N2: Mulhi1_Lo, N3: Zero1);
2068 SDValue Add1_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Hi,
2069 N2: Mulhi1_Hi, N3: Add1_Lo.getValue(R: 1));
2070 SDValue Add1 = DAG.getBitcast(VT,
2071 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2072
2073 // Second round of UNR.
2074 SDValue Mullo2 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Add1);
2075 SDValue Mulhi2 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Add1, N2: Mullo2);
2076 SDValue Mulhi2_Lo, Mulhi2_Hi;
2077 std::tie(args&: Mulhi2_Lo, args&: Mulhi2_Hi) =
2078 DAG.SplitScalar(N: Mulhi2, DL, LoVT: HalfVT, HiVT: HalfVT);
2079 SDValue Add2_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Lo,
2080 N2: Mulhi2_Lo, N3: Zero1);
2081 SDValue Add2_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Hi,
2082 N2: Mulhi2_Hi, N3: Add2_Lo.getValue(R: 1));
2083 SDValue Add2 = DAG.getBitcast(VT,
2084 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2085
2086 SDValue Mulhi3 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: LHS, N2: Add2);
2087
2088 SDValue Mul3 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: RHS, N2: Mulhi3);
2089
2090 SDValue Mul3_Lo, Mul3_Hi;
2091 std::tie(args&: Mul3_Lo, args&: Mul3_Hi) = DAG.SplitScalar(N: Mul3, DL, LoVT: HalfVT, HiVT: HalfVT);
2092 SDValue Sub1_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Lo,
2093 N2: Mul3_Lo, N3: Zero1);
2094 SDValue Sub1_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Hi,
2095 N2: Mul3_Hi, N3: Sub1_Lo.getValue(R: 1));
2096 SDValue Sub1_Mi = DAG.getNode(Opcode: ISD::SUB, DL, VT: HalfVT, N1: LHS_Hi, N2: Mul3_Hi);
2097 SDValue Sub1 = DAG.getBitcast(VT,
2098 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2099
2100 SDValue MinusOne = DAG.getConstant(Val: 0xffffffffu, DL, VT: HalfVT);
2101 SDValue C1 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2102 Cond: ISD::SETUGE);
2103 SDValue C2 = DAG.getSelectCC(DL, LHS: Sub1_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2104 Cond: ISD::SETUGE);
2105 SDValue C3 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: C2, False: C1, Cond: ISD::SETEQ);
2106
2107 // TODO: The code here and below could be enclosed in if/endif blocks.
2108 // Currently control flow is unconditional and we have 4 selects after the
2109 // potential endif to substitute for PHIs.
2110
2111 // if C3 != 0 ...
2112 SDValue Sub2_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Lo,
2113 N2: RHS_Lo, N3: Zero1);
2114 SDValue Sub2_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Mi,
2115 N2: RHS_Hi, N3: Sub1_Lo.getValue(R: 1));
2116 SDValue Sub2_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2117 N2: Zero, N3: Sub2_Lo.getValue(R: 1));
2118 SDValue Sub2 = DAG.getBitcast(VT,
2119 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2120
2121 SDValue Add3 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Mulhi3, N2: One64);
2122
2123 SDValue C4 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2124 Cond: ISD::SETUGE);
2125 SDValue C5 = DAG.getSelectCC(DL, LHS: Sub2_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2126 Cond: ISD::SETUGE);
2127 SDValue C6 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: C5, False: C4, Cond: ISD::SETEQ);
2128
2129 // if (C6 != 0)
2130 SDValue Add4 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Add3, N2: One64);
2131
2132 SDValue Sub3_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Lo,
2133 N2: RHS_Lo, N3: Zero1);
2134 SDValue Sub3_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2135 N2: RHS_Hi, N3: Sub2_Lo.getValue(R: 1));
2136 SDValue Sub3_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub3_Mi,
2137 N2: Zero, N3: Sub3_Lo.getValue(R: 1));
2138 SDValue Sub3 = DAG.getBitcast(VT,
2139 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2140
2141 // endif C6
2142 // endif C3
2143
2144 SDValue Sel1 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Add4, False: Add3, Cond: ISD::SETNE);
2145 SDValue Div = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel1, False: Mulhi3, Cond: ISD::SETNE);
2146
2147 SDValue Sel2 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Sub3, False: Sub2, Cond: ISD::SETNE);
2148 SDValue Rem = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel2, False: Sub1, Cond: ISD::SETNE);
2149
2150 Results.push_back(Elt: Div);
2151 Results.push_back(Elt: Rem);
2152
2153 return;
2154 }
2155
2156 // r600 expansion.
2157 // Get speculative values.
2158 SDValue DIV_Part = DAG.getNode(Opcode: ISD::UDIV, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2159 SDValue REM_Part = DAG.getNode(Opcode: ISD::UREM, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2160
2161 SDValue REM_Lo = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: REM_Part, False: LHS_Hi, Cond: ISD::SETEQ);
2162 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2163 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2164
2165 SDValue DIV_Hi = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: DIV_Part, False: Zero, Cond: ISD::SETEQ);
2166 SDValue DIV_Lo = Zero;
2167
2168 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2169
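// Restoring long division over the low half of the dividend: produce one
// quotient bit per iteration by shifting the partial remainder left one bit
// and conditionally subtracting RHS.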
2170 for (unsigned i = 0; i < halfBitWidth; ++i) {
2171 const unsigned bitPos = halfBitWidth - i - 1;
2172 SDValue POS = DAG.getConstant(Val: bitPos, DL, VT: HalfVT);
2173 // Get value of high bit
2174 SDValue HBit = DAG.getNode(Opcode: ISD::SRL, DL, VT: HalfVT, N1: LHS_Lo, N2: POS);
2175 HBit = DAG.getNode(Opcode: ISD::AND, DL, VT: HalfVT, N1: HBit, N2: One);
2176 HBit = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: HBit);
2177
2178 // Shift
2179 REM = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: REM, N2: DAG.getConstant(Val: 1, DL, VT));
2180 // Add LHS high bit
2181 REM = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: REM, N2: HBit);
2182
2183 SDValue BIT = DAG.getConstant(Val: 1ULL << bitPos, DL, VT: HalfVT);
2184 SDValue realBIT = DAG.getSelectCC(DL, LHS: REM, RHS, True: BIT, False: Zero, Cond: ISD::SETUGE);
2185
2186 DIV_Lo = DAG.getNode(Opcode: ISD::OR, DL, VT: HalfVT, N1: DIV_Lo, N2: realBIT);
2187
2188 // Update REM
2189 SDValue REM_sub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: REM, N2: RHS);
2190 REM = DAG.getSelectCC(DL, LHS: REM, RHS, True: REM_sub, False: REM, Cond: ISD::SETUGE);
2191 }
2192
2193 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2194 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2195 Results.push_back(Elt: DIV);
2196 Results.push_back(Elt: REM);
2197}
2198
2199SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2200 SelectionDAG &DAG) const {
2201 SDLoc DL(Op);
2202 EVT VT = Op.getValueType();
2203
2204 if (VT == MVT::i64) {
2205 SmallVector<SDValue, 2> Results;
2206 LowerUDIVREM64(Op, DAG, Results);
2207 return DAG.getMergeValues(Ops: Results, dl: DL);
2208 }
2209
2210 if (VT == MVT::i32) {
2211 if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: false))
2212 return Res;
2213 }
2214
2215 SDValue X = Op.getOperand(i: 0);
2216 SDValue Y = Op.getOperand(i: 1);
2217
2218 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2219 // algorithm used here.
2220
2221 // Initial estimate of inv(y).
2222 SDValue Z = DAG.getNode(Opcode: AMDGPUISD::URECIP, DL, VT, Operand: Y);
2223
2224 // One round of UNR.
2225 SDValue NegY = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Y);
2226 SDValue NegYZ = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: NegY, N2: Z);
2227 Z = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Z,
2228 N2: DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Z, N2: NegYZ));
2229
2230 // Quotient/remainder estimate.
2231 SDValue Q = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: X, N2: Z);
2232 SDValue R =
2233 DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: X, N2: DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Q, N2: Y));
2234
2235 // First quotient/remainder refinement.
2236 EVT CCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2237 SDValue One = DAG.getConstant(Val: 1, DL, VT);
2238 SDValue Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2239 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2240 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2241 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2242 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2243
2244 // Second quotient/remainder refinement.
2245 Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2246 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2247 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2248 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2249 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2250
2251 return DAG.getMergeValues(Ops: {Q, R}, dl: DL);
2252}
2253
2254SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2255 SelectionDAG &DAG) const {
2256 SDLoc DL(Op);
2257 EVT VT = Op.getValueType();
2258
2259 SDValue LHS = Op.getOperand(i: 0);
2260 SDValue RHS = Op.getOperand(i: 1);
2261
2262 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
2263 SDValue NegOne = DAG.getConstant(Val: -1, DL, VT);
2264
2265 if (VT == MVT::i32) {
2266 if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: true))
2267 return Res;
2268 }
2269
2270 if (VT == MVT::i64 &&
2271 DAG.ComputeNumSignBits(LHS) > 32 &&
2272 DAG.ComputeNumSignBits(RHS) > 32) {
2273 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
2274
2275 // HiLo split
2276 SDValue LHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: LHS, N2: Zero);
2277 SDValue RHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: RHS, N2: Zero);
2278 SDValue DIVREM = DAG.getNode(Opcode: ISD::SDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2279 N1: LHS_Lo, N2: RHS_Lo);
2280 SDValue Res[2] = {
2281 DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 0)),
2282 DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 1))
2283 };
2284 return DAG.getMergeValues(Ops: Res, dl: DL);
2285 }
2286
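// Conditionally negate the operands via (x + sign) ^ sign, where sign is
// all-ones for negative x and zero otherwise, perform an unsigned divrem,
// and then restore the result signs via (x ^ sign) - sign.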
2287 SDValue LHSign = DAG.getSelectCC(DL, LHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
2288 SDValue RHSign = DAG.getSelectCC(DL, LHS: RHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
2289 SDValue DSign = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHSign, N2: RHSign);
2290 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2291
2292 LHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: LHS, N2: LHSign);
2293 RHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: RHSign);
2294
2295 LHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: LHSign);
2296 RHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: RHS, N2: RHSign);
2297
2298 SDValue Div = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: LHS, N2: RHS);
2299 SDValue Rem = Div.getValue(R: 1);
2300
2301 Div = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Div, N2: DSign);
2302 Rem = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Rem, N2: RSign);
2303
2304 Div = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Div, N2: DSign);
2305 Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Rem, N2: RSign);
2306
2307 SDValue Res[2] = {
2308 Div,
2309 Rem
2310 };
2311 return DAG.getMergeValues(Ops: Res, dl: DL);
2312}
2313
2314// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2315SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2316 SDLoc SL(Op);
2317 EVT VT = Op.getValueType();
2318 auto Flags = Op->getFlags();
2319 SDValue X = Op.getOperand(i: 0);
2320 SDValue Y = Op.getOperand(i: 1);
2321
2322 SDValue Div = DAG.getNode(Opcode: ISD::FDIV, DL: SL, VT, N1: X, N2: Y, Flags);
2323 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: Div, Flags);
2324 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Trunc, Flags);
2325 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2326 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Neg, N2: Y, N3: X, Flags);
2327}
2328
2329SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2330 SDLoc SL(Op);
2331 SDValue Src = Op.getOperand(i: 0);
2332
2333 // result = trunc(src)
2334 // if (src > 0.0 && src != result)
2335 // result += 1.0
2336
2337 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2338
2339 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2340 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2341
2342 EVT SetCCVT =
2343 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2344
2345 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOGT);
2346 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2347 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2348
2349 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2350 // TODO: Should this propagate fast-math-flags?
2351 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2352}
2353
2354static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2355 SelectionDAG &DAG) {
2356 const unsigned FractBits = 52;
2357 const unsigned ExpBits = 11;
2358
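// The biased exponent occupies bits [62:52] of an f64; given only the high 32
// bits, extract ExpBits starting at bit 52 - 32 = 20 and subtract the bias of
// 1023.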
2359 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2360 Hi,
2361 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2362 DAG.getConstant(ExpBits, SL, MVT::i32));
2363 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2364 DAG.getConstant(1023, SL, MVT::i32));
2365
2366 return Exp;
2367}
2368
2369SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2370 SDLoc SL(Op);
2371 SDValue Src = Op.getOperand(i: 0);
2372
2373 assert(Op.getValueType() == MVT::f64);
2374
2375 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2376
2377 // Extract the upper half, since this is where we will find the sign and
2378 // exponent.
2379 SDValue Hi = getHiHalf64(Op: Src, DAG);
2380
2381 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2382
2383 const unsigned FractBits = 52;
2384
2385 // Extract the sign bit.
2386 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2387 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2388
2389 // Extend back to 64-bits.
2390 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2391 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2392
2393 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2394 const SDValue FractMask
2395 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2396
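// Shifting FractMask right arithmetically by the unbiased exponent leaves a
// mask of the fractional mantissa bits; AND-ing with its complement clears
// them, leaving only the integer part of the value.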
2397 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2398 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2399 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2400
2401 EVT SetCCVT =
2402 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2403
2404 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2405
2406 SDValue ExpLt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: Zero, Cond: ISD::SETLT);
2407 SDValue ExpGt51 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: FiftyOne, Cond: ISD::SETGT);
2408
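// Exp < 0 means |Src| < 1.0, so truncation gives +/-0.0 (just the sign bit).
// Exp > 51 means the value is already an integer and is returned unchanged.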
2409 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2410 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2411
2412 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2413}
2414
2415SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2416 SelectionDAG &DAG) const {
2417 SDLoc SL(Op);
2418 SDValue Src = Op.getOperand(i: 0);
2419
2420 assert(Op.getValueType() == MVT::f64);
2421
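// Adding and then subtracting 2^52 with Src's sign forces rounding to the
// nearest integer (ties to even), since f64 has 52 fraction bits. Values with
// magnitude above 0x1.fffffffffffffp+51 are already integral and are passed
// through unchanged by the final select.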
2422 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2423 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2424 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2425
2426 // TODO: Should this propagate fast-math-flags?
2427
2428 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2429 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2430
2431 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2432
2433 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2434 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2435
2436 EVT SetCCVT =
2437 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2438 SDValue Cond = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Fabs, RHS: C2, Cond: ISD::SETOGT);
2439
2440 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2441}
2442
2443SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2444 SelectionDAG &DAG) const {
2445 // FNEARBYINT and FRINT are the same, except in their handling of FP
2446 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2447 // rint, so just treat them as equivalent.
2448 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT: Op.getValueType(),
2449 Operand: Op.getOperand(i: 0));
2450}
2451
2452SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2453 auto VT = Op.getValueType();
2454 auto Arg = Op.getOperand(i: 0u);
2455 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT, Operand: Arg);
2456}
2457
2458// XXX - May require not supporting f32 denormals?
2459
2460// Don't handle v2f16. The extra instructions to scalarize and repack around the
2461// compare and vselect end up producing worse code than scalarizing the whole
2462// operation.
2463SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2464 SDLoc SL(Op);
2465 SDValue X = Op.getOperand(i: 0);
2466 EVT VT = Op.getValueType();
2467
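// round(x) == trunc(x) + copysign(|x - trunc(x)| >= 0.5 ? 1.0 : 0.0, x)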
2468 SDValue T = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: X);
2469
2470 // TODO: Should this propagate fast-math-flags?
2471
2472 SDValue Diff = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: T);
2473
2474 SDValue AbsDiff = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Diff);
2475
2476 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2477 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2478
2479 EVT SetCCVT =
2480 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2481
2482 const SDValue Half = DAG.getConstantFP(Val: 0.5, DL: SL, VT);
2483 SDValue Cmp = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsDiff, RHS: Half, Cond: ISD::SETOGE);
2484 SDValue OneOrZeroFP = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cmp, N2: One, N3: Zero);
2485
2486 SDValue SignedOffset = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT, N1: OneOrZeroFP, N2: X);
2487 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: T, N2: SignedOffset);
2488}
2489
2490SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2491 SDLoc SL(Op);
2492 SDValue Src = Op.getOperand(i: 0);
2493
2494 // result = trunc(src);
2495 // if (src < 0.0 && src != result)
2496 // result += -1.0.
2497
2498 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2499
2500 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2501 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2502
2503 EVT SetCCVT =
2504 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2505
2506 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOLT);
2507 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2508 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2509
2510 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2511 // TODO: Should this propagate fast-math-flags?
2512 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2513}
2514
2515/// Return true if it's known that \p Src can never be an f32 denormal value.
2516static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2517 switch (Src.getOpcode()) {
2518 case ISD::FP_EXTEND:
2519 return Src.getOperand(0).getValueType() == MVT::f16;
2520 case ISD::FP16_TO_FP:
2521 case ISD::FFREXP:
2522 return true;
2523 case ISD::INTRINSIC_WO_CHAIN: {
2524 unsigned IntrinsicID = Src.getConstantOperandVal(i: 0);
2525 switch (IntrinsicID) {
2526 case Intrinsic::amdgcn_frexp_mant:
2527 return true;
2528 default:
2529 return false;
2530 }
2531 }
2532 default:
2533 return false;
2534 }
2535
2536 llvm_unreachable("covered opcode switch");
2537}
2538
2539bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
2540 SDNodeFlags Flags) {
2541 if (Flags.hasApproximateFuncs())
2542 return true;
2543 auto &Options = DAG.getTarget().Options;
2544 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2545}
2546
2547bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
2548 SDValue Src,
2549 SDNodeFlags Flags) {
2550 return !valueIsKnownNeverF32Denorm(Src) &&
2551 DAG.getMachineFunction()
2552 .getDenormalMode(FPType: APFloat::IEEEsingle())
2553 .Input != DenormalMode::PreserveSign;
2554}
2555
2556SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2557 SDValue Src,
2558 SDNodeFlags Flags) const {
2559 SDLoc SL(Src);
2560 EVT VT = Src.getValueType();
2561 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2562 SDValue SmallestNormal =
2563 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2564
2565 // Want to scale denormals up, but negatives and 0 work just as well on the
2566 // scaled path.
2567 SDValue IsLtSmallestNormal = DAG.getSetCC(
2568 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2569 RHS: SmallestNormal, Cond: ISD::SETOLT);
2570
2571 return IsLtSmallestNormal;
2572}
2573
2574SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2575 SDNodeFlags Flags) const {
2576 SDLoc SL(Src);
2577 EVT VT = Src.getValueType();
2578 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2579 SDValue Inf = DAG.getConstantFP(Val: APFloat::getInf(Sem: Semantics), DL: SL, VT);
2580
2581 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Src, Flags);
2582 SDValue IsFinite = DAG.getSetCC(
2583 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Fabs,
2584 RHS: Inf, Cond: ISD::SETOLT);
2585 return IsFinite;
2586}
2587
2588/// If denormal handling is required, return the scaled input to FLOG2 and the
2589/// check for the denormal range. Otherwise, return null values.
2590std::pair<SDValue, SDValue>
2591AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2592 SDValue Src, SDNodeFlags Flags) const {
2593 if (!needsDenormHandlingF32(DAG, Src, Flags))
2594 return {};
2595
2596 MVT VT = MVT::f32;
2597 const fltSemantics &Semantics = APFloat::IEEEsingle();
2598 SDValue SmallestNormal =
2599 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2600
2601 SDValue IsLtSmallestNormal = DAG.getSetCC(
2602 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2603 RHS: SmallestNormal, Cond: ISD::SETOLT);
2604
2605 SDValue Scale32 = DAG.getConstantFP(Val: 0x1.0p+32, DL: SL, VT);
2606 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2607 SDValue ScaleFactor =
2608 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: Scale32, N3: One, Flags);
2609
2610 SDValue ScaledInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Src, N2: ScaleFactor, Flags);
2611 return {ScaledInput, IsLtSmallestNormal};
2612}
2613
2614SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2615 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2616 // If we have to handle denormals, scale up the input and adjust the result.
2617
2618 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2619 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2620
2621 SDLoc SL(Op);
2622 EVT VT = Op.getValueType();
2623 SDValue Src = Op.getOperand(i: 0);
2624 SDNodeFlags Flags = Op->getFlags();
2625
2626 if (VT == MVT::f16) {
2627 // Nothing in half is a denormal when promoted to f32.
2628 assert(!Subtarget->has16BitInsts());
2629 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2630 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2631 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2632 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2633 }
2634
2635 auto [ScaledInput, IsLtSmallestNormal] =
2636 getScaledLogInput(DAG, SL, Src, Flags);
2637 if (!ScaledInput)
2638 return DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: Src, Flags);
2639
2640 SDValue Log2 = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
2641
2642 SDValue ThirtyTwo = DAG.getConstantFP(Val: 32.0, DL: SL, VT);
2643 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2644 SDValue ResultOffset =
2645 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: ThirtyTwo, N3: Zero);
2646 return DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: Log2, N2: ResultOffset, Flags);
2647}
2648
2649static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2650 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2651 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Y, Flags);
2652 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: C, Flags);
2653}
2654
2655SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2656 SelectionDAG &DAG) const {
2657 SDValue X = Op.getOperand(i: 0);
2658 EVT VT = Op.getValueType();
2659 SDNodeFlags Flags = Op->getFlags();
2660 SDLoc DL(Op);
2661
2662 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2663 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2664
2665 const auto &Options = getTargetMachine().Options;
2666 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2667 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2668
2669 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2670 // Log and multiply in f32 is good enough for f16.
2671 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2672 }
2673
2674 SDValue Lowered = LowerFLOGUnsafe(Op: X, SL: DL, DAG, IsLog10, Flags);
2675 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2676 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2677 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2678 }
2679
2680 return Lowered;
2681 }
2682
2683 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL: DL, Src: X, Flags);
2684 if (ScaledInput)
2685 X = ScaledInput;
2686
2687 SDValue Y = DAG.getNode(Opcode: AMDGPUISD::LOG, DL, VT, Operand: X, Flags);
2688
2689 SDValue R;
2690 if (Subtarget->hasFastFMAF32()) {
2691 // c + cc is ln(2)/ln(10) to more than 49 bits
2692 const float c_log10 = 0x1.344134p-2f;
2693 const float cc_log10 = 0x1.09f79ep-26f;
2694
2695 // c + cc is ln(2) to more than 49 bits
2696 const float c_log = 0x1.62e42ep-1f;
2697 const float cc_log = 0x1.efa39ep-25f;
2698
2699 SDValue C = DAG.getConstantFP(Val: IsLog10 ? c_log10 : c_log, DL, VT);
2700 SDValue CC = DAG.getConstantFP(Val: IsLog10 ? cc_log10 : cc_log, DL, VT);
2701
2702 R = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Y, N2: C, Flags);
2703 SDValue NegR = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: R, Flags);
2704 SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: C, N3: NegR, Flags);
2705 SDValue FMA1 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: CC, N3: FMA0, Flags);
2706 R = DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: R, N2: FMA1, Flags);
2707 } else {
2708 // ch+ct is ln(2)/ln(10) to more than 36 bits
2709 const float ch_log10 = 0x1.344000p-2f;
2710 const float ct_log10 = 0x1.3509f6p-18f;
2711
2712 // ch + ct is ln(2) to more than 36 bits
2713 const float ch_log = 0x1.62e000p-1f;
2714 const float ct_log = 0x1.0bfbe8p-15f;
2715
2716 SDValue CH = DAG.getConstantFP(Val: IsLog10 ? ch_log10 : ch_log, DL, VT);
2717 SDValue CT = DAG.getConstantFP(Val: IsLog10 ? ct_log10 : ct_log, DL, VT);
2718
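// Split Y into a high part with only the top mantissa bits and a low-order
// remainder, so that the YH * CH product is exact in f32; the smaller
// partial products are then folded in with mads.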
2719 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2720 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2721 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2722 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2723 SDValue YT = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: Y, N2: YH, Flags);
2724
2725 SDValue YTCT = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: YT, N2: CT, Flags);
2726 SDValue Mad0 = getMad(DAG, SL: DL, VT, X: YH, Y: CT, C: YTCT, Flags);
2727 SDValue Mad1 = getMad(DAG, SL: DL, VT, X: YT, Y: CH, C: Mad0, Flags);
2728 R = getMad(DAG, SL: DL, VT, X: YH, Y: CH, C: Mad1);
2729 }
2730
2731 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2732 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2733
2734 // TODO: Check if known finite from source value.
2735 if (!IsFiniteOnly) {
2736 SDValue IsFinite = getIsFinite(DAG, Src: Y, Flags);
2737 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsFinite, N2: R, N3: Y, Flags);
2738 }
2739
2740 if (IsScaled) {
2741 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
2742 SDValue ShiftK =
2743 DAG.getConstantFP(Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2744 SDValue Shift =
2745 DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsScaled, N2: ShiftK, N3: Zero, Flags);
2746 R = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: R, N2: Shift, Flags);
2747 }
2748
2749 return R;
2750}
2751
2752SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2753 return LowerFLOGCommon(Op, DAG);
2754}
2755
2756// Do f32 fast math expansion for flog or flog10. This is accurate enough for a
2757// promoted f16 operation.
2758SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2759 SelectionDAG &DAG, bool IsLog10,
2760 SDNodeFlags Flags) const {
2761 EVT VT = Src.getValueType();
2762 unsigned LogOp =
2763 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2764
2765 double Log2BaseInverted =
2766 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2767
2768 if (VT == MVT::f32) {
2769 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2770 if (ScaledInput) {
2771 SDValue LogSrc = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
2772 SDValue ScaledResultOffset =
2773 DAG.getConstantFP(Val: -32.0 * Log2BaseInverted, DL: SL, VT);
2774
2775 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL: SL, VT);
2776
2777 SDValue ResultOffset = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsScaled,
2778 N2: ScaledResultOffset, N3: Zero, Flags);
2779
2780 SDValue Log2Inv = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);
2781
2782 if (Subtarget->hasFastFMAF32())
2783 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: LogSrc, N2: Log2Inv, N3: ResultOffset,
2784 Flags);
2785 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LogSrc, N2: Log2Inv, Flags);
2786 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: ResultOffset);
2787 }
2788 }
2789
2790 SDValue Log2Operand = DAG.getNode(Opcode: LogOp, DL: SL, VT, Operand: Src, Flags);
2791 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);
2792
2793 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Log2Operand, N2: Log2BaseInvertedOperand,
2794 Flags);
2795}
2796
2797SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2798 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2799 // If we have to handle denormals, scale up the input and adjust the result.
2800
2801 SDLoc SL(Op);
2802 EVT VT = Op.getValueType();
2803 SDValue Src = Op.getOperand(i: 0);
2804 SDNodeFlags Flags = Op->getFlags();
2805
2806 if (VT == MVT::f16) {
2807 // Nothing in half is a denormal when promoted to f32.
2808 assert(!Subtarget->has16BitInsts());
2809 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2810 SDValue Exp = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2811 return DAG.getNode(ISD::FP_ROUND, SL, VT, Exp,
2812 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2813 }
2814
2815 assert(VT == MVT::f32);
2816
2817 if (!needsDenormHandlingF32(DAG, Src, Flags))
2818 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2819
2820 // bool needs_scaling = x < -0x1.f80000p+6f;
2821 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2822
2823 // -126.0; for inputs below this, v_exp_f32 would produce a denormal result.
2824 SDValue RangeCheckConst = DAG.getConstantFP(Val: -0x1.f80000p+6f, DL: SL, VT);
2825
2826 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2827
2828 SDValue NeedsScaling =
2829 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: RangeCheckConst, Cond: ISD::SETOLT);
2830
2831 SDValue SixtyFour = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
2832 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2833
2834 SDValue AddOffset =
2835 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: SixtyFour, N3: Zero);
2836
2837 SDValue AddInput = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Src, N2: AddOffset, Flags);
2838 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: AddInput, Flags);
2839
2840 SDValue TwoExpNeg64 = DAG.getConstantFP(Val: 0x1.0p-64f, DL: SL, VT);
2841 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2842 SDValue ResultScale =
2843 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: TwoExpNeg64, N3: One);
2844
2845 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScale, Flags);
2846}
2847
2848SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2849 SelectionDAG &DAG,
2850 SDNodeFlags Flags) const {
2851 EVT VT = X.getValueType();
2852 const SDValue Log2E = DAG.getConstantFP(Val: numbers::log2e, DL: SL, VT);
2853
2854 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2855 // exp2(M_LOG2E_F * f);
2856 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Log2E, Flags);
2857 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2858 : (unsigned)ISD::FEXP2,
2859 SL, VT, Mul, Flags);
2860 }
2861
2862 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2863
2864 SDValue Threshold = DAG.getConstantFP(Val: -0x1.5d58a0p+6f, DL: SL, VT);
2865 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
2866
2867 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
2868
2869 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
2870
2871 SDValue AdjustedX =
2872 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
2873
2874 SDValue ExpInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: Log2E, Flags);
2875
2876 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: ExpInput, Flags);
2877
2878 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.969d48p-93f, DL: SL, VT);
2879 SDValue AdjustedResult =
2880 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScaleFactor, Flags);
2881
2882 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: Exp2,
2883 Flags);
2884}
2885
2886/// Emit an approx-funcs (afn) appropriate lowering for exp10. Inf/NaN inputs
2887/// should still be handled correctly.
2888SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2889 SelectionDAG &DAG,
2890 SDNodeFlags Flags) const {
2891 const EVT VT = X.getValueType();
2892 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2893
2894 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2895 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2896 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
2897 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
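    // For reference: K0 + K1 together approximate log2(10) ~= 3.3219281; K0
    // carries the high-order bits and K1 the low-order remainder, so
    // exp2(x*K0) * exp2(x*K1) ~= exp2(x*log2(10)) = exp10(x).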
2898
2899 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K0, Flags);
2900 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
2901 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K1, Flags);
2902 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
2903 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1);
2904 }
2905
2906 // bool s = x < -0x1.2f7030p+5f;
2907 // x += s ? 0x1.0p+5f : 0.0f;
2908 // exp10 = exp2(x * 0x1.a92000p+1f) *
2909 // exp2(x * 0x1.4f0978p-11f) *
2910 // (s ? 0x1.9f623ep-107f : 1.0f);
2911
2912 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2913
2914 SDValue Threshold = DAG.getConstantFP(Val: -0x1.2f7030p+5f, DL: SL, VT);
2915 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
2916
2917 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+5f, DL: SL, VT);
2918 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
2919 SDValue AdjustedX =
2920 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
2921
2922 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
2923 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
2924
2925 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K0, Flags);
2926 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
2927 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K1, Flags);
2928 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
2929
2930 SDValue MulExps = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1, Flags);
2931
2932 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.9f623ep-107f, DL: SL, VT);
2933 SDValue AdjustedResult =
2934 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: MulExps, N2: ResultScaleFactor, Flags);
2935
2936 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: MulExps,
2937 Flags);
2938}
2939
2940SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2941 EVT VT = Op.getValueType();
2942 SDLoc SL(Op);
2943 SDValue X = Op.getOperand(i: 0);
2944 SDNodeFlags Flags = Op->getFlags();
2945 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2946
2947 if (VT.getScalarType() == MVT::f16) {
2948 // v_exp_f16 (fmul x, log2e)
2949 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2950 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2951
2952 if (VT.isVector())
2953 return SDValue();
2954
2955 // exp(f16 x) ->
2956 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2957
2958 // Nothing in half is a denormal when promoted to f32.
2959 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2960 SDValue Lowered = lowerFEXPUnsafe(X: Ext, SL, DAG, Flags);
2961 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
2962 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2963 }
2964
2965 assert(VT == MVT::f32);
2966
2967 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
2968 // library behavior. Also, is known-not-daz source sufficient?
2969 if (allowApproxFunc(DAG, Flags)) {
2970 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
2971 : lowerFEXPUnsafe(X, SL, DAG, Flags);
2972 }
2973
2974 // Algorithm:
2975 //
2976 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
2977 //
2978 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
2979 // n = 64*m + j, 0 <= j < 64
2980 //
2981 // e^x = 2^((64*m + j + f)/64)
2982 // = (2^m) * (2^(j/64)) * 2^(f/64)
2983 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
2984 //
2985 // f = x*(64/ln(2)) - n
2986 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
2987 //
2988 // e^x = (2^m) * (2^(j/64)) * e^r
2989 //
2990 // (2^(j/64)) is precomputed
2991 //
2992 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2993 // e^r = 1 + q
2994 //
2995 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2996 //
2997 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
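  //
  // For illustration only: the DAG built below uses a simplified variant of
  // this reduction (no 2^(j/64) table; v_exp_f32 handles the fractional
  // part), roughly equivalent to the following scalar sketch of the fast-FMA
  // path (names are purely expository):
  //
  //   float ph = x * c;                      // c ~= log2(e) (or log2(10))
  //   float pl = fma(x, cc, fma(x, c, -ph)); // low-order correction term
  //   float e  = roundeven(ph);              // integer part n
  //   float r  = (ph - e) + pl;              // reduced argument
  //   return ldexp(exp2(r), (int)e);         // 2^r scaled back by 2^n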
2998 SDNodeFlags FlagsNoContract = Flags;
2999 FlagsNoContract.setAllowContract(false);
3000
3001 SDValue PH, PL;
3002 if (Subtarget->hasFastFMAF32()) {
3003 const float c_exp = numbers::log2ef;
3004 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3005 const float c_exp10 = 0x1.a934f0p+1f;
3006 const float cc_exp10 = 0x1.2f346ep-24f;
3007
3008 SDValue C = DAG.getConstantFP(Val: IsExp10 ? c_exp10 : c_exp, DL: SL, VT);
3009 SDValue CC = DAG.getConstantFP(Val: IsExp10 ? cc_exp10 : cc_exp, DL: SL, VT);
3010
3011 PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: C, Flags);
3012 SDValue NegPH = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: PH, Flags);
3013 SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: C, N3: NegPH, Flags);
3014 PL = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: CC, N3: FMA0, Flags);
3015 } else {
3016 const float ch_exp = 0x1.714000p+0f;
3017 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3018
3019 const float ch_exp10 = 0x1.a92000p+1f;
3020 const float cl_exp10 = 0x1.4f0978p-11f;
3021
3022 SDValue CH = DAG.getConstantFP(Val: IsExp10 ? ch_exp10 : ch_exp, DL: SL, VT);
3023 SDValue CL = DAG.getConstantFP(Val: IsExp10 ? cl_exp10 : cl_exp, DL: SL, VT);
3024
3025 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3026 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3027 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3028 SDValue XH = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: XHAsInt);
3029 SDValue XL = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: XH, Flags);
3030
3031 PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XH, N2: CH, Flags);
3032
3033 SDValue XLCL = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XL, N2: CL, Flags);
3034 SDValue Mad0 = getMad(DAG, SL, VT, X: XL, Y: CH, C: XLCL, Flags);
3035 PL = getMad(DAG, SL, VT, X: XH, Y: CL, C: Mad0, Flags);
3036 }
3037
3038 SDValue E = DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SL, VT, Operand: PH, Flags);
3039
3040 // It is unsafe to contract this fsub into the PH multiply.
3041 SDValue PHSubE = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: PH, N2: E, Flags: FlagsNoContract);
3042
3043 SDValue A = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: PHSubE, N2: PL, Flags);
3044 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3045 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: A, Flags);
3046
3047 SDValue R = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: Exp2, N2: IntE, Flags);
3048
3049 SDValue UnderflowCheckConst =
3050 DAG.getConstantFP(Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, DL: SL, VT);
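  // For reference, these are roughly log10(0x1.0p-149) ~= -44.85 and
  // ln(0x1.0p-149) ~= -103.28; below these inputs the result is too small to
  // represent even as a denormal and flushes to zero.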
3051
3052 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3053 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
3054 SDValue Underflow =
3055 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: UnderflowCheckConst, Cond: ISD::SETOLT);
3056
3057 R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Underflow, N2: Zero, N3: R);
3058 const auto &Options = getTargetMachine().Options;
3059
3060 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3061 SDValue OverflowCheckConst =
3062 DAG.getConstantFP(Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, DL: SL, VT);
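    // For reference, these are roughly log10(0x1.0p+128) ~= 38.53 and
    // ln(0x1.0p+128) ~= 88.72; above these inputs the result exceeds the
    // largest finite f32.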
3063 SDValue Overflow =
3064 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: OverflowCheckConst, Cond: ISD::SETOGT);
3065 SDValue Inf =
3066 DAG.getConstantFP(Val: APFloat::getInf(Sem: APFloat::IEEEsingle()), DL: SL, VT);
3067 R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Overflow, N2: Inf, N3: R);
3068 }
3069
3070 return R;
3071}
3072
3073static bool isCtlzOpc(unsigned Opc) {
3074 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3075}
3076
3077static bool isCttzOpc(unsigned Opc) {
3078 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3079}
3080
3081SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3082 SelectionDAG &DAG) const {
3083 auto SL = SDLoc(Op);
3084 auto Arg = Op.getOperand(i: 0u);
3085 auto ResultVT = Op.getValueType();
3086
3087 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3088 return {};
3089
3090 assert(isCtlzOpc(Op.getOpcode()));
3091 assert(ResultVT == Arg.getValueType());
3092
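  // For example, for an i16 input 0x00f0: the zero-extended i32 value
  // 0x000000f0 has ctlz = 24, and subtracting the 16 leading zeroes
  // introduced by the extension gives 8, which is ctlz of the original i16.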
3093 auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
3094 auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
3095 auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3096 NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
3097 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
3098 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3099}
3100
3101SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3102 SDLoc SL(Op);
3103 SDValue Src = Op.getOperand(i: 0);
3104
3105 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3106 bool Ctlz = isCtlzOpc(Opc: Op.getOpcode());
3107 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3108
3109 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3110 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3111 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3112
3113 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3114 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3115 // (cttz hi:lo) -> (umin (ffbl src), 32)
3116 // (ctlz_zero_undef src) -> (ffbh src)
3117 // (cttz_zero_undef src) -> (ffbl src)
3118
3119 // The 64-bit scalar version produces a 32-bit result:
3120 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3121 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3122 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3123 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3124 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3125 if (!ZeroUndef) {
3126 const SDValue ConstVal = DAG.getConstant(
3127 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3128 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3129 }
3130 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: Src.getValueType(), Operand: NewOpr);
3131 }
3132
3133 SDValue Lo, Hi;
3134 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3135
3136 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3137 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3138
3139 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3140 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3141 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3142 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
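  // For example, (ctlz i64 0x0000000100000000): ffbh(hi) = 31, and since lo
  // is zero ffbh(lo) = -1, so uaddsat(-1, 32) saturates to 0xffffffff and
  // umin3(31, 0xffffffff, 64) = 31.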
3143
3144 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3145 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3146 if (Ctlz)
3147 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3148 else
3149 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3150
3151 SDValue NewOpr;
3152 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3153 if (!ZeroUndef) {
3154 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3155 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3156 }
3157
3158 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3159}
3160
3161SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3162 bool Signed) const {
3163 // The regular method of converting a 64-bit integer to a float roughly
3164 // consists of 2 steps: normalization and rounding. In fact, after
3165 // normalization, the conversion from a 64-bit integer to a float is
3166 // essentially the same as the one from a 32-bit integer. The only difference
3167 // is that it has more trailing bits to be rounded. To leverage the native
3168 // 32-bit conversion, a 64-bit integer is preprocessed to fit into a 32-bit
3169 // integer and then converted into the correct float number. The basic steps
3170 // for the unsigned conversion are illustrated in the following pseudo code:
3171 //
3172 // f32 uitofp(i64 u) {
3173 // i32 hi, lo = split(u);
3174 // // Only count the leading zeros in hi as we have native support of the
3175 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3176 // // reduced to a 32-bit one automatically.
3177 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3178 // u <<= shamt;
3179 // hi, lo = split(u);
3180 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3181 // // convert it as a 32-bit integer and scale the result back.
3182 // return uitofp(hi) * 2^(32 - shamt);
3183 // }
3184 //
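  // For example, u = 2^40: hi = 0x100, lo = 0, so shamt = clz(hi) = 23; after
  // the shift hi = 0x80000000, and uitofp(hi) = 2^31 scaled back by
  // 2^(32 - 23) = 2^9 gives 2^40 exactly.
  //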
3185 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3186 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3187 // converted instead, followed by negation based on the original sign bit.
3188
3189 SDLoc SL(Op);
3190 SDValue Src = Op.getOperand(i: 0);
3191
3192 SDValue Lo, Hi;
3193 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3194 SDValue Sign;
3195 SDValue ShAmt;
3196 if (Signed && Subtarget->isGCN()) {
3197 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3198 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3199 // account. That is, the maximal shift is
3200 // - 32 if Lo and Hi have opposite signs;
3201 // - 33 if Lo and Hi have the same sign.
3202 //
3203 // Or, MaxShAmt = 33 + OppositeSign, where
3204 //
3205 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3206 // - -1 if Lo and Hi have opposite signs; and
3207 // - 0 otherwise.
3208 //
3209 // All in all, ShAmt is calculated as
3210 //
3211 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3212 //
3213 // or
3214 //
3215 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3216 //
3217 // to reduce the critical path.
3218 SDValue OppositeSign = DAG.getNode(
3219 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3220 DAG.getConstant(31, SL, MVT::i32));
3221 SDValue MaxShAmt =
3222 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3223 OppositeSign);
3224 // Count the leading sign bits.
3225 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3226 // Unlike the unsigned conversion, the shift should be one bit less to
3227 // preserve the sign bit.
3228 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3229 DAG.getConstant(1, SL, MVT::i32));
3230 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3231 } else {
3232 if (Signed) {
3233 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3234 // absolute value first.
3235 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3236 DAG.getConstant(63, SL, MVT::i64));
3237 SDValue Abs =
3238 DAG.getNode(ISD::XOR, SL, MVT::i64,
3239 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3240 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Abs, DAG);
3241 }
3242 // Count the leading zeros.
3243 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3244 // The shift amount for signed integers is [0, 32].
3245 }
3246 // Normalize the given 64-bit integer.
3247 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3248 // Split it again.
3249 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Norm, DAG);
3250 // Calculate the adjust bit for rounding.
3251 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3252 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3253 DAG.getConstant(1, SL, MVT::i32), Lo);
3254 // Get the 32-bit normalized integer.
3255 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3256 // Convert the normalized 32-bit integer into f32.
3257 unsigned Opc =
3258 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3259 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3260
3261 // Finally, need to scale back the converted floating number as the original
3262 // 64-bit integer is converted as a 32-bit one.
3263 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3264 ShAmt);
3265 // On GCN, use LDEXP directly.
3266 if (Subtarget->isGCN())
3267 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3268
3269 // Otherwise, shift 'ShAmt' into the exponent position and add it to the
3270 // exponent field directly to emulate the multiplication by 2^ShAmt. The
3271 // 8-bit exponent field has enough headroom to avoid overflowing into the sign bit.
3272 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3273 DAG.getConstant(23, SL, MVT::i32));
3274 SDValue IVal =
3275 DAG.getNode(ISD::ADD, SL, MVT::i32,
3276 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3277 if (Signed) {
3278 // Set the sign bit.
3279 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3280 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3281 DAG.getConstant(31, SL, MVT::i32));
3282 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3283 }
3284 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3285}
3286
3287SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3288 bool Signed) const {
3289 SDLoc SL(Op);
3290 SDValue Src = Op.getOperand(i: 0);
3291
3292 SDValue Lo, Hi;
3293 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3294
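  // Conceptually the result is ldexp((f64)hi, 32) + (f64)(u32)lo, where hi is
  // treated as signed or unsigned depending on the conversion; both partial
  // conversions are exact in f64, so only the final add rounds.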
3295 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3296 SL, MVT::f64, Hi);
3297
3298 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3299
3300 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3301 DAG.getConstant(32, SL, MVT::i32));
3302 // TODO: Should this propagate fast-math-flags?
3303 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3304}
3305
3306SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3307 SelectionDAG &DAG) const {
3308 // TODO: Factor out code common with LowerSINT_TO_FP.
3309 EVT DestVT = Op.getValueType();
3310 SDValue Src = Op.getOperand(i: 0);
3311 EVT SrcVT = Src.getValueType();
3312
3313 if (SrcVT == MVT::i16) {
3314 if (DestVT == MVT::f16)
3315 return Op;
3316 SDLoc DL(Op);
3317
3318 // Promote src to i32
3319 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3320 return DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3321 }
3322
3323 if (DestVT == MVT::bf16) {
3324 SDLoc SL(Op);
3325 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3326 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3327 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3328 }
3329
3330 if (SrcVT != MVT::i64)
3331 return Op;
3332
3333 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3334 SDLoc DL(Op);
3335
3336 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3337 SDValue FPRoundFlag =
3338 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3339 SDValue FPRound =
3340 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3341
3342 return FPRound;
3343 }
3344
3345 if (DestVT == MVT::f32)
3346 return LowerINT_TO_FP32(Op, DAG, Signed: false);
3347
3348 assert(DestVT == MVT::f64);
3349 return LowerINT_TO_FP64(Op, DAG, Signed: false);
3350}
3351
3352SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3353 SelectionDAG &DAG) const {
3354 EVT DestVT = Op.getValueType();
3355
3356 SDValue Src = Op.getOperand(i: 0);
3357 EVT SrcVT = Src.getValueType();
3358
3359 if (SrcVT == MVT::i16) {
3360 if (DestVT == MVT::f16)
3361 return Op;
3362
3363 SDLoc DL(Op);
3364 // Promote src to i32
3365 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3366 return DAG.getNode(Opcode: ISD::SINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3367 }
3368
3369 if (DestVT == MVT::bf16) {
3370 SDLoc SL(Op);
3371 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3372 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3373 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3374 }
3375
3376 if (SrcVT != MVT::i64)
3377 return Op;
3378
3379 // TODO: Factor out code common with LowerUINT_TO_FP.
3380
3381 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3382 SDLoc DL(Op);
3383 SDValue Src = Op.getOperand(i: 0);
3384
3385 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3386 SDValue FPRoundFlag =
3387 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3388 SDValue FPRound =
3389 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3390
3391 return FPRound;
3392 }
3393
3394 if (DestVT == MVT::f32)
3395 return LowerINT_TO_FP32(Op, DAG, Signed: true);
3396
3397 assert(DestVT == MVT::f64);
3398 return LowerINT_TO_FP64(Op, DAG, Signed: true);
3399}
3400
3401SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3402 bool Signed) const {
3403 SDLoc SL(Op);
3404
3405 SDValue Src = Op.getOperand(i: 0);
3406 EVT SrcVT = Src.getValueType();
3407
3408 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3409
3410 // The basic idea of converting a floating point number into a pair of 32-bit
3411 // integers is illustrated as follows:
3412 //
3413 // tf := trunc(val);
3414 // hif := floor(tf * 2^-32);
3415 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3416 // hi := fptoi(hif);
3417 // lo := fptoi(lof);
3418 //
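  // For example, val = 0x1.8p+32 (6442450944.0): hif = 1.0 and lof = 0x1.0p+31,
  // so hi = 1 and lo = 0x80000000, reassembling to 0x0000000180000000.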
3419 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: SrcVT, Operand: Src);
3420 SDValue Sign;
3421 if (Signed && SrcVT == MVT::f32) {
3422 // However, a 32-bit floating point number has only a 23-bit mantissa, which
3423 // is not enough to hold all the significant bits of `lof` if val is
3424 // negative. To avoid the loss of precision, we take the absolute value
3425 // after truncating and flip the result back based on the original
3426 // signedness.
3427 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3428 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3429 DAG.getConstant(31, SL, MVT::i32));
3430 Trunc = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: SrcVT, Operand: Trunc);
3431 }
3432
3433 SDValue K0, K1;
3434 if (SrcVT == MVT::f64) {
3435 K0 = DAG.getConstantFP(
3436 Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), DL: SL,
3437 VT: SrcVT);
3438 K1 = DAG.getConstantFP(
3439 Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), DL: SL,
3440 VT: SrcVT);
3441 } else {
3442 K0 = DAG.getConstantFP(
3443 Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), DL: SL, VT: SrcVT);
3444 K1 = DAG.getConstantFP(
3445 Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), DL: SL, VT: SrcVT);
3446 }
3447 // TODO: Should this propagate fast-math-flags?
3448 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: SrcVT, N1: Trunc, N2: K0);
3449
3450 SDValue FloorMul = DAG.getNode(Opcode: ISD::FFLOOR, DL: SL, VT: SrcVT, Operand: Mul);
3451
3452 SDValue Fma = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: SrcVT, N1: FloorMul, N2: K1, N3: Trunc);
3453
3454 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3455 : ISD::FP_TO_UINT,
3456 SL, MVT::i32, FloorMul);
3457 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3458
3459 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3460 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3461
3462 if (Signed && SrcVT == MVT::f32) {
3463 assert(Sign);
3464 // Flip the result based on the signedness, which is either all 0s or 1s.
3465 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3466 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3467 // r := xor(r, sign) - sign;
3468 Result =
3469 DAG.getNode(ISD::SUB, SL, MVT::i64,
3470 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3471 }
3472
3473 return Result;
3474}
3475
3476SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3477 SDLoc DL(Op);
3478 SDValue N0 = Op.getOperand(i: 0);
3479
3480 // Convert to target node to get known bits
3481 if (N0.getValueType() == MVT::f32)
3482 return DAG.getNode(Opcode: AMDGPUISD::FP_TO_FP16, DL, VT: Op.getValueType(), Operand: N0);
3483
3484 if (getTargetMachine().Options.UnsafeFPMath) {
3485 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3486 return SDValue();
3487 }
3488
3489 assert(N0.getSimpleValueType() == MVT::f64);
3490
3491 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3492 const unsigned ExpMask = 0x7ff;
3493 const unsigned ExpBiasf64 = 1023;
3494 const unsigned ExpBiasf16 = 15;
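  // For reference: f64 is 1 sign, 11 exponent (bias 1023) and 52 mantissa
  // bits; f16 is 1 sign, 5 exponent (bias 15) and 10 mantissa bits. UH below
  // holds the upper 32 bits of the f64 value (sign, exponent and the top 20
  // mantissa bits).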
3495 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3496 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3497 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3498 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3499 DAG.getConstant(32, DL, MVT::i64));
3500 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3501 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3502 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3503 DAG.getConstant(20, DL, MVT::i64));
3504 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3505 DAG.getConstant(ExpMask, DL, MVT::i32));
3506 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3507 // add the f16 bias (15) to get the biased exponent for the f16 format.
3508 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3509 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3510
3511 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3512 DAG.getConstant(8, DL, MVT::i32));
3513 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3514 DAG.getConstant(0xffe, DL, MVT::i32));
3515
3516 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3517 DAG.getConstant(0x1ff, DL, MVT::i32));
3518 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3519
3520 SDValue Lo40Set = DAG.getSelectCC(DL, LHS: MaskedSig, RHS: Zero, True: Zero, False: One, Cond: ISD::SETEQ);
3521 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3522
3523 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3524 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3525 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3526 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3527
3528 // N = M | (E << 12);
3529 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3530 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3531 DAG.getConstant(12, DL, MVT::i32)));
3532
3533 // B = clamp(1-E, 0, 13);
3534 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3535 One, E);
3536 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3537 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3538 DAG.getConstant(13, DL, MVT::i32));
3539
3540 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3541 DAG.getConstant(0x1000, DL, MVT::i32));
3542
3543 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3544 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3545 SDValue D1 = DAG.getSelectCC(DL, LHS: D0, RHS: SigSetHigh, True: One, False: Zero, Cond: ISD::SETNE);
3546 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3547
3548 SDValue V = DAG.getSelectCC(DL, LHS: E, RHS: One, True: D, False: N, Cond: ISD::SETLT);
3549 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3550 DAG.getConstant(0x7, DL, MVT::i32));
3551 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3552 DAG.getConstant(2, DL, MVT::i32));
3553 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3554 One, Zero, ISD::SETEQ);
3555 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3556 One, Zero, ISD::SETGT);
3557 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3558 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3559
3560 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3561 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3562 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3563 I, V, ISD::SETEQ);
3564
3565 // Extract the sign bit.
3566 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3567 DAG.getConstant(16, DL, MVT::i32));
3568 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3569 DAG.getConstant(0x8000, DL, MVT::i32));
3570
3571 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3572 return DAG.getZExtOrTrunc(Op: V, DL, VT: Op.getValueType());
3573}
3574
3575SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3576 SelectionDAG &DAG) const {
3577 SDValue Src = Op.getOperand(i: 0);
3578 unsigned OpOpcode = Op.getOpcode();
3579 EVT SrcVT = Src.getValueType();
3580 EVT DestVT = Op.getValueType();
3581
3582 // Will be selected natively
3583 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3584 return Op;
3585
3586 if (SrcVT == MVT::bf16) {
3587 SDLoc DL(Op);
3588 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3589 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DestVT, Operand: PromotedSrc);
3590 }
3591
3592 // Promote i16 to i32
3593 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3594 SDLoc DL(Op);
3595
3596 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3597 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3598 }
3599
3600 if (DestVT != MVT::i64)
3601 return Op;
3602
3603 if (SrcVT == MVT::f16 ||
3604 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3605 SDLoc DL(Op);
3606
3607 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3608 unsigned Ext =
3609 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3610 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3611 }
3612
3613 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3614 return LowerFP_TO_INT64(Op, DAG, Signed: OpOpcode == ISD::FP_TO_SINT);
3615
3616 return SDValue();
3617}
3618
3619SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3620 SelectionDAG &DAG) const {
3621 EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
3622 MVT VT = Op.getSimpleValueType();
3623 MVT ScalarVT = VT.getScalarType();
3624
3625 assert(VT.isVector());
3626
3627 SDValue Src = Op.getOperand(i: 0);
3628 SDLoc DL(Op);
3629
3630 // TODO: Don't scalarize on Evergreen?
3631 unsigned NElts = VT.getVectorNumElements();
3632 SmallVector<SDValue, 8> Args;
3633 DAG.ExtractVectorElements(Op: Src, Args, Start: 0, Count: NElts);
3634
3635 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3636 for (unsigned I = 0; I < NElts; ++I)
3637 Args[I] = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ScalarVT, N1: Args[I], N2: VTOp);
3638
3639 return DAG.getBuildVector(VT, DL, Ops: Args);
3640}
3641
3642//===----------------------------------------------------------------------===//
3643// Custom DAG optimizations
3644//===----------------------------------------------------------------------===//
3645
3646static bool isU24(SDValue Op, SelectionDAG &DAG) {
3647 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3648}
3649
3650static bool isI24(SDValue Op, SelectionDAG &DAG) {
3651 EVT VT = Op.getValueType();
3652 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3653 // as unsigned 24-bit values.
3654 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3655}
3656
3657static SDValue simplifyMul24(SDNode *Node24,
3658 TargetLowering::DAGCombinerInfo &DCI) {
3659 SelectionDAG &DAG = DCI.DAG;
3660 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3661 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3662
3663 SDValue LHS = IsIntrin ? Node24->getOperand(Num: 1) : Node24->getOperand(Num: 0);
3664 SDValue RHS = IsIntrin ? Node24->getOperand(Num: 2) : Node24->getOperand(Num: 1);
3665 unsigned NewOpcode = Node24->getOpcode();
3666 if (IsIntrin) {
3667 unsigned IID = Node24->getConstantOperandVal(Num: 0);
3668 switch (IID) {
3669 case Intrinsic::amdgcn_mul_i24:
3670 NewOpcode = AMDGPUISD::MUL_I24;
3671 break;
3672 case Intrinsic::amdgcn_mul_u24:
3673 NewOpcode = AMDGPUISD::MUL_U24;
3674 break;
3675 case Intrinsic::amdgcn_mulhi_i24:
3676 NewOpcode = AMDGPUISD::MULHI_I24;
3677 break;
3678 case Intrinsic::amdgcn_mulhi_u24:
3679 NewOpcode = AMDGPUISD::MULHI_U24;
3680 break;
3681 default:
3682 llvm_unreachable("Expected 24-bit mul intrinsic");
3683 }
3684 }
3685
3686 APInt Demanded = APInt::getLowBitsSet(numBits: LHS.getValueSizeInBits(), loBitsSet: 24);
3687
3688 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3689 // the operands to have other uses, but will only perform simplifications that
3690 // involve bypassing some nodes for this user.
3691 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(Op: LHS, DemandedBits: Demanded, DAG);
3692 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(Op: RHS, DemandedBits: Demanded, DAG);
3693 if (DemandedLHS || DemandedRHS)
3694 return DAG.getNode(Opcode: NewOpcode, DL: SDLoc(Node24), VTList: Node24->getVTList(),
3695 N1: DemandedLHS ? DemandedLHS : LHS,
3696 N2: DemandedRHS ? DemandedRHS : RHS);
3697
3698 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3699 // operands if this node is the only user.
3700 if (TLI.SimplifyDemandedBits(Op: LHS, DemandedBits: Demanded, DCI))
3701 return SDValue(Node24, 0);
3702 if (TLI.SimplifyDemandedBits(Op: RHS, DemandedBits: Demanded, DCI))
3703 return SDValue(Node24, 0);
3704
3705 return SDValue();
3706}
3707
3708template <typename IntTy>
3709static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3710 uint32_t Width, const SDLoc &DL) {
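  // For example, a signed BFE of width 8 at offset 8 from 0x0000ff00 shifts
  // left by 16 (giving 0xff000000) and arithmetic-shifts right by 24,
  // yielding -1; the unsigned variant yields 0xff.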
3711 if (Width + Offset < 32) {
3712 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3713 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3714 return DAG.getConstant(Result, DL, MVT::i32);
3715 }
3716
3717 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3718}
3719
3720static bool hasVolatileUser(SDNode *Val) {
3721 for (SDNode *U : Val->uses()) {
3722 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: U)) {
3723 if (M->isVolatile())
3724 return true;
3725 }
3726 }
3727
3728 return false;
3729}
3730
3731bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3732 // i32 vectors are the canonical memory type.
3733 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3734 return false;
3735
3736 if (!VT.isByteSized())
3737 return false;
3738
3739 unsigned Size = VT.getStoreSize();
3740
3741 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3742 return false;
3743
3744 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3745 return false;
3746
3747 return true;
3748}
3749
3750// Replace a load of an illegal type with a load of a friendlier type, plus a
3751// bitcast back to the original type.
3752SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3753 DAGCombinerInfo &DCI) const {
3754 if (!DCI.isBeforeLegalize())
3755 return SDValue();
3756
3757 LoadSDNode *LN = cast<LoadSDNode>(Val: N);
3758 if (!LN->isSimple() || !ISD::isNormalLoad(N: LN) || hasVolatileUser(Val: LN))
3759 return SDValue();
3760
3761 SDLoc SL(N);
3762 SelectionDAG &DAG = DCI.DAG;
3763 EVT VT = LN->getMemoryVT();
3764
3765 unsigned Size = VT.getStoreSize();
3766 Align Alignment = LN->getAlign();
3767 if (Alignment < Size && isTypeLegal(VT)) {
3768 unsigned IsFast;
3769 unsigned AS = LN->getAddressSpace();
3770
3771 // Expand unaligned loads earlier than legalization. Due to visitation order
3772 // problems during legalization, the emitted instructions to pack and unpack
3773 // the bytes again are not eliminated in the case of an unaligned copy.
3774 if (!allowsMisalignedMemoryAccesses(
3775 VT, AddrSpace: AS, Alignment, Flags: LN->getMemOperand()->getFlags(), &IsFast)) {
3776 if (VT.isVector())
3777 return SplitVectorLoad(Op: SDValue(LN, 0), DAG);
3778
3779 SDValue Ops[2];
3780 std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: LN, DAG);
3781
3782 return DAG.getMergeValues(Ops, dl: SDLoc(N));
3783 }
3784
3785 if (!IsFast)
3786 return SDValue();
3787 }
3788
3789 if (!shouldCombineMemoryType(VT))
3790 return SDValue();
3791
3792 EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
3793
3794 SDValue NewLoad
3795 = DAG.getLoad(VT: NewVT, dl: SL, Chain: LN->getChain(),
3796 Ptr: LN->getBasePtr(), MMO: LN->getMemOperand());
3797
3798 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewLoad);
3799 DCI.CombineTo(N, Res0: BC, Res1: NewLoad.getValue(R: 1));
3800 return SDValue(N, 0);
3801}
3802
3803// Replace store of an illegal type with a store of a bitcast to a friendlier
3804// type.
3805SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3806 DAGCombinerInfo &DCI) const {
3807 if (!DCI.isBeforeLegalize())
3808 return SDValue();
3809
3810 StoreSDNode *SN = cast<StoreSDNode>(Val: N);
3811 if (!SN->isSimple() || !ISD::isNormalStore(N: SN))
3812 return SDValue();
3813
3814 EVT VT = SN->getMemoryVT();
3815 unsigned Size = VT.getStoreSize();
3816
3817 SDLoc SL(N);
3818 SelectionDAG &DAG = DCI.DAG;
3819 Align Alignment = SN->getAlign();
3820 if (Alignment < Size && isTypeLegal(VT)) {
3821 unsigned IsFast;
3822 unsigned AS = SN->getAddressSpace();
3823
3824 // Expand unaligned stores earlier than legalization. Due to visitation
3825 // order problems during legalization, the emitted instructions to pack and
3826 // unpack the bytes again are not eliminated in the case of an unaligned
3827 // copy.
3828 if (!allowsMisalignedMemoryAccesses(
3829 VT, AddrSpace: AS, Alignment, Flags: SN->getMemOperand()->getFlags(), &IsFast)) {
3830 if (VT.isVector())
3831 return SplitVectorStore(Op: SDValue(SN, 0), DAG);
3832
3833 return expandUnalignedStore(ST: SN, DAG);
3834 }
3835
3836 if (!IsFast)
3837 return SDValue();
3838 }
3839
3840 if (!shouldCombineMemoryType(VT))
3841 return SDValue();
3842
3843 EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
3844 SDValue Val = SN->getValue();
3845
3846 //DCI.AddToWorklist(Val.getNode());
3847
3848 bool OtherUses = !Val.hasOneUse();
3849 SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Val);
3850 if (OtherUses) {
3851 SDValue CastBack = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: CastVal);
3852 DAG.ReplaceAllUsesOfValueWith(From: Val, To: CastBack);
3853 }
3854
3855 return DAG.getStore(Chain: SN->getChain(), dl: SL, Val: CastVal,
3856 Ptr: SN->getBasePtr(), MMO: SN->getMemOperand());
3857}
3858
3859// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3860// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3861// issues.
3862SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3863 DAGCombinerInfo &DCI) const {
3864 SelectionDAG &DAG = DCI.DAG;
3865 SDValue N0 = N->getOperand(Num: 0);
3866
3867 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3868 // (vt2 (truncate (assertzext vt0:x, vt1)))
3869 if (N0.getOpcode() == ISD::TRUNCATE) {
3870 SDValue N1 = N->getOperand(Num: 1);
3871 EVT ExtVT = cast<VTSDNode>(Val&: N1)->getVT();
3872 SDLoc SL(N);
3873
3874 SDValue Src = N0.getOperand(i: 0);
3875 EVT SrcVT = Src.getValueType();
3876 if (SrcVT.bitsGE(VT: ExtVT)) {
3877 SDValue NewInReg = DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: SrcVT, N1: Src, N2: N1);
3878 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: N->getValueType(ResNo: 0), Operand: NewInReg);
3879 }
3880 }
3881
3882 return SDValue();
3883}
3884
3885SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3886 SDNode *N, DAGCombinerInfo &DCI) const {
3887 unsigned IID = N->getConstantOperandVal(Num: 0);
3888 switch (IID) {
3889 case Intrinsic::amdgcn_mul_i24:
3890 case Intrinsic::amdgcn_mul_u24:
3891 case Intrinsic::amdgcn_mulhi_i24:
3892 case Intrinsic::amdgcn_mulhi_u24:
3893 return simplifyMul24(Node24: N, DCI);
3894 case Intrinsic::amdgcn_fract:
3895 case Intrinsic::amdgcn_rsq:
3896 case Intrinsic::amdgcn_rcp_legacy:
3897 case Intrinsic::amdgcn_rsq_legacy:
3898 case Intrinsic::amdgcn_rsq_clamp: {
3899 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3900 SDValue Src = N->getOperand(Num: 1);
3901 return Src.isUndef() ? Src : SDValue();
3902 }
3903 case Intrinsic::amdgcn_frexp_exp: {
3904 // frexp_exp (fneg x) -> frexp_exp x
3905 // frexp_exp (fabs x) -> frexp_exp x
3906 // frexp_exp (fneg (fabs x)) -> frexp_exp x
3907 SDValue Src = N->getOperand(Num: 1);
3908 SDValue PeekSign = peekFPSignOps(Val: Src);
3909 if (PeekSign == Src)
3910 return SDValue();
3911 return SDValue(DCI.DAG.UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: PeekSign),
3912 0);
3913 }
3914 default:
3915 return SDValue();
3916 }
3917}
3918
3919/// Split the 64-bit value \p LHS into two 32-bit halves and apply the binary
3920/// operation \p Opc to each half with the constants \p ValLo and \p ValHi.
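/// For example, with \p Opc = ISD::AND, \p ValLo = 0 and \p ValHi = 0xffff, the
/// result is the i64 bitcast of (build_vector (and lo, 0), (and hi, 0xffff)).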
3921SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3922 DAGCombinerInfo &DCI, const SDLoc &SL,
3923 unsigned Opc, SDValue LHS,
3924 uint32_t ValLo, uint32_t ValHi) const {
3925 SelectionDAG &DAG = DCI.DAG;
3926 SDValue Lo, Hi;
3927 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: LHS, DAG);
3928
3929 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3930 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3931
3932 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3933 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3934
3935 // Re-visit the ands. It's possible we eliminated one of them and it could
3936 // simplify the vector.
3937 DCI.AddToWorklist(N: Lo.getNode());
3938 DCI.AddToWorklist(N: Hi.getNode());
3939
3940 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3941 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3942}
3943
3944SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3945 DAGCombinerInfo &DCI) const {
3946 EVT VT = N->getValueType(ResNo: 0);
3947
3948 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
3949 if (!RHS)
3950 return SDValue();
3951
3952 SDValue LHS = N->getOperand(Num: 0);
3953 unsigned RHSVal = RHS->getZExtValue();
3954 if (!RHSVal)
3955 return LHS;
3956
3957 SDLoc SL(N);
3958 SelectionDAG &DAG = DCI.DAG;
3959
3960 switch (LHS->getOpcode()) {
3961 default:
3962 break;
3963 case ISD::ZERO_EXTEND:
3964 case ISD::SIGN_EXTEND:
3965 case ISD::ANY_EXTEND: {
3966 SDValue X = LHS->getOperand(Num: 0);
3967
3968 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3969 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3970 // Prefer build_vector as the canonical form if packed types are legal.
3971 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3972 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3973 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3974 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3975 }
3976
3977 // shl (ext x) => zext (shl x), if shift does not overflow int
3978 if (VT != MVT::i64)
3979 break;
3980 KnownBits Known = DAG.computeKnownBits(Op: X);
3981 unsigned LZ = Known.countMinLeadingZeros();
3982 if (LZ < RHSVal)
3983 break;
3984 EVT XVT = X.getValueType();
3985 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: XVT, N1: X, N2: SDValue(RHS, 0));
3986 return DAG.getZExtOrTrunc(Op: Shl, DL: SL, VT);
3987 }
3988 }
3989
3990 if (VT != MVT::i64)
3991 return SDValue();
3992
3993 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3994
3995 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3996 // common case, splitting this into a move and a 32-bit shift is faster and
3997 // the same code size.
3998 if (RHSVal < 32)
3999 return SDValue();
4000
4001 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
4002
4003 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4004 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
4005
4006 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4007
4008 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4009 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4010}
4011
4012SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4013 DAGCombinerInfo &DCI) const {
4014 if (N->getValueType(0) != MVT::i64)
4015 return SDValue();
4016
4017 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
4018 if (!RHS)
4019 return SDValue();
4020
4021 SelectionDAG &DAG = DCI.DAG;
4022 SDLoc SL(N);
4023 unsigned RHSVal = RHS->getZExtValue();
4024
4025 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4026 if (RHSVal == 32) {
4027 SDValue Hi = getHiHalf64(Op: N->getOperand(Num: 0), DAG);
4028 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4029 DAG.getConstant(31, SL, MVT::i32));
4030
4031 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4032 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4033 }
4034
4035 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4036 if (RHSVal == 63) {
4037 SDValue Hi = getHiHalf64(Op: N->getOperand(Num: 0), DAG);
4038 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4039 DAG.getConstant(31, SL, MVT::i32));
4040 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4041 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4042 }
4043
4044 return SDValue();
4045}
4046
4047SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4048 DAGCombinerInfo &DCI) const {
4049 auto *RHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
4050 if (!RHS)
4051 return SDValue();
4052
4053 EVT VT = N->getValueType(ResNo: 0);
4054 SDValue LHS = N->getOperand(Num: 0);
4055 unsigned ShiftAmt = RHS->getZExtValue();
4056 SelectionDAG &DAG = DCI.DAG;
4057 SDLoc SL(N);
4058
4059 // Fold (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1).
4060 // This improves the ability to match BFE patterns in isel.
4061 if (LHS.getOpcode() == ISD::AND) {
4062 if (auto *Mask = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1))) {
4063 unsigned MaskIdx, MaskLen;
4064 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4065 MaskIdx == ShiftAmt) {
4066 return DAG.getNode(
4067 Opcode: ISD::AND, DL: SL, VT,
4068 N1: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 0), N2: N->getOperand(Num: 1)),
4069 N2: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 1), N2: N->getOperand(Num: 1)));
4070 }
4071 }
4072 }
4073
4074 if (VT != MVT::i64)
4075 return SDValue();
4076
4077 if (ShiftAmt < 32)
4078 return SDValue();
4079
4080 // srl i64:x, C for C >= 32
4081 // =>
4082 // build_pair (srl hi_32(x), C - 32), 0
4083 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4084
4085 SDValue Hi = getHiHalf64(Op: LHS, DAG);
4086
4087 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
4088 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
4089
4090 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
4091
4092 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
4093}
4094
4095SDValue AMDGPUTargetLowering::performTruncateCombine(
4096 SDNode *N, DAGCombinerInfo &DCI) const {
4097 SDLoc SL(N);
4098 SelectionDAG &DAG = DCI.DAG;
4099 EVT VT = N->getValueType(ResNo: 0);
4100 SDValue Src = N->getOperand(Num: 0);
4101
4102 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4103 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4104 SDValue Vec = Src.getOperand(i: 0);
4105 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4106 SDValue Elt0 = Vec.getOperand(i: 0);
4107 EVT EltVT = Elt0.getValueType();
4108 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4109 if (EltVT.isFloatingPoint()) {
4110 Elt0 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL,
4111 VT: EltVT.changeTypeToInteger(), Operand: Elt0);
4112 }
4113
4114 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Elt0);
4115 }
4116 }
4117 }
4118
4119 // Equivalent of above for accessing the high element of a vector as an
4120 // integer operation.
4121 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4122 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4123 if (auto K = isConstOrConstSplat(N: Src.getOperand(i: 1))) {
4124 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4125 SDValue BV = stripBitcast(Val: Src.getOperand(i: 0));
4126 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4127 BV.getValueType().getVectorNumElements() == 2) {
4128 SDValue SrcElt = BV.getOperand(i: 1);
4129 EVT SrcEltVT = SrcElt.getValueType();
4130 if (SrcEltVT.isFloatingPoint()) {
4131 SrcElt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL,
4132 VT: SrcEltVT.changeTypeToInteger(), Operand: SrcElt);
4133 }
4134
4135 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: SrcElt);
4136 }
4137 }
4138 }
4139 }
4140
4141 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4142 //
4143 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4144 // i16 (trunc (srl (i32 (trunc x), K)))
4145 if (VT.getScalarSizeInBits() < 32) {
4146 EVT SrcVT = Src.getValueType();
4147 if (SrcVT.getScalarSizeInBits() > 32 &&
4148 (Src.getOpcode() == ISD::SRL ||
4149 Src.getOpcode() == ISD::SRA ||
4150 Src.getOpcode() == ISD::SHL)) {
4151 SDValue Amt = Src.getOperand(i: 1);
4152 KnownBits Known = DAG.computeKnownBits(Op: Amt);
4153
4154 // - For left shifts, do the transform as long as the shift
4155 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4156 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4157 // losing information stored in the high bits when truncating.
4158 const unsigned MaxCstSize =
4159 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4160 if (Known.getMaxValue().ule(RHS: MaxCstSize)) {
4161 EVT MidVT = VT.isVector() ?
4162 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4163 VT.getVectorNumElements()) : MVT::i32;
4164
4165 EVT NewShiftVT = getShiftAmountTy(LHSTy: MidVT, DL: DAG.getDataLayout());
4166 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MidVT,
4167 Operand: Src.getOperand(i: 0));
4168 DCI.AddToWorklist(N: Trunc.getNode());
4169
4170 if (Amt.getValueType() != NewShiftVT) {
4171 Amt = DAG.getZExtOrTrunc(Op: Amt, DL: SL, VT: NewShiftVT);
4172 DCI.AddToWorklist(N: Amt.getNode());
4173 }
4174
4175 SDValue ShrunkShift = DAG.getNode(Opcode: Src.getOpcode(), DL: SL, VT: MidVT,
4176 N1: Trunc, N2: Amt);
4177 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: ShrunkShift);
4178 }
4179 }
4180 }
4181
4182 return SDValue();
4183}
4184
4185// We need to specifically handle i64 mul here to avoid unnecessary conversion
4186// instructions. If we only match on the legalized i64 mul expansion,
4187// SimplifyDemandedBits will be unable to remove them because there will be
4188// multiple uses due to the separate mul + mulh[su].
4189static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4190 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4191 if (Size <= 32) {
4192 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4193 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4194 }
4195
4196 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4197 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4198
4199 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4200 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4201
4202 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4203}
4204
4205/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4206/// return SDValue().
4207static SDValue getAddOneOp(const SDNode *V) {
4208 if (V->getOpcode() != ISD::ADD)
4209 return SDValue();
4210
4211 return isOneConstant(V: V->getOperand(Num: 1)) ? V->getOperand(Num: 0) : SDValue();
4212}
4213
4214SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4215 DAGCombinerInfo &DCI) const {
4216 assert(N->getOpcode() == ISD::MUL);
4217 EVT VT = N->getValueType(ResNo: 0);
4218
4219 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4220 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4221 // unnecessarily). isDivergent() is used as an approximation of whether the
4222 // value is in an SGPR.
4223 if (!N->isDivergent())
4224 return SDValue();
4225
4226 unsigned Size = VT.getSizeInBits();
4227 if (VT.isVector() || Size > 64)
4228 return SDValue();
4229
4230 SelectionDAG &DAG = DCI.DAG;
4231 SDLoc DL(N);
4232
4233 SDValue N0 = N->getOperand(Num: 0);
4234 SDValue N1 = N->getOperand(Num: 1);
4235
4236 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4237 // matching.
4238
4239 // mul x, (add y, 1) -> add (mul x, y), x
4240 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4241 SDValue AddOp = getAddOneOp(V: V.getNode());
4242 if (!AddOp)
4243 return SDValue();
4244
4245 if (V.hasOneUse() || all_of(Range: V->uses(), P: [](const SDNode *U) -> bool {
4246 return U->getOpcode() == ISD::MUL;
4247 }))
4248 return AddOp;
4249
4250 return SDValue();
4251 };
4252
  // FIXME: The selection pattern is not properly checking for commuted
  // operands, so we have to place the mul on the LHS.
4255 if (SDValue MulOper = IsFoldableAdd(N0)) {
4256 SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1, N2: MulOper);
4257 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N1);
4258 }
4259
4260 if (SDValue MulOper = IsFoldableAdd(N1)) {
4261 SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: N0, N2: MulOper);
4262 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N0);
4263 }
4264
4265 // There are i16 integer mul/mad.
4266 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4267 return SDValue();
4268
4269 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4270 // in the source into any_extends if the result of the mul is truncated. Since
4271 // we can assume the high bits are whatever we want, use the underlying value
  // to prevent the unknown high bits from interfering.
4273 if (N0.getOpcode() == ISD::ANY_EXTEND)
4274 N0 = N0.getOperand(i: 0);
4275
4276 if (N1.getOpcode() == ISD::ANY_EXTEND)
4277 N1 = N1.getOperand(i: 0);
4278
4279 SDValue Mul;
4280
4281 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4282 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4283 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4284 Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: false);
4285 } else if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4286 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4287 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4288 Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: true);
4289 } else {
4290 return SDValue();
4291 }
4292
  // We need to use sext even for MUL_U24, because MUL_U24 is used
  // for signed multiplies of 8- and 16-bit types.
4295 return DAG.getSExtOrTrunc(Op: Mul, DL, VT);
4296}
4297
4298SDValue
4299AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4300 DAGCombinerInfo &DCI) const {
4301 if (N->getValueType(0) != MVT::i32)
4302 return SDValue();
4303
4304 SelectionDAG &DAG = DCI.DAG;
4305 SDLoc DL(N);
4306
4307 SDValue N0 = N->getOperand(Num: 0);
4308 SDValue N1 = N->getOperand(Num: 1);
4309
4310 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4311 // in the source into any_extends if the result of the mul is truncated. Since
4312 // we can assume the high bits are whatever we want, use the underlying value
  // to prevent the unknown high bits from interfering.
4314 if (N0.getOpcode() == ISD::ANY_EXTEND)
4315 N0 = N0.getOperand(i: 0);
4316 if (N1.getOpcode() == ISD::ANY_EXTEND)
4317 N1 = N1.getOperand(i: 0);
4318
4319 // Try to use two fast 24-bit multiplies (one for each half of the result)
4320 // instead of one slow extending multiply.
4321 unsigned LoOpcode, HiOpcode;
4322 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4323 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4324 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4325 LoOpcode = AMDGPUISD::MUL_U24;
4326 HiOpcode = AMDGPUISD::MULHI_U24;
4327 } else if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4328 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4329 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4330 LoOpcode = AMDGPUISD::MUL_I24;
4331 HiOpcode = AMDGPUISD::MULHI_I24;
4332 } else {
4333 return SDValue();
4334 }
4335
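  // MUL_x24 gives the low 32 bits of the product and MULHI_x24 the high 32
  // bits; replace both results of the [SU]MUL_LOHI node at once.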
4336 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4337 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4338 DCI.CombineTo(N, Res0: Lo, Res1: Hi);
4339 return SDValue(N, 0);
4340}
4341
4342SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4343 DAGCombinerInfo &DCI) const {
4344 EVT VT = N->getValueType(ResNo: 0);
4345
4346 if (!Subtarget->hasMulI24() || VT.isVector())
4347 return SDValue();
4348
4349 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4350 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4351 // unnecessarily). isDivergent() is used as an approximation of whether the
4352 // value is in an SGPR.
4353 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4354 // valu op anyway)
4355 if (Subtarget->hasSMulHi() && !N->isDivergent())
4356 return SDValue();
4357
4358 SelectionDAG &DAG = DCI.DAG;
4359 SDLoc DL(N);
4360
4361 SDValue N0 = N->getOperand(Num: 0);
4362 SDValue N1 = N->getOperand(Num: 1);
4363
4364 if (!isI24(Op: N0, DAG) || !isI24(Op: N1, DAG))
4365 return SDValue();
4366
4367 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4368 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4369
4370 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4371 DCI.AddToWorklist(N: Mulhi.getNode());
4372 return DAG.getSExtOrTrunc(Op: Mulhi, DL, VT);
4373}
4374
4375SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4376 DAGCombinerInfo &DCI) const {
4377 EVT VT = N->getValueType(ResNo: 0);
4378
4379 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4380 return SDValue();
4381
4382 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4383 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4384 // unnecessarily). isDivergent() is used as an approximation of whether the
4385 // value is in an SGPR.
4386 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4387 // valu op anyway)
4388 if (Subtarget->hasSMulHi() && !N->isDivergent())
4389 return SDValue();
4390
4391 SelectionDAG &DAG = DCI.DAG;
4392 SDLoc DL(N);
4393
4394 SDValue N0 = N->getOperand(Num: 0);
4395 SDValue N1 = N->getOperand(Num: 1);
4396
4397 if (!isU24(Op: N0, DAG) || !isU24(Op: N1, DAG))
4398 return SDValue();
4399
4400 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4401 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4402
4403 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4404 DCI.AddToWorklist(N: Mulhi.getNode());
4405 return DAG.getZExtOrTrunc(Op: Mulhi, DL, VT);
4406}
4407
4408SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4409 SDValue Op,
4410 const SDLoc &DL,
4411 unsigned Opc) const {
4412 EVT VT = Op.getValueType();
4413 EVT LegalVT = getTypeToTransformTo(Context&: *DAG.getContext(), VT);
4414 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4415 LegalVT != MVT::i16))
4416 return SDValue();
4417
4418 if (VT != MVT::i32)
4419 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4420
4421 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4422 if (VT != MVT::i32)
4423 FFBX = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: FFBX);
4424
4425 return FFBX;
4426}
4427
4428// The native instructions return -1 on 0 input. Optimize out a select that
4429// produces -1 on 0.
4430//
4431// TODO: If zero is not undef, we could also do this if the output is compared
4432// against the bitwidth.
4433//
4434// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4435SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4436 SDValue LHS, SDValue RHS,
4437 DAGCombinerInfo &DCI) const {
4438 if (!isNullConstant(V: Cond.getOperand(i: 1)))
4439 return SDValue();
4440
4441 SelectionDAG &DAG = DCI.DAG;
4442 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
4443 SDValue CmpLHS = Cond.getOperand(i: 0);
4444
4445 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4446 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4447 if (CCOpcode == ISD::SETEQ &&
4448 (isCtlzOpc(Opc: RHS.getOpcode()) || isCttzOpc(Opc: RHS.getOpcode())) &&
4449 RHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: LHS)) {
4450 unsigned Opc =
4451 isCttzOpc(Opc: RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4452 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4453 }
4454
4455 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4456 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4457 if (CCOpcode == ISD::SETNE &&
4458 (isCtlzOpc(Opc: LHS.getOpcode()) || isCttzOpc(Opc: LHS.getOpcode())) &&
4459 LHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: RHS)) {
4460 unsigned Opc =
4461 isCttzOpc(Opc: LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4462
4463 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4464 }
4465
4466 return SDValue();
4467}
4468
4469static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4470 unsigned Op,
4471 const SDLoc &SL,
4472 SDValue Cond,
4473 SDValue N1,
4474 SDValue N2) {
4475 SelectionDAG &DAG = DCI.DAG;
4476 EVT VT = N1.getValueType();
4477
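  // N1 and N2 are both the same unary FP op (fneg or fabs, checked by the
  // caller); select between their sources and re-apply the op on the result.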
4478 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cond,
4479 N2: N1.getOperand(i: 0), N3: N2.getOperand(i: 0));
4480 DCI.AddToWorklist(N: NewSelect.getNode());
4481 return DAG.getNode(Opcode: Op, DL: SL, VT, Operand: NewSelect);
4482}
4483
4484// Pull a free FP operation out of a select so it may fold into uses.
4485//
4486// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4487// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4488//
4489// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4490// select c, (fabs x), +k -> fabs (select c, x, k)
4491SDValue
4492AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4493 SDValue N) const {
4494 SelectionDAG &DAG = DCI.DAG;
4495 SDValue Cond = N.getOperand(i: 0);
4496 SDValue LHS = N.getOperand(i: 1);
4497 SDValue RHS = N.getOperand(i: 2);
4498
4499 EVT VT = N.getValueType();
4500 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4501 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4502 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
4503 return SDValue();
4504
4505 return distributeOpThroughSelect(DCI, Op: LHS.getOpcode(),
4506 SL: SDLoc(N), Cond, N1: LHS, N2: RHS);
4507 }
4508
4509 bool Inv = false;
4510 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4511 std::swap(a&: LHS, b&: RHS);
4512 Inv = true;
4513 }
4514
4515 // TODO: Support vector constants.
4516 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
4517 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4518 !selectSupportsSourceMods(N: N.getNode())) {
4519 SDLoc SL(N);
4520 // If one side is an fneg/fabs and the other is a constant, we can push the
4521 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4522 SDValue NewLHS = LHS.getOperand(i: 0);
4523 SDValue NewRHS = RHS;
4524
4525 // Careful: if the neg can be folded up, don't try to pull it back down.
4526 bool ShouldFoldNeg = true;
4527
4528 if (NewLHS.hasOneUse()) {
4529 unsigned Opc = NewLHS.getOpcode();
4530 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(N: NewLHS.getNode()))
4531 ShouldFoldNeg = false;
4532 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4533 ShouldFoldNeg = false;
4534 }
4535
4536 if (ShouldFoldNeg) {
4537 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4538 return SDValue();
4539
      // We're going to be forced to use a source modifier anyway, so there's
      // no point in pulling the negate out unless we can get a size reduction
      // by negating the constant.
4543 //
4544 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4545 // about cheaper constants.
4546 if (NewLHS.getOpcode() == ISD::FABS &&
4547 getConstantNegateCost(C: CRHS) != NegatibleCost::Cheaper)
4548 return SDValue();
4549
4550 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
4551 return SDValue();
4552
4553 if (LHS.getOpcode() == ISD::FNEG)
4554 NewRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4555
4556 if (Inv)
4557 std::swap(a&: NewLHS, b&: NewRHS);
4558
4559 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT,
4560 N1: Cond, N2: NewLHS, N3: NewRHS);
4561 DCI.AddToWorklist(N: NewSelect.getNode());
4562 return DAG.getNode(Opcode: LHS.getOpcode(), DL: SL, VT, Operand: NewSelect);
4563 }
4564 }
4565
4566 return SDValue();
4567}
4568
4569SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4570 DAGCombinerInfo &DCI) const {
4571 if (SDValue Folded = foldFreeOpFromSelect(DCI, N: SDValue(N, 0)))
4572 return Folded;
4573
4574 SDValue Cond = N->getOperand(Num: 0);
4575 if (Cond.getOpcode() != ISD::SETCC)
4576 return SDValue();
4577
4578 EVT VT = N->getValueType(ResNo: 0);
4579 SDValue LHS = Cond.getOperand(i: 0);
4580 SDValue RHS = Cond.getOperand(i: 1);
4581 SDValue CC = Cond.getOperand(i: 2);
4582
4583 SDValue True = N->getOperand(Num: 1);
4584 SDValue False = N->getOperand(Num: 2);
4585
4586 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4587 SelectionDAG &DAG = DCI.DAG;
4588 if (DAG.isConstantValueOfAnyType(N: True) &&
4589 !DAG.isConstantValueOfAnyType(N: False)) {
4590 // Swap cmp + select pair to move constant to false input.
4591 // This will allow using VOPC cndmasks more often.
4592 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4593
4594 SDLoc SL(N);
4595 ISD::CondCode NewCC =
4596 getSetCCInverse(Operation: cast<CondCodeSDNode>(Val&: CC)->get(), Type: LHS.getValueType());
4597
4598 SDValue NewCond = DAG.getSetCC(DL: SL, VT: Cond.getValueType(), LHS, RHS, Cond: NewCC);
4599 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NewCond, N2: False, N3: True);
4600 }
4601
4602 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4603 SDValue MinMax
4604 = combineFMinMaxLegacy(DL: SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4605 // Revisit this node so we can catch min3/max3/med3 patterns.
4606 //DCI.AddToWorklist(MinMax.getNode());
4607 return MinMax;
4608 }
4609 }
4610
4611 // There's no reason to not do this if the condition has other uses.
4612 return performCtlz_CttzCombine(SL: SDLoc(N), Cond, LHS: True, RHS: False, DCI);
4613}
4614
4615static bool isInv2Pi(const APFloat &APF) {
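  // Bit patterns of 1/(2*pi) in half, single, and double precision.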
4616 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4617 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4618 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4619
4620 return APF.bitwiseIsEqual(RHS: KF16) ||
4621 APF.bitwiseIsEqual(RHS: KF32) ||
4622 APF.bitwiseIsEqual(RHS: KF64);
4623}
4624
// 0 and 1.0 / (2.0 * pi) do not have inline immediates, so there is an
// additional cost to negate them.
4627TargetLowering::NegatibleCost
4628AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4629 if (C->isZero())
4630 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4631
4632 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(APF: C->getValueAPF()))
4633 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4634
4635 return NegatibleCost::Neutral;
4636}
4637
4638bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4639 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4640 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4641 return false;
4642}
4643
4644bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4645 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4646 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4647 return false;
4648}
4649
4650static unsigned inverseMinMax(unsigned Opc) {
4651 switch (Opc) {
4652 case ISD::FMAXNUM:
4653 return ISD::FMINNUM;
4654 case ISD::FMINNUM:
4655 return ISD::FMAXNUM;
4656 case ISD::FMAXNUM_IEEE:
4657 return ISD::FMINNUM_IEEE;
4658 case ISD::FMINNUM_IEEE:
4659 return ISD::FMAXNUM_IEEE;
4660 case ISD::FMAXIMUM:
4661 return ISD::FMINIMUM;
4662 case ISD::FMINIMUM:
4663 return ISD::FMAXIMUM;
4664 case AMDGPUISD::FMAX_LEGACY:
4665 return AMDGPUISD::FMIN_LEGACY;
4666 case AMDGPUISD::FMIN_LEGACY:
4667 return AMDGPUISD::FMAX_LEGACY;
4668 default:
4669 llvm_unreachable("invalid min/max opcode");
4670 }
4671}
4672
4673/// \return true if it's profitable to try to push an fneg into its source
4674/// instruction.
4675bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4676 // If the input has multiple uses and we can either fold the negate down, or
4677 // the other uses cannot, give up. This both prevents unprofitable
4678 // transformations and infinite loops: we won't repeatedly try to fold around
4679 // a negate that has no 'good' form.
4680 if (N0.hasOneUse()) {
4681 // This may be able to fold into the source, but at a code size cost. Don't
4682 // fold if the fold into the user is free.
4683 if (allUsesHaveSourceMods(N, CostThreshold: 0))
4684 return false;
4685 } else {
4686 if (fnegFoldsIntoOp(N: N0.getNode()) &&
4687 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N: N0.getNode())))
4688 return false;
4689 }
4690
4691 return true;
4692}
4693
4694SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4695 DAGCombinerInfo &DCI) const {
4696 SelectionDAG &DAG = DCI.DAG;
4697 SDValue N0 = N->getOperand(Num: 0);
4698 EVT VT = N->getValueType(ResNo: 0);
4699
4700 unsigned Opc = N0.getOpcode();
4701
4702 if (!shouldFoldFNegIntoSrc(N, N0))
4703 return SDValue();
4704
4705 SDLoc SL(N);
4706 switch (Opc) {
4707 case ISD::FADD: {
4708 if (!mayIgnoreSignedZero(Op: N0))
4709 return SDValue();
4710
4711 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4712 SDValue LHS = N0.getOperand(i: 0);
4713 SDValue RHS = N0.getOperand(i: 1);
4714
4715 if (LHS.getOpcode() != ISD::FNEG)
4716 LHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
4717 else
4718 LHS = LHS.getOperand(i: 0);
4719
4720 if (RHS.getOpcode() != ISD::FNEG)
4721 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4722 else
4723 RHS = RHS.getOperand(i: 0);
4724
4725 SDValue Res = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
4726 if (Res.getOpcode() != ISD::FADD)
4727 return SDValue(); // Op got folded away.
4728 if (!N0.hasOneUse())
4729 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
4730 return Res;
4731 }
4732 case ISD::FMUL:
4733 case AMDGPUISD::FMUL_LEGACY: {
4734 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4735 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4736 SDValue LHS = N0.getOperand(i: 0);
4737 SDValue RHS = N0.getOperand(i: 1);
4738
4739 if (LHS.getOpcode() == ISD::FNEG)
4740 LHS = LHS.getOperand(i: 0);
4741 else if (RHS.getOpcode() == ISD::FNEG)
4742 RHS = RHS.getOperand(i: 0);
4743 else
4744 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4745
4746 SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
4747 if (Res.getOpcode() != Opc)
4748 return SDValue(); // Op got folded away.
4749 if (!N0.hasOneUse())
4750 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
4751 return Res;
4752 }
4753 case ISD::FMA:
4754 case ISD::FMAD: {
4755 // TODO: handle llvm.amdgcn.fma.legacy
4756 if (!mayIgnoreSignedZero(Op: N0))
4757 return SDValue();
4758
4759 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4760 SDValue LHS = N0.getOperand(i: 0);
4761 SDValue MHS = N0.getOperand(i: 1);
4762 SDValue RHS = N0.getOperand(i: 2);
4763
4764 if (LHS.getOpcode() == ISD::FNEG)
4765 LHS = LHS.getOperand(i: 0);
4766 else if (MHS.getOpcode() == ISD::FNEG)
4767 MHS = MHS.getOperand(i: 0);
4768 else
4769 MHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: MHS);
4770
4771 if (RHS.getOpcode() != ISD::FNEG)
4772 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4773 else
4774 RHS = RHS.getOperand(i: 0);
4775
4776 SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: MHS, N3: RHS);
4777 if (Res.getOpcode() != Opc)
4778 return SDValue(); // Op got folded away.
4779 if (!N0.hasOneUse())
4780 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
4781 return Res;
4782 }
4783 case ISD::FMAXNUM:
4784 case ISD::FMINNUM:
4785 case ISD::FMAXNUM_IEEE:
4786 case ISD::FMINNUM_IEEE:
4787 case ISD::FMINIMUM:
4788 case ISD::FMAXIMUM:
4789 case AMDGPUISD::FMAX_LEGACY:
4790 case AMDGPUISD::FMIN_LEGACY: {
4791 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4792 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4793 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4794 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4795
4796 SDValue LHS = N0.getOperand(i: 0);
4797 SDValue RHS = N0.getOperand(i: 1);
4798
4799 // 0 doesn't have a negated inline immediate.
4800 // TODO: This constant check should be generalized to other operations.
4801 if (isConstantCostlierToNegate(N: RHS))
4802 return SDValue();
4803
4804 SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
4805 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4806 unsigned Opposite = inverseMinMax(Opc);
4807
4808 SDValue Res = DAG.getNode(Opcode: Opposite, DL: SL, VT, N1: NegLHS, N2: NegRHS, Flags: N0->getFlags());
4809 if (Res.getOpcode() != Opposite)
4810 return SDValue(); // Op got folded away.
4811 if (!N0.hasOneUse())
4812 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
4813 return Res;
4814 }
4815 case AMDGPUISD::FMED3: {
4816 SDValue Ops[3];
4817 for (unsigned I = 0; I < 3; ++I)
4818 Ops[I] = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: N0->getOperand(Num: I), Flags: N0->getFlags());
4819
4820 SDValue Res = DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT, Ops, Flags: N0->getFlags());
4821 if (Res.getOpcode() != AMDGPUISD::FMED3)
4822 return SDValue(); // Op got folded away.
4823
4824 if (!N0.hasOneUse()) {
4825 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res);
4826 DAG.ReplaceAllUsesWith(From: N0, To: Neg);
4827
4828 for (SDNode *U : Neg->uses())
4829 DCI.AddToWorklist(N: U);
4830 }
4831
4832 return Res;
4833 }
4834 case ISD::FP_EXTEND:
4835 case ISD::FTRUNC:
4836 case ISD::FRINT:
4837 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4838 case ISD::FROUNDEVEN:
4839 case ISD::FSIN:
4840 case ISD::FCANONICALIZE:
4841 case AMDGPUISD::RCP:
4842 case AMDGPUISD::RCP_LEGACY:
4843 case AMDGPUISD::RCP_IFLAG:
4844 case AMDGPUISD::SIN_HW: {
4845 SDValue CvtSrc = N0.getOperand(i: 0);
4846 if (CvtSrc.getOpcode() == ISD::FNEG) {
4847 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4848 // (fneg (rcp (fneg x))) -> (rcp x)
4849 return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: CvtSrc.getOperand(i: 0));
4850 }
4851
4852 if (!N0.hasOneUse())
4853 return SDValue();
4854
4855 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4856 // (fneg (rcp x)) -> (rcp (fneg x))
4857 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
4858 return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: Neg, Flags: N0->getFlags());
4859 }
4860 case ISD::FP_ROUND: {
4861 SDValue CvtSrc = N0.getOperand(i: 0);
4862
4863 if (CvtSrc.getOpcode() == ISD::FNEG) {
4864 // (fneg (fp_round (fneg x))) -> (fp_round x)
4865 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT,
4866 N1: CvtSrc.getOperand(i: 0), N2: N0.getOperand(i: 1));
4867 }
4868
4869 if (!N0.hasOneUse())
4870 return SDValue();
4871
4872 // (fneg (fp_round x)) -> (fp_round (fneg x))
4873 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
4874 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Neg, N2: N0.getOperand(i: 1));
4875 }
4876 case ISD::FP16_TO_FP: {
4877 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4878 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4879 // Put the fneg back as a legal source operation that can be matched later.
4880 SDLoc SL(N);
4881
4882 SDValue Src = N0.getOperand(i: 0);
4883 EVT SrcVT = Src.getValueType();
4884
4885 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4886 SDValue IntFNeg = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: SrcVT, N1: Src,
4887 N2: DAG.getConstant(Val: 0x8000, DL: SL, VT: SrcVT));
4888 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFNeg);
4889 }
4890 case ISD::SELECT: {
4891 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4892 // TODO: Invert conditions of foldFreeOpFromSelect
4893 return SDValue();
4894 }
4895 case ISD::BITCAST: {
4896 SDLoc SL(N);
4897 SDValue BCSrc = N0.getOperand(i: 0);
4898 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4899 SDValue HighBits = BCSrc.getOperand(i: BCSrc.getNumOperands() - 1);
4900 if (HighBits.getValueType().getSizeInBits() != 32 ||
4901 !fnegFoldsIntoOp(N: HighBits.getNode()))
4902 return SDValue();
4903
      // f64 fneg only really needs to operate on the high half of the
      // register, so try to force it to an f32 operation to help make use of
      // source modifiers.
      //
4909 // fneg (f64 (bitcast (build_vector x, y))) ->
4910 // f64 (bitcast (build_vector (bitcast i32:x to f32),
4911 // (fneg (bitcast i32:y to f32)))
4912
4913 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4914 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4915 SDValue CastBack =
4916 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HighBits.getValueType(), Operand: NegHi);
4917
4918 SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
4919 Ops.back() = CastBack;
4920 DCI.AddToWorklist(N: NegHi.getNode());
4921 SDValue Build =
4922 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: BCSrc.getValueType(), Ops);
4923 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Build);
4924
4925 if (!N0.hasOneUse())
4926 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Result));
4927 return Result;
4928 }
4929
4930 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4931 BCSrc.hasOneUse()) {
4932 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4933 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
4934
4935 // TODO: Cast back result for multiple uses is beneficial in some cases.
4936
4937 SDValue LHS =
4938 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4939 SDValue RHS =
4940 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4941
4942 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4943 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4944
4945 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4946 NegRHS);
4947 }
4948
4949 return SDValue();
4950 }
4951 default:
4952 return SDValue();
4953 }
4954}
4955
4956SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4957 DAGCombinerInfo &DCI) const {
4958 SelectionDAG &DAG = DCI.DAG;
4959 SDValue N0 = N->getOperand(Num: 0);
4960
4961 if (!N0.hasOneUse())
4962 return SDValue();
4963
4964 switch (N0.getOpcode()) {
4965 case ISD::FP16_TO_FP: {
4966 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
4967 SDLoc SL(N);
4968 SDValue Src = N0.getOperand(i: 0);
4969 EVT SrcVT = Src.getValueType();
4970
4971 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
4972 SDValue IntFAbs = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SrcVT, N1: Src,
4973 N2: DAG.getConstant(Val: 0x7fff, DL: SL, VT: SrcVT));
4974 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFAbs);
4975 }
4976 default:
4977 return SDValue();
4978 }
4979}
4980
4981SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
4982 DAGCombinerInfo &DCI) const {
4983 const auto *CFP = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
4984 if (!CFP)
4985 return SDValue();
4986
4987 // XXX - Should this flush denormals?
4988 const APFloat &Val = CFP->getValueAPF();
4989 APFloat One(Val.getSemantics(), "1.0");
4990 return DCI.DAG.getConstantFP(Val: One / Val, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
4991}
4992
4993SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
4994 DAGCombinerInfo &DCI) const {
4995 SelectionDAG &DAG = DCI.DAG;
4996 SDLoc DL(N);
4997
4998 switch(N->getOpcode()) {
4999 default:
5000 break;
5001 case ISD::BITCAST: {
5002 EVT DestVT = N->getValueType(ResNo: 0);
5003
5004 // Push casts through vector builds. This helps avoid emitting a large
5005 // number of copies when materializing floating point vector constants.
5006 //
5007 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5009 if (DestVT.isVector()) {
5010 SDValue Src = N->getOperand(Num: 0);
5011 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5012 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5013 isOperationLegal(Op: ISD::BUILD_VECTOR, VT: DestVT))) {
5014 EVT SrcVT = Src.getValueType();
5015 unsigned NElts = DestVT.getVectorNumElements();
5016
5017 if (SrcVT.getVectorNumElements() == NElts) {
5018 EVT DestEltVT = DestVT.getVectorElementType();
5019
5020 SmallVector<SDValue, 8> CastedElts;
5021 SDLoc SL(N);
5022 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5023 SDValue Elt = Src.getOperand(i: I);
5024 CastedElts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DestEltVT, Operand: Elt));
5025 }
5026
5027 return DAG.getBuildVector(VT: DestVT, DL: SL, Ops: CastedElts);
5028 }
5029 }
5030 }
5031
5032 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5033 break;
5034
5035 // Fold bitcasts of constants.
5036 //
5037 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5038 // TODO: Generalize and move to DAGCombiner
5039 SDValue Src = N->getOperand(Num: 0);
5040 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Src)) {
5041 SDLoc SL(N);
5042 uint64_t CVal = C->getZExtValue();
5043 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5044 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5045 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5046 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: BV);
5047 }
5048
5049 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
5050 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5051 SDLoc SL(N);
5052 uint64_t CVal = Val.getZExtValue();
5053 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5054 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5055 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5056
5057 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: Vec);
5058 }
5059
5060 break;
5061 }
5062 case ISD::SHL: {
5063 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5064 break;
5065
5066 return performShlCombine(N, DCI);
5067 }
5068 case ISD::SRL: {
5069 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5070 break;
5071
5072 return performSrlCombine(N, DCI);
5073 }
5074 case ISD::SRA: {
5075 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5076 break;
5077
5078 return performSraCombine(N, DCI);
5079 }
5080 case ISD::TRUNCATE:
5081 return performTruncateCombine(N, DCI);
5082 case ISD::MUL:
5083 return performMulCombine(N, DCI);
5084 case AMDGPUISD::MUL_U24:
5085 case AMDGPUISD::MUL_I24: {
5086 if (SDValue Simplified = simplifyMul24(Node24: N, DCI))
5087 return Simplified;
5088 break;
5089 }
5090 case AMDGPUISD::MULHI_I24:
5091 case AMDGPUISD::MULHI_U24:
5092 return simplifyMul24(Node24: N, DCI);
5093 case ISD::SMUL_LOHI:
5094 case ISD::UMUL_LOHI:
5095 return performMulLoHiCombine(N, DCI);
5096 case ISD::MULHS:
5097 return performMulhsCombine(N, DCI);
5098 case ISD::MULHU:
5099 return performMulhuCombine(N, DCI);
5100 case ISD::SELECT:
5101 return performSelectCombine(N, DCI);
5102 case ISD::FNEG:
5103 return performFNegCombine(N, DCI);
5104 case ISD::FABS:
5105 return performFAbsCombine(N, DCI);
5106 case AMDGPUISD::BFE_I32:
5107 case AMDGPUISD::BFE_U32: {
5108 assert(!N->getValueType(0).isVector() &&
5109 "Vector handling of BFE not implemented");
5110 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2));
5111 if (!Width)
5112 break;
5113
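    // The hardware only honors the low 5 bits of the width and offset
    // operands.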
5114 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5115 if (WidthVal == 0)
5116 return DAG.getConstant(0, DL, MVT::i32);
5117
5118 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
5119 if (!Offset)
5120 break;
5121
5122 SDValue BitsFrom = N->getOperand(Num: 0);
5123 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5124
5125 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5126
5127 if (OffsetVal == 0) {
5128 // This is already sign / zero extended, so try to fold away extra BFEs.
5129 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5130
5131 unsigned OpSignBits = DAG.ComputeNumSignBits(Op: BitsFrom);
5132 if (OpSignBits >= SignBits)
5133 return BitsFrom;
5134
5135 EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WidthVal);
5136 if (Signed) {
5137 // This is a sign_extend_inreg. Replace it to take advantage of existing
5138 // DAG Combines. If not eliminated, we will match back to BFE during
5139 // selection.
5140
        // TODO: The sext_inreg of extended types ends up as multiple
        // operations, although we could handle them in a single BFE.
5143 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5144 DAG.getValueType(SmallVT));
5145 }
5146
5147 return DAG.getZeroExtendInReg(Op: BitsFrom, DL, VT: SmallVT);
5148 }
5149
5150 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(Val&: BitsFrom)) {
5151 if (Signed) {
5152 return constantFoldBFE<int32_t>(DAG,
5153 Src0: CVal->getSExtValue(),
5154 Offset: OffsetVal,
5155 Width: WidthVal,
5156 DL);
5157 }
5158
5159 return constantFoldBFE<uint32_t>(DAG,
5160 Src0: CVal->getZExtValue(),
5161 Offset: OffsetVal,
5162 Width: WidthVal,
5163 DL);
5164 }
5165
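    // A field that reaches bit 31 reduces to a single shift; the 16-bit
    // high-half extract is kept as a BFE when SDWA is available.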
5166 if ((OffsetVal + WidthVal) >= 32 &&
5167 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5168 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5169 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5170 BitsFrom, ShiftVal);
5171 }
5172
5173 if (BitsFrom.hasOneUse()) {
5174 APInt Demanded = APInt::getBitsSet(numBits: 32,
5175 loBit: OffsetVal,
5176 hiBit: OffsetVal + WidthVal);
5177
5178 KnownBits Known;
5179 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5180 !DCI.isBeforeLegalizeOps());
5181 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5182 if (TLI.ShrinkDemandedConstant(Op: BitsFrom, DemandedBits: Demanded, TLO) ||
5183 TLI.SimplifyDemandedBits(Op: BitsFrom, DemandedBits: Demanded, Known, TLO)) {
5184 DCI.CommitTargetLoweringOpt(TLO);
5185 }
5186 }
5187
5188 break;
5189 }
5190 case ISD::LOAD:
5191 return performLoadCombine(N, DCI);
5192 case ISD::STORE:
5193 return performStoreCombine(N, DCI);
5194 case AMDGPUISD::RCP:
5195 case AMDGPUISD::RCP_IFLAG:
5196 return performRcpCombine(N, DCI);
5197 case ISD::AssertZext:
5198 case ISD::AssertSext:
5199 return performAssertSZExtCombine(N, DCI);
5200 case ISD::INTRINSIC_WO_CHAIN:
5201 return performIntrinsicWOChainCombine(N, DCI);
5202 case AMDGPUISD::FMAD_FTZ: {
5203 SDValue N0 = N->getOperand(Num: 0);
5204 SDValue N1 = N->getOperand(Num: 1);
5205 SDValue N2 = N->getOperand(Num: 2);
5206 EVT VT = N->getValueType(ResNo: 0);
5207
    // FMAD_FTZ is an FMAD + flush denormals to zero.
5209 // We flush the inputs, the intermediate step, and the output.
5210 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Val&: N0);
5211 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(Val&: N1);
5212 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(Val&: N2);
5213 if (N0CFP && N1CFP && N2CFP) {
5214 const auto FTZ = [](const APFloat &V) {
5215 if (V.isDenormal()) {
5216 APFloat Zero(V.getSemantics(), 0);
5217 return V.isNegative() ? -Zero : Zero;
5218 }
5219 return V;
5220 };
5221
5222 APFloat V0 = FTZ(N0CFP->getValueAPF());
5223 APFloat V1 = FTZ(N1CFP->getValueAPF());
5224 APFloat V2 = FTZ(N2CFP->getValueAPF());
5225 V0.multiply(RHS: V1, RM: APFloat::rmNearestTiesToEven);
5226 V0 = FTZ(V0);
5227 V0.add(RHS: V2, RM: APFloat::rmNearestTiesToEven);
5228 return DAG.getConstantFP(Val: FTZ(V0), DL, VT);
5229 }
5230 break;
5231 }
5232 }
5233 return SDValue();
5234}
5235
5236//===----------------------------------------------------------------------===//
5237// Helper functions
5238//===----------------------------------------------------------------------===//
5239
5240SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5241 const TargetRegisterClass *RC,
5242 Register Reg, EVT VT,
5243 const SDLoc &SL,
5244 bool RawReg) const {
5245 MachineFunction &MF = DAG.getMachineFunction();
5246 MachineRegisterInfo &MRI = MF.getRegInfo();
5247 Register VReg;
5248
5249 if (!MRI.isLiveIn(Reg)) {
5250 VReg = MRI.createVirtualRegister(RegClass: RC);
5251 MRI.addLiveIn(Reg, vreg: VReg);
5252 } else {
5253 VReg = MRI.getLiveInVirtReg(PReg: Reg);
5254 }
5255
5256 if (RawReg)
5257 return DAG.getRegister(Reg: VReg, VT);
5258
5259 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: VReg, VT);
5260}
5261
5262// This may be called multiple times, and nothing prevents creating multiple
5263// objects at the same offset. See if we already defined this object.
5264static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5265 int64_t Offset) {
5266 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5267 if (MFI.getObjectOffset(ObjectIdx: I) == Offset) {
5268 assert(MFI.getObjectSize(I) == Size);
5269 return I;
5270 }
5271 }
5272
5273 return MFI.CreateFixedObject(Size, SPOffset: Offset, IsImmutable: true);
5274}
5275
5276SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5277 EVT VT,
5278 const SDLoc &SL,
5279 int64_t Offset) const {
5280 MachineFunction &MF = DAG.getMachineFunction();
5281 MachineFrameInfo &MFI = MF.getFrameInfo();
5282 int FI = getOrCreateFixedStackObject(MFI, Size: VT.getStoreSize(), Offset);
5283
5284 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5285 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5286
5287 return DAG.getLoad(VT, dl: SL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: SrcPtrInfo, Alignment: Align(4),
5288 MMOFlags: MachineMemOperand::MODereferenceable |
5289 MachineMemOperand::MOInvariant);
5290}
5291
5292SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5293 const SDLoc &SL,
5294 SDValue Chain,
5295 SDValue ArgVal,
5296 int64_t Offset) const {
5297 MachineFunction &MF = DAG.getMachineFunction();
5298 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5299 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5300
5301 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5302 // Stores to the argument stack area are relative to the stack pointer.
5303 SDValue SP =
5304 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5305 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5306 SDValue Store = DAG.getStore(Chain, dl: SL, Val: ArgVal, Ptr, PtrInfo: DstInfo, Alignment: Align(4),
5307 MMOFlags: MachineMemOperand::MODereferenceable);
5308 return Store;
5309}
5310
5311SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5312 const TargetRegisterClass *RC,
5313 EVT VT, const SDLoc &SL,
5314 const ArgDescriptor &Arg) const {
5315 assert(Arg && "Attempting to load missing argument");
5316
5317 SDValue V = Arg.isRegister() ?
5318 CreateLiveInRegister(DAG, RC, Reg: Arg.getRegister(), VT, SL) :
5319 loadStackInputValue(DAG, VT, SL, Offset: Arg.getStackOffset());
5320
5321 if (!Arg.isMasked())
5322 return V;
5323
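  // Extract the bit-field described by the mask: shift it down to bit 0 and
  // clear the bits above it.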
5324 unsigned Mask = Arg.getMask();
5325 unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
5326 V = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: V,
5327 N2: DAG.getShiftAmountConstant(Val: Shift, VT, DL: SL));
5328 return DAG.getNode(Opcode: ISD::AND, DL: SL, VT, N1: V,
5329 N2: DAG.getConstant(Val: Mask >> Shift, DL: SL, VT));
5330}
5331
5332uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5333 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5334 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5335 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5336 uint64_t ArgOffset =
5337 alignTo(Size: ExplicitKernArgSize, A: Alignment) + ExplicitArgOffset;
5338 switch (Param) {
5339 case FIRST_IMPLICIT:
5340 return ArgOffset;
5341 case PRIVATE_BASE:
5342 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5343 case SHARED_BASE:
5344 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5345 case QUEUE_PTR:
5346 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5347 }
5348 llvm_unreachable("unexpected implicit parameter type");
5349}
5350
5351uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5352 const MachineFunction &MF, const ImplicitParameter Param) const {
5353 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5354 return getImplicitParameterOffset(ExplicitKernArgSize: MFI->getExplicitKernArgSize(), Param);
5355}
5356
5357#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5358
5359const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5360 switch ((AMDGPUISD::NodeType)Opcode) {
5361 case AMDGPUISD::FIRST_NUMBER: break;
5362 // AMDIL DAG nodes
5363 NODE_NAME_CASE(UMUL);
5364 NODE_NAME_CASE(BRANCH_COND);
5365
5366 // AMDGPU DAG nodes
5367 NODE_NAME_CASE(IF)
5368 NODE_NAME_CASE(ELSE)
5369 NODE_NAME_CASE(LOOP)
5370 NODE_NAME_CASE(CALL)
5371 NODE_NAME_CASE(TC_RETURN)
5372 NODE_NAME_CASE(TC_RETURN_GFX)
5373 NODE_NAME_CASE(TC_RETURN_CHAIN)
5374 NODE_NAME_CASE(TRAP)
5375 NODE_NAME_CASE(RET_GLUE)
5376 NODE_NAME_CASE(WAVE_ADDRESS)
5377 NODE_NAME_CASE(RETURN_TO_EPILOG)
5378 NODE_NAME_CASE(ENDPGM)
5379 NODE_NAME_CASE(ENDPGM_TRAP)
5380 NODE_NAME_CASE(SIMULATED_TRAP)
5381 NODE_NAME_CASE(DWORDADDR)
5382 NODE_NAME_CASE(FRACT)
5383 NODE_NAME_CASE(SETCC)
5384 NODE_NAME_CASE(SETREG)
5385 NODE_NAME_CASE(DENORM_MODE)
5386 NODE_NAME_CASE(FMA_W_CHAIN)
5387 NODE_NAME_CASE(FMUL_W_CHAIN)
5388 NODE_NAME_CASE(CLAMP)
5389 NODE_NAME_CASE(COS_HW)
5390 NODE_NAME_CASE(SIN_HW)
5391 NODE_NAME_CASE(FMAX_LEGACY)
5392 NODE_NAME_CASE(FMIN_LEGACY)
5393 NODE_NAME_CASE(FMAX3)
5394 NODE_NAME_CASE(SMAX3)
5395 NODE_NAME_CASE(UMAX3)
5396 NODE_NAME_CASE(FMIN3)
5397 NODE_NAME_CASE(SMIN3)
5398 NODE_NAME_CASE(UMIN3)
5399 NODE_NAME_CASE(FMED3)
5400 NODE_NAME_CASE(SMED3)
5401 NODE_NAME_CASE(UMED3)
5402 NODE_NAME_CASE(FMAXIMUM3)
5403 NODE_NAME_CASE(FMINIMUM3)
5404 NODE_NAME_CASE(FDOT2)
5405 NODE_NAME_CASE(URECIP)
5406 NODE_NAME_CASE(DIV_SCALE)
5407 NODE_NAME_CASE(DIV_FMAS)
5408 NODE_NAME_CASE(DIV_FIXUP)
5409 NODE_NAME_CASE(FMAD_FTZ)
5410 NODE_NAME_CASE(RCP)
5411 NODE_NAME_CASE(RSQ)
5412 NODE_NAME_CASE(RCP_LEGACY)
5413 NODE_NAME_CASE(RCP_IFLAG)
5414 NODE_NAME_CASE(LOG)
5415 NODE_NAME_CASE(EXP)
5416 NODE_NAME_CASE(FMUL_LEGACY)
5417 NODE_NAME_CASE(RSQ_CLAMP)
5418 NODE_NAME_CASE(FP_CLASS)
5419 NODE_NAME_CASE(DOT4)
5420 NODE_NAME_CASE(CARRY)
5421 NODE_NAME_CASE(BORROW)
5422 NODE_NAME_CASE(BFE_U32)
5423 NODE_NAME_CASE(BFE_I32)
5424 NODE_NAME_CASE(BFI)
5425 NODE_NAME_CASE(BFM)
5426 NODE_NAME_CASE(FFBH_U32)
5427 NODE_NAME_CASE(FFBH_I32)
5428 NODE_NAME_CASE(FFBL_B32)
5429 NODE_NAME_CASE(MUL_U24)
5430 NODE_NAME_CASE(MUL_I24)
5431 NODE_NAME_CASE(MULHI_U24)
5432 NODE_NAME_CASE(MULHI_I24)
5433 NODE_NAME_CASE(MAD_U24)
5434 NODE_NAME_CASE(MAD_I24)
5435 NODE_NAME_CASE(MAD_I64_I32)
5436 NODE_NAME_CASE(MAD_U64_U32)
5437 NODE_NAME_CASE(PERM)
5438 NODE_NAME_CASE(TEXTURE_FETCH)
5439 NODE_NAME_CASE(R600_EXPORT)
5440 NODE_NAME_CASE(CONST_ADDRESS)
5441 NODE_NAME_CASE(REGISTER_LOAD)
5442 NODE_NAME_CASE(REGISTER_STORE)
5443 NODE_NAME_CASE(SAMPLE)
5444 NODE_NAME_CASE(SAMPLEB)
5445 NODE_NAME_CASE(SAMPLED)
5446 NODE_NAME_CASE(SAMPLEL)
5447 NODE_NAME_CASE(CVT_F32_UBYTE0)
5448 NODE_NAME_CASE(CVT_F32_UBYTE1)
5449 NODE_NAME_CASE(CVT_F32_UBYTE2)
5450 NODE_NAME_CASE(CVT_F32_UBYTE3)
5451 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5452 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5453 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5454 NODE_NAME_CASE(CVT_PK_I16_I32)
5455 NODE_NAME_CASE(CVT_PK_U16_U32)
5456 NODE_NAME_CASE(FP_TO_FP16)
5457 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5458 NODE_NAME_CASE(CONST_DATA_PTR)
5459 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5460 NODE_NAME_CASE(LDS)
5461 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
5462 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
5463 NODE_NAME_CASE(DUMMY_CHAIN)
5464 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
5465 NODE_NAME_CASE(LOAD_D16_HI)
5466 NODE_NAME_CASE(LOAD_D16_LO)
5467 NODE_NAME_CASE(LOAD_D16_HI_I8)
5468 NODE_NAME_CASE(LOAD_D16_HI_U8)
5469 NODE_NAME_CASE(LOAD_D16_LO_I8)
5470 NODE_NAME_CASE(LOAD_D16_LO_U8)
5471 NODE_NAME_CASE(STORE_MSKOR)
5472 NODE_NAME_CASE(LOAD_CONSTANT)
5473 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5474 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5475 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5476 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5477 NODE_NAME_CASE(DS_ORDERED_COUNT)
5478 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5479 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
5480 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
5481 NODE_NAME_CASE(BUFFER_LOAD)
5482 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5483 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5484 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5485 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5486 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5487 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5488 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5489 NODE_NAME_CASE(SBUFFER_LOAD)
5490 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5491 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5492 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5493 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5494 NODE_NAME_CASE(BUFFER_STORE)
5495 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5496 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5497 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5498 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5499 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5500 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5501 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5502 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5503 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5504 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5505 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5506 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5507 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5508 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5509 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5510 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5511 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5512 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5513 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5514 NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
5515 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5516 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5517 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5518
5519 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
5520 }
5521 return nullptr;
5522}
5523
5524SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5525 SelectionDAG &DAG, int Enabled,
5526 int &RefinementSteps,
5527 bool &UseOneConstNR,
5528 bool Reciprocal) const {
5529 EVT VT = Operand.getValueType();
5530
5531 if (VT == MVT::f32) {
5532 RefinementSteps = 0;
5533 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(Operand), VT, Operand);
5534 }
5535
  // TODO: There is also an f64 rsq instruction, but the documentation is less
5537 // clear on its precision.
5538
5539 return SDValue();
5540}
5541
5542SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5543 SelectionDAG &DAG, int Enabled,
5544 int &RefinementSteps) const {
5545 EVT VT = Operand.getValueType();
5546
5547 if (VT == MVT::f32) {
5548 // Reciprocal, < 1 ulp error.
5549 //
    // This reciprocal approximation converges to < 0.5 ulp error with one
    // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5552
5553 RefinementSteps = 0;
5554 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SDLoc(Operand), VT, Operand);
5555 }
5556
  // TODO: There is also an f64 rcp instruction, but the documentation is less
5558 // clear on its precision.
5559
5560 return SDValue();
5561}
5562
5563static unsigned workitemIntrinsicDim(unsigned ID) {
5564 switch (ID) {
5565 case Intrinsic::amdgcn_workitem_id_x:
5566 return 0;
5567 case Intrinsic::amdgcn_workitem_id_y:
5568 return 1;
5569 case Intrinsic::amdgcn_workitem_id_z:
5570 return 2;
5571 default:
5572 llvm_unreachable("not a workitem intrinsic");
5573 }
5574}
5575
5576void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5577 const SDValue Op, KnownBits &Known,
5578 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5579
5580 Known.resetAll(); // Don't know anything.
5581
5582 unsigned Opc = Op.getOpcode();
5583
5584 switch (Opc) {
5585 default:
5586 break;
5587 case AMDGPUISD::CARRY:
5588 case AMDGPUISD::BORROW: {
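    // The carry/borrow result is 0 or 1, so all bits above the low bit are
    // known zero.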
5589 Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 31);
5590 break;
5591 }
5592
5593 case AMDGPUISD::BFE_I32:
5594 case AMDGPUISD::BFE_U32: {
5595 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5596 if (!CWidth)
5597 return;
5598
5599 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5600
5601 if (Opc == AMDGPUISD::BFE_U32)
5602 Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 32 - Width);
5603
5604 break;
5605 }
5606 case AMDGPUISD::FP_TO_FP16: {
5607 unsigned BitWidth = Known.getBitWidth();
5608
5609 // High bits are zero.
5610 Known.Zero = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 16);
5611 break;
5612 }
5613 case AMDGPUISD::MUL_U24:
5614 case AMDGPUISD::MUL_I24: {
5615 KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5616 KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
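    // The product has at least as many trailing zeros as both factors
    // combined.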
5617 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5618 RHSKnown.countMinTrailingZeros();
5619 Known.Zero.setLowBits(std::min(a: TrailZ, b: 32u));
5620 // Skip extra check if all bits are known zeros.
5621 if (TrailZ >= 32)
5622 break;
5623
5624 // Truncate to 24 bits.
5625 LHSKnown = LHSKnown.trunc(BitWidth: 24);
5626 RHSKnown = RHSKnown.trunc(BitWidth: 24);
5627
5628 if (Opc == AMDGPUISD::MUL_I24) {
5629 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5630 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5631 unsigned MaxValBits = LHSValBits + RHSValBits;
5632 if (MaxValBits > 32)
5633 break;
5634 unsigned SignBits = 32 - MaxValBits + 1;
5635 bool LHSNegative = LHSKnown.isNegative();
5636 bool LHSNonNegative = LHSKnown.isNonNegative();
5637 bool LHSPositive = LHSKnown.isStrictlyPositive();
5638 bool RHSNegative = RHSKnown.isNegative();
5639 bool RHSNonNegative = RHSKnown.isNonNegative();
5640 bool RHSPositive = RHSKnown.isStrictlyPositive();
5641
5642 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5643 Known.Zero.setHighBits(SignBits);
5644 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5645 Known.One.setHighBits(SignBits);
5646 } else {
5647 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5648 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5649 unsigned MaxValBits = LHSValBits + RHSValBits;
5650 if (MaxValBits >= 32)
5651 break;
5652 Known.Zero.setBitsFrom(MaxValBits);
5653 }
5654 break;
5655 }
5656 case AMDGPUISD::PERM: {
5657 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5658 if (!CMask)
5659 return;
5660
5661 KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5662 KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5663 unsigned Sel = CMask->getZExtValue();
5664
5665 for (unsigned I = 0; I < 32; I += 8) {
5666 unsigned SelBits = Sel & 0xff;
5667 if (SelBits < 4) {
5668 SelBits *= 8;
5669 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5670 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5671 } else if (SelBits < 7) {
5672 SelBits = (SelBits & 3) * 8;
5673 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5674 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5675 } else if (SelBits == 0x0c) {
5676 Known.Zero |= 0xFFull << I;
5677 } else if (SelBits > 0x0c) {
5678 Known.One |= 0xFFull << I;
5679 }
5680 Sel >>= 8;
5681 }
5682 break;
5683 }
5684 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5685 Known.Zero.setHighBits(24);
5686 break;
5687 }
5688 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5689 Known.Zero.setHighBits(16);
5690 break;
5691 }
5692 case AMDGPUISD::LDS: {
5693 auto GA = cast<GlobalAddressSDNode>(Val: Op.getOperand(i: 0).getNode());
5694 Align Alignment = GA->getGlobal()->getPointerAlignment(DL: DAG.getDataLayout());
5695
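    // The LDS address fits in the low 16 bits, and its low bits follow from
    // the global's alignment.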
5696 Known.Zero.setHighBits(16);
5697 Known.Zero.setLowBits(Log2(A: Alignment));
5698 break;
5699 }
5700 case AMDGPUISD::SMIN3:
5701 case AMDGPUISD::SMAX3:
5702 case AMDGPUISD::SMED3:
5703 case AMDGPUISD::UMIN3:
5704 case AMDGPUISD::UMAX3:
5705 case AMDGPUISD::UMED3: {
5706 KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
5707 if (Known2.isUnknown())
5708 break;
5709
5710 KnownBits Known1 = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5711 if (Known1.isUnknown())
5712 break;
5713
5714 KnownBits Known0 = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5715 if (Known0.isUnknown())
5716 break;
5717
5718 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5719 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5720 Known.One = Known0.One & Known1.One & Known2.One;
5721 break;
5722 }
5723 case ISD::INTRINSIC_WO_CHAIN: {
5724 unsigned IID = Op.getConstantOperandVal(i: 0);
5725 switch (IID) {
5726 case Intrinsic::amdgcn_workitem_id_x:
5727 case Intrinsic::amdgcn_workitem_id_y:
5728 case Intrinsic::amdgcn_workitem_id_z: {
5729 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5730 Kernel: DAG.getMachineFunction().getFunction(), Dimension: workitemIntrinsicDim(ID: IID));
5731 Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
5732 break;
5733 }
5734 default:
5735 break;
5736 }
5737 }
5738 }
5739}
5740
5741unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5742 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5743 unsigned Depth) const {
5744 switch (Op.getOpcode()) {
5745 case AMDGPUISD::BFE_I32: {
5746 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5747 if (!Width)
5748 return 1;
5749
5750 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5751 if (!isNullConstant(V: Op.getOperand(i: 1)))
5752 return SignBits;
5753
5754 // TODO: Could probably figure something out with non-0 offsets.
5755 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5756 return std::max(a: SignBits, b: Op0SignBits);
5757 }
5758
5759 case AMDGPUISD::BFE_U32: {
5760 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5761 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5762 }
5763
5764 case AMDGPUISD::CARRY:
5765 case AMDGPUISD::BORROW:
5766 return 31;
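  // Extending buffer loads: an i8 sign-extended to i32 has 32 - 8 + 1 = 25
  // sign bits and an i16 has 17; zero-extended loads have 24 and 16
  // known-zero high bits respectively.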
5767 case AMDGPUISD::BUFFER_LOAD_BYTE:
5768 return 25;
5769 case AMDGPUISD::BUFFER_LOAD_SHORT:
5770 return 17;
5771 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5772 return 24;
5773 case AMDGPUISD::BUFFER_LOAD_USHORT:
5774 return 16;
5775 case AMDGPUISD::FP_TO_FP16:
5776 return 16;
5777 case AMDGPUISD::SMIN3:
5778 case AMDGPUISD::SMAX3:
5779 case AMDGPUISD::SMED3:
5780 case AMDGPUISD::UMIN3:
5781 case AMDGPUISD::UMAX3:
5782 case AMDGPUISD::UMED3: {
5783 unsigned Tmp2 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
5784 if (Tmp2 == 1)
5785 return 1; // Early out.
5786
5787 unsigned Tmp1 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5788 if (Tmp1 == 1)
5789 return 1; // Early out.
5790
5791 unsigned Tmp0 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5792 if (Tmp0 == 1)
5793 return 1; // Early out.
5794
5795 return std::min(a: Tmp0, b: std::min(a: Tmp1, b: Tmp2));
5796 }
5797 default:
5798 return 1;
5799 }
5800}
5801
unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
    GISelKnownBits &Analysis, Register R,
    const APInt &DemandedElts, const MachineRegisterInfo &MRI,
    unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  if (!MI)
    return 1;

  // TODO: Check range metadata on MMO.
  switch (MI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
    return 25;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
    return 17;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
    unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
    if (Tmp2 == 1)
      return 1;
    unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
    if (Tmp1 == 1)
      return 1;
    unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
    if (Tmp0 == 1)
      return 1;
    return std::min(Tmp0, std::min(Tmp1, Tmp2));
  }
  default:
    return 1;
  }
}

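// Conservatively decide whether an AMDGPU-specific node can produce a NaN.
// When the query is only about signaling NaNs (SNaN), most of these
// operations qualify since they are not expected to produce an sNaN;
// otherwise the result is known NaN-free only when the relevant operands are.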
bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                        const SelectionDAG &DAG,
                                                        bool SNaN,
                                                        unsigned Depth) const {
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    if (SNaN)
      return true;

    // TODO: Can check no nans on one of the operands for each one, but which
    // one?
    return false;
  }
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::CVT_PKRTZ_F16_F32: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
  }
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMINIMUM3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMAD_FTZ: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
  }
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return true;

  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_CLAMP: {
    if (SNaN)
      return true;

    // TODO: Need is known positive check.
    return false;
  }
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
    // TODO: Refine on operands.
    return SNaN;
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW: {
    // TODO: Need check for infinity
    return SNaN;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cubeid:
      return true;

    case Intrinsic::amdgcn_frexp_mant: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_cvt_pkrtz: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp: {
      if (SNaN)
        return true;

      // TODO: Need is known positive check.
      return false;
    }
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_fdot2:
      // TODO: Refine on operand
      return SNaN;
    case Intrinsic::amdgcn_fma_legacy:
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

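// GlobalISel heuristic: only treat reassociation as profitable when the
// intermediate value N0 has a single non-debug use (see the FIXME below about
// register banks).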
bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                               Register N0,
                                               Register N1) const {
  return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}

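// Decide how an atomicrmw is legalized: nand and the floating-point
// read-modify-write operations listed here, as well as integer widths other
// than 32 and 64 bits, are expanded to a compare-exchange loop; 32- and
// 64-bit integer operations are left for instruction selection.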
TargetLowering::AtomicExpansionKind
AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    return AtomicExpansionKind::CmpXChg;
  default: {
    if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
      unsigned Size = IntTy->getBitWidth();
      if (Size == 32 || Size == 64)
        return AtomicExpansionKind::None;
    }

    return AtomicExpansionKind::CmpXChg;
  }
  }
}

/// Whether it is profitable to sink the operands of an
/// Instruction I to the basic block of I.
/// This helps using several modifiers (like abs and neg) more often.
bool AMDGPUTargetLowering::shouldSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace PatternMatch;

  for (auto &Op : I->operands()) {
    // Ensure we are not already sinking this operand.
    if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
      continue;

    if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
      Ops.push_back(&Op);
  }

  return !Ops.empty();
}