1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21#include "SIInstrInfo.h"
22#include "SIMachineFunctionInfo.h"
23#include "SIRegisterInfo.h"
24#include "Utils/AMDGPUBaseInfo.h"
25#include "llvm/ADT/ScopeExit.h"
26#include "llvm/BinaryFormat/ELF.h"
27#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
31#include "llvm/CodeGen/GlobalISel/Utils.h"
32#include "llvm/CodeGen/TargetOpcodes.h"
33#include "llvm/IR/DiagnosticInfo.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Hack until load/store selection patterns support any tuple of legal types.
46static cl::opt<bool> EnableNewLegality(
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
50 cl::init(Val: false),
51 cl::ReallyHidden);
52
53static constexpr unsigned MaxRegisterSize = 1024;
54
55// Round the number of vector elements up to the next power of two.
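// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.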
56static LLT getPow2VectorType(LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(Value: NElts);
59 return Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: Pow2NElts));
60}
61
62// Round the scalar size in bits up to the next power of two.
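// e.g. s48 -> s64, s96 -> s128.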
63static LLT getPow2ScalarType(LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Value: Bits);
66 return LLT::scalar(SizeInBits: Pow2Bits);
67}
68
69/// \returns true if this is an odd-sized vector which should be widened by adding an
70/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71/// excludes s1 vectors, which should always be scalarized.
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(NumElements: Ty.getNumElements() + 1, ScalarTy: EltTy));
107 };
108}
109
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
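// Split into roughly 64-bit pieces, e.g. <4 x s32> (128 bits) -> <2 x s32>.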
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 EC: ElementCount::getFixed(MinVal: NewNumElts), ScalarTy: EltTy));
119 };
120}
121
122// Increase the number of vector elements so the total size reaches the next
123// multiple of 32 bits.
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
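// Compute how many 32-bit slots are needed, then how many elements fill them,
// e.g. <3 x s16> (48 bits) needs two slots -> <4 x s16>.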
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: EltTy));
137 };
138}
139
140// Increase the number of vector elements to reach the next legal RegClass.
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
149 assert(Ty.getSizeInBits() < MaxRegisterSize);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NumElements: NewNumElts, ScalarSizeInBits: EltSize));
159 };
160}
161
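// Scalar form of a buffer resource: p8 -> s128, <N x p8> -> <N x s128>.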
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(SizeInBits: 128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(EC: NumElems, ScalarTy: LLT::scalar(SizeInBits: 128));
167}
168
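// Register form of a buffer resource: p8 -> <4 x s32>, <N x p8> -> <4N x s32>.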
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(NumElements: 4, ScalarTy: LLT::scalar(SizeInBits: 32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElements: NumElems * 4, ScalarTy: LLT::scalar(SizeInBits: 32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(SizeInBits: Size);
183 }
184
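 // e.g. <6 x s16> -> <3 x s32>, <8 x s16> -> <4 x s32>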
185 return LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32);
186}
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
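// Bitcast to a vector of 32-bit elements (or a single s32),
// e.g. s64 -> <2 x s32>, <4 x s16> -> <2 x s32>.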
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32));
202 };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228}
229
230static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
235static bool isRegisterVectorType(LLT Ty) {
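 // 16-bit element vectors must have an even number of elements (v2s16 units);
 // the total-size check happens in isRegisterType.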
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242// TODO: replace all uses of isRegisterType with isRegisterClassType
243static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Size: Ty.getSizeInBits()))
245 return false;
246
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
249
250 return true;
251}
252
253// Any combination of 32 or 64-bit elements up to the maximum register size, and
254// multiples of v2s16.
255static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Ty: Query.Types[TypeIdx]);
258 };
259}
260
261// RegisterType that doesn't have a corresponding RegClass.
262// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263// should be removed.
264static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267 return isRegisterType(Ty) &&
268 !SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: Ty.getSizeInBits());
269 };
270}
271
272static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(SizeInBits: 16) || EltTy.getSizeInBits() >= 32;
279 };
280}
281
282static const LLT S1 = LLT::scalar(SizeInBits: 1);
283static const LLT S8 = LLT::scalar(SizeInBits: 8);
284static const LLT S16 = LLT::scalar(SizeInBits: 16);
285static const LLT S32 = LLT::scalar(SizeInBits: 32);
286static const LLT S64 = LLT::scalar(SizeInBits: 64);
287static const LLT S96 = LLT::scalar(SizeInBits: 96);
288static const LLT S128 = LLT::scalar(SizeInBits: 128);
289static const LLT S160 = LLT::scalar(SizeInBits: 160);
290static const LLT S224 = LLT::scalar(SizeInBits: 224);
291static const LLT S256 = LLT::scalar(SizeInBits: 256);
292static const LLT S512 = LLT::scalar(SizeInBits: 512);
293static const LLT MaxScalar = LLT::scalar(SizeInBits: MaxRegisterSize);
294
295static const LLT V2S8 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 8);
296static const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
297static const LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
298static const LLT V6S16 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 16);
299static const LLT V8S16 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16);
300static const LLT V10S16 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 16);
301static const LLT V12S16 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 16);
302static const LLT V16S16 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 16);
303
304static const LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
305static const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
306static const LLT V4S32 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
307static const LLT V5S32 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 32);
308static const LLT V6S32 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 32);
309static const LLT V7S32 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 32);
310static const LLT V8S32 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32);
311static const LLT V9S32 = LLT::fixed_vector(NumElements: 9, ScalarSizeInBits: 32);
312static const LLT V10S32 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 32);
313static const LLT V11S32 = LLT::fixed_vector(NumElements: 11, ScalarSizeInBits: 32);
314static const LLT V12S32 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 32);
315static const LLT V16S32 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 32);
316static const LLT V32S32 = LLT::fixed_vector(NumElements: 32, ScalarSizeInBits: 32);
317
318static const LLT V2S64 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
319static const LLT V3S64 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 64);
320static const LLT V4S64 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 64);
321static const LLT V5S64 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 64);
322static const LLT V6S64 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 64);
323static const LLT V7S64 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 64);
324static const LLT V8S64 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 64);
325static const LLT V16S64 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 64);
326
327static const LLT V2S128 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 128);
328static const LLT V4S128 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 128);
329
330static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
331 S160, S224, S256, S512};
332
333static std::initializer_list<LLT> AllS16Vectors{
334 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
335
336static std::initializer_list<LLT> AllS32Vectors = {
337 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
338 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
339
340static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
341 V6S64, V7S64, V8S64, V16S64};
342
343// Checks whether a type is in the list of legal register types.
344static bool isRegisterClassType(LLT Ty) {
345 if (Ty.isPointerOrPointerVector())
346 Ty = Ty.changeElementType(NewEltTy: LLT::scalar(SizeInBits: Ty.getScalarSizeInBits()));
347
348 return is_contained(Set: AllS32Vectors, Element: Ty) || is_contained(Set: AllS64Vectors, Element: Ty) ||
349 is_contained(Set: AllScalarTypes, Element: Ty) || is_contained(Set: AllS16Vectors, Element: Ty);
350}
351
352static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
353 return [TypeIdx](const LegalityQuery &Query) {
354 return isRegisterClassType(Ty: Query.Types[TypeIdx]);
355 };
356}
357
358// If we have a truncating store or an extending load with a data size larger
359// than 32 bits, we need to reduce to a 32-bit type.
360static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
361 return [=](const LegalityQuery &Query) {
362 const LLT Ty = Query.Types[TypeIdx];
363 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
364 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
365 };
366}
367
368// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
369// handle some operations by just promoting the register during
370// selection. There are also d16 loads on GFX9+ which preserve the high bits.
371static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
372 bool IsLoad, bool IsAtomic) {
373 switch (AS) {
374 case AMDGPUAS::PRIVATE_ADDRESS:
375 // FIXME: Private element size.
376 return ST.enableFlatScratch() ? 128 : 32;
377 case AMDGPUAS::LOCAL_ADDRESS:
378 return ST.useDS128() ? 128 : 64;
379 case AMDGPUAS::GLOBAL_ADDRESS:
380 case AMDGPUAS::CONSTANT_ADDRESS:
381 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
382 case AMDGPUAS::BUFFER_RESOURCE:
383 // Treat constant and global as identical. SMRD loads are sometimes usable for
384 // global loads (ideally constant address space should be eliminated)
385 // depending on the context. Legality cannot be context dependent, but
386 // RegBankSelect can split the load as necessary depending on the pointer
387 // register bank/uniformity and if the memory is invariant or not written in a
388 // kernel.
389 return IsLoad ? 512 : 128;
390 default:
391 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
392 // if they may alias scratch depending on the subtarget. This needs to be
393 // moved to custom handling to use addressMayBeAccessedAsPrivate
394 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
395 }
396}
397
398static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
399 const LegalityQuery &Query) {
400 const LLT Ty = Query.Types[0];
401
402 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
403 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
404
405 unsigned RegSize = Ty.getSizeInBits();
406 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
407 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
408 unsigned AS = Query.Types[1].getAddressSpace();
409
410 // All of these need to be custom lowered to cast the pointer operand.
411 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
412 return false;
413
414 // Do not handle extending vector loads.
415 if (Ty.isVector() && MemSize != RegSize)
416 return false;
417
418 // TODO: We should be able to widen loads if the alignment is high enough, but
419 // we also need to modify the memory access size.
420#if 0
421 // Accept widening loads based on alignment.
422 if (IsLoad && MemSize < Size)
423 MemSize = std::max(MemSize, Align);
424#endif
425
426 // Only 1-byte and 2-byte to 32-bit extloads are valid.
427 if (MemSize != RegSize && RegSize != 32)
428 return false;
429
430 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
431 IsAtomic: Query.MMODescrs[0].Ordering !=
432 AtomicOrdering::NotAtomic))
433 return false;
434
435 switch (MemSize) {
436 case 8:
437 case 16:
438 case 32:
439 case 64:
440 case 128:
441 break;
442 case 96:
443 if (!ST.hasDwordx3LoadStores())
444 return false;
445 break;
446 case 256:
447 case 512:
448 // These may contextually need to be broken down.
449 break;
450 default:
451 return false;
452 }
453
454 assert(RegSize >= MemSize);
455
456 if (AlignBits < MemSize) {
457 const SITargetLowering *TLI = ST.getTargetLowering();
458 if (!TLI->allowsMisalignedMemoryAccessesImpl(Size: MemSize, AddrSpace: AS,
459 Alignment: Align(AlignBits / 8)))
460 return false;
461 }
462
463 return true;
464}
465
466// The newer buffer intrinsic forms take their resource arguments as
467// pointers in address space 8, aka s128 values. However, in order to not break
468// SelectionDAG, the underlying operations have to continue to take v4i32
469// arguments. Therefore, we convert resource pointers (or vectors of them)
470// to integer values here.
471static bool hasBufferRsrcWorkaround(const LLT Ty) {
472 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
473 return true;
474 if (Ty.isVector()) {
475 const LLT ElemTy = Ty.getElementType();
476 return hasBufferRsrcWorkaround(Ty: ElemTy);
477 }
478 return false;
479}
480
481// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
482// work around this. Eventually it should ignore the type for loads and only care
483// about the size. Return true in cases where we will work around this for now by
484// bitcasting.
485static bool loadStoreBitcastWorkaround(const LLT Ty) {
486 if (EnableNewLegality)
487 return false;
488
489 const unsigned Size = Ty.getSizeInBits();
490 if (Size <= 64)
491 return false;
492 // Address space 8 pointers get their own workaround.
493 if (hasBufferRsrcWorkaround(Ty))
494 return false;
495 if (!Ty.isVector())
496 return true;
497
498 if (Ty.isPointerVector())
499 return true;
500
501 unsigned EltSize = Ty.getScalarSizeInBits();
502 return EltSize != 32 && EltSize != 64;
503}
504
505static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
506 const LLT Ty = Query.Types[0];
507 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
508 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
509}
510
511/// Return true if a load or store of the type should be lowered with a bitcast
512/// to a different type.
513static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
514 const LLT MemTy) {
515 const unsigned MemSizeInBits = MemTy.getSizeInBits();
516 const unsigned Size = Ty.getSizeInBits();
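 // Extending load / truncating store: only bitcast small (<= 32-bit) vectors.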
517 if (Size != MemSizeInBits)
518 return Size <= 32 && Ty.isVector();
519
520 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
521 return true;
522
523 // Don't try to handle bitcasting vector ext loads for now.
524 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
525 (Size <= 32 || isRegisterSize(Size)) &&
526 !isRegisterVectorElementType(EltTy: Ty.getElementType());
527}
528
529/// Return true if we should legalize a load by widening an odd-sized memory
530/// access up to the alignment. Note that in this case the memory access itself
531/// changes, not the size of the result register.
532static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
533 uint64_t AlignInBits, unsigned AddrSpace,
534 unsigned Opcode) {
535 unsigned SizeInBits = MemoryTy.getSizeInBits();
536 // We don't want to widen cases that are naturally legal.
537 if (isPowerOf2_32(Value: SizeInBits))
538 return false;
539
540 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
541 // end up widening these for a scalar load during RegBankSelect, if we don't
542 // have 96-bit scalar loads.
543 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
544 return false;
545
546 if (SizeInBits >= maxSizeForAddrSpace(ST, AS: AddrSpace, IsLoad: Opcode, IsAtomic: false))
547 return false;
548
549 // A load is known dereferenceable up to the alignment, so it's legal to widen
550 // to it.
551 //
552 // TODO: Could check dereferenceable for less aligned cases.
553 unsigned RoundedSize = NextPowerOf2(A: SizeInBits);
554 if (AlignInBits < RoundedSize)
555 return false;
556
557 // Do not widen if it would introduce a slow unaligned load.
558 const SITargetLowering *TLI = ST.getTargetLowering();
559 unsigned Fast = 0;
560 return TLI->allowsMisalignedMemoryAccessesImpl(
561 Size: RoundedSize, AddrSpace, Alignment: Align(AlignInBits / 8),
562 Flags: MachineMemOperand::MOLoad, IsFast: &Fast) &&
563 Fast;
564}
565
566static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
567 unsigned Opcode) {
568 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
569 return false;
570
571 return shouldWidenLoad(ST, MemoryTy: Query.MMODescrs[0].MemoryTy,
572 AlignInBits: Query.MMODescrs[0].AlignInBits,
573 AddrSpace: Query.Types[1].getAddressSpace(), Opcode);
574}
575
576/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
577/// type of the operand `Idx` and then to transform it to a `p8` via bitcasts
578/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
579static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
580 MachineRegisterInfo &MRI, unsigned Idx) {
581 MachineOperand &MO = MI.getOperand(i: Idx);
582
583 const LLT PointerTy = MRI.getType(Reg: MO.getReg());
584
585 // Paranoidly prevent us from doing this multiple times.
586 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
587 return PointerTy;
588
589 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
590 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
591 if (!PointerTy.isVector()) {
592 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
593 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
594 const LLT S32 = LLT::scalar(SizeInBits: 32);
595
596 Register VectorReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
597 std::array<Register, 4> VectorElems;
598 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
599 for (unsigned I = 0; I < NumParts; ++I)
600 VectorElems[I] =
601 B.buildExtractVectorElementConstant(Res: S32, Val: VectorReg, Idx: I).getReg(Idx: 0);
602 B.buildMergeValues(Res: MO, Ops: VectorElems);
603 MO.setReg(VectorReg);
604 return VectorTy;
605 }
606 Register BitcastReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
607 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
608 auto Scalar = B.buildBitcast(Dst: ScalarTy, Src: BitcastReg);
609 B.buildIntToPtr(Dst: MO, Src: Scalar);
610 MO.setReg(BitcastReg);
611
612 return VectorTy;
613}
614
615/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
616/// the form in which the value must be in order to be passed to the low-level
617/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
618/// needed in order to account for the fact that we can't define a register
619/// class for s128 without breaking SelectionDAG.
620static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
621 MachineRegisterInfo &MRI = *B.getMRI();
622 const LLT PointerTy = MRI.getType(Reg: Pointer);
623 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
624 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
625
626 if (!PointerTy.isVector()) {
627 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
628 SmallVector<Register, 4> PointerParts;
629 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
630 auto Unmerged = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: Pointer);
631 for (unsigned I = 0; I < NumParts; ++I)
632 PointerParts.push_back(Elt: Unmerged.getReg(Idx: I));
633 return B.buildBuildVector(Res: VectorTy, Ops: PointerParts).getReg(Idx: 0);
634 }
635 Register Scalar = B.buildPtrToInt(Dst: ScalarTy, Src: Pointer).getReg(Idx: 0);
636 return B.buildBitcast(Dst: VectorTy, Src: Scalar).getReg(Idx: 0);
637}
638
639static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
640 unsigned Idx) {
641 MachineOperand &MO = MI.getOperand(i: Idx);
642
643 const LLT PointerTy = B.getMRI()->getType(Reg: MO.getReg());
644 // Paranoidly prevent us from doing this multiple times.
645 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
646 return;
647 MO.setReg(castBufferRsrcToV4I32(Pointer: MO.getReg(), B));
648}
649
650AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
651 const GCNTargetMachine &TM)
652 : ST(ST_) {
653 using namespace TargetOpcode;
654
655 auto GetAddrSpacePtr = [&TM](unsigned AS) {
656 return LLT::pointer(AddressSpace: AS, SizeInBits: TM.getPointerSizeInBits(AS));
657 };
658
659 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
660 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
661 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
662 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
663 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
664 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
665 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
666 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
667 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
668 const LLT BufferStridedPtr =
669 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
670
671 const LLT CodePtr = FlatPtr;
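 // Code pointers (e.g. for G_BLOCK_ADDR) are treated as flat pointers.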
672
673 const std::initializer_list<LLT> AddrSpaces64 = {
674 GlobalPtr, ConstantPtr, FlatPtr
675 };
676
677 const std::initializer_list<LLT> AddrSpaces32 = {
678 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
679 };
680
681 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
682
683 const std::initializer_list<LLT> FPTypesBase = {
684 S32, S64
685 };
686
687 const std::initializer_list<LLT> FPTypes16 = {
688 S32, S64, S16
689 };
690
691 const std::initializer_list<LLT> FPTypesPK16 = {
692 S32, S64, S16, V2S16
693 };
694
695 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
696
697 // s1 for VCC branches, s32 for SCC branches.
698 getActionDefinitionsBuilder(Opcode: G_BRCOND).legalFor(Types: {S1, S32});
699
700 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
701 // elements for v3s16
702 getActionDefinitionsBuilder(Opcode: G_PHI)
703 .legalFor(Types: {S32, S64, V2S16, S16, V4S16, S1, S128, S256})
704 .legalFor(Types: AllS32Vectors)
705 .legalFor(Types: AllS64Vectors)
706 .legalFor(Types: AddrSpaces64)
707 .legalFor(Types: AddrSpaces32)
708 .legalFor(Types: AddrSpaces128)
709 .legalIf(Predicate: isPointer(TypeIdx: 0))
710 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S256)
711 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
712 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16)
713 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
714 .scalarize(TypeIdx: 0);
715
716 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
717 // Full set of gfx9 features.
718 if (ST.hasScalarAddSub64()) {
719 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
720 .legalFor(Types: {S64, S32, S16, V2S16})
721 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
722 .scalarize(TypeIdx: 0)
723 .minScalar(TypeIdx: 0, Ty: S16)
724 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
725 .maxScalar(TypeIdx: 0, Ty: S32);
726 } else {
727 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
728 .legalFor(Types: {S32, S16, V2S16})
729 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
730 .scalarize(TypeIdx: 0)
731 .minScalar(TypeIdx: 0, Ty: S16)
732 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
733 .maxScalar(TypeIdx: 0, Ty: S32);
734 }
735
736 if (ST.hasScalarSMulU64()) {
737 getActionDefinitionsBuilder(Opcode: G_MUL)
738 .legalFor(Types: {S64, S32, S16, V2S16})
739 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
740 .scalarize(TypeIdx: 0)
741 .minScalar(TypeIdx: 0, Ty: S16)
742 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
743 .custom();
744 } else {
745 getActionDefinitionsBuilder(Opcode: G_MUL)
746 .legalFor(Types: {S32, S16, V2S16})
747 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
748 .scalarize(TypeIdx: 0)
749 .minScalar(TypeIdx: 0, Ty: S16)
750 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
751 .custom();
752 }
753 assert(ST.hasMad64_32());
754
755 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
756 .legalFor(Types: {S32, S16, V2S16}) // Clamp modifier
757 .minScalarOrElt(TypeIdx: 0, Ty: S16)
758 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
759 .scalarize(TypeIdx: 0)
760 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
761 .lower();
762 } else if (ST.has16BitInsts()) {
763 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
764 .legalFor(Types: {S32, S16})
765 .minScalar(TypeIdx: 0, Ty: S16)
766 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
767 .maxScalar(TypeIdx: 0, Ty: S32)
768 .scalarize(TypeIdx: 0);
769
770 getActionDefinitionsBuilder(Opcode: G_MUL)
771 .legalFor(Types: {S32, S16})
772 .scalarize(TypeIdx: 0)
773 .minScalar(TypeIdx: 0, Ty: S16)
774 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
775 .custom();
776 assert(ST.hasMad64_32());
777
778 // Technically the saturating operations require clamp bit support, but this
779 // was introduced at the same time as 16-bit operations.
780 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
781 .legalFor(Types: {S32, S16}) // Clamp modifier
782 .minScalar(TypeIdx: 0, Ty: S16)
783 .scalarize(TypeIdx: 0)
784 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 16)
785 .lower();
786
787 // We're just lowering this, but it helps get a better result to try to
788 // coerce to the desired type first.
789 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
790 .minScalar(TypeIdx: 0, Ty: S16)
791 .scalarize(TypeIdx: 0)
792 .lower();
793 } else {
794 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
795 .legalFor(Types: {S32})
796 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
797 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
798 .scalarize(TypeIdx: 0);
799
800 auto &Mul = getActionDefinitionsBuilder(Opcode: G_MUL)
801 .legalFor(Types: {S32})
802 .scalarize(TypeIdx: 0)
803 .minScalar(TypeIdx: 0, Ty: S32)
804 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32);
805
806 if (ST.hasMad64_32())
807 Mul.custom();
808 else
809 Mul.maxScalar(TypeIdx: 0, Ty: S32);
810
811 if (ST.hasIntClamp()) {
812 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
813 .legalFor(Types: {S32}) // Clamp modifier.
814 .scalarize(TypeIdx: 0)
815 .minScalarOrElt(TypeIdx: 0, Ty: S32)
816 .lower();
817 } else {
818 // Clamp bit support was added in VI, along with 16-bit operations.
819 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
820 .minScalar(TypeIdx: 0, Ty: S32)
821 .scalarize(TypeIdx: 0)
822 .lower();
823 }
824
825 // FIXME: DAG expansion gets better results. The widening uses the smaller
826 // range values and goes for the min/max lowering directly.
827 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
828 .minScalar(TypeIdx: 0, Ty: S32)
829 .scalarize(TypeIdx: 0)
830 .lower();
831 }
832
833 getActionDefinitionsBuilder(
834 Opcodes: {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
835 .customFor(Types: {S32, S64})
836 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
837 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
838 .scalarize(TypeIdx: 0);
839
840 auto &Mulh = getActionDefinitionsBuilder(Opcodes: {G_UMULH, G_SMULH})
841 .legalFor(Types: {S32})
842 .maxScalar(TypeIdx: 0, Ty: S32);
843
844 if (ST.hasVOP3PInsts()) {
845 Mulh
846 .clampMaxNumElements(TypeIdx: 0, EltTy: S8, MaxElements: 2)
847 .lowerFor(Types: {V2S8});
848 }
849
850 Mulh
851 .scalarize(TypeIdx: 0)
852 .lower();
853
854 // Report legal for any types we can handle anywhere. For the cases only legal
855 // on the SALU, RegBankSelect will be able to re-legalize.
856 getActionDefinitionsBuilder(Opcodes: {G_AND, G_OR, G_XOR})
857 .legalFor(Types: {S32, S1, S64, V2S32, S16, V2S16, V4S16})
858 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
859 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
860 .fewerElementsIf(Predicate: vectorWiderThan(TypeIdx: 0, Size: 64), Mutation: fewerEltsToSize64Vector(TypeIdx: 0))
861 .widenScalarToNextPow2(TypeIdx: 0)
862 .scalarize(TypeIdx: 0);
863
864 getActionDefinitionsBuilder(
865 Opcodes: {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
866 .legalFor(Types: {{S32, S1}, {S32, S32}})
867 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
868 .scalarize(TypeIdx: 0);
869
870 getActionDefinitionsBuilder(Opcode: G_BITCAST)
871 // Don't worry about the size constraint.
872 .legalIf(Predicate: all(P0: isRegisterClassType(TypeIdx: 0), P1: isRegisterClassType(TypeIdx: 1)))
873 .lower();
874
875 getActionDefinitionsBuilder(Opcode: G_CONSTANT)
876 .legalFor(Types: {S1, S32, S64, S16, GlobalPtr,
877 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
878 .legalIf(Predicate: isPointer(TypeIdx: 0))
879 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
880 .widenScalarToNextPow2(TypeIdx: 0);
881
882 getActionDefinitionsBuilder(Opcode: G_FCONSTANT)
883 .legalFor(Types: {S32, S64, S16})
884 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
885
886 getActionDefinitionsBuilder(Opcodes: {G_IMPLICIT_DEF, G_FREEZE})
887 .legalIf(Predicate: isRegisterType(TypeIdx: 0))
888 // s1 and s16 are special cases because they have legal operations on
889 // them, but don't really occupy registers in the normal way.
890 .legalFor(Types: {S1, S16})
891 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
892 .clampScalarOrElt(TypeIdx: 0, MinTy: S32, MaxTy: MaxScalar)
893 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
894 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16);
895
896 getActionDefinitionsBuilder(Opcode: G_FRAME_INDEX).legalFor(Types: {PrivatePtr});
897
898 // If the amount is divergent, we have to do a wave reduction to get the
899 // maximum value, so this is expanded during RegBankSelect.
900 getActionDefinitionsBuilder(Opcode: G_DYN_STACKALLOC)
901 .legalFor(Types: {{PrivatePtr, S32}});
902
903 getActionDefinitionsBuilder(Opcode: G_STACKSAVE)
904 .customFor(Types: {PrivatePtr});
905 getActionDefinitionsBuilder(Opcode: G_STACKRESTORE)
906 .legalFor(Types: {PrivatePtr});
907
908 getActionDefinitionsBuilder(Opcodes: {G_GET_FPENV, G_SET_FPENV}).customFor(Types: {S64});
909
910 getActionDefinitionsBuilder(Opcode: G_GLOBAL_VALUE)
911 .customIf(Predicate: typeIsNot(TypeIdx: 0, Type: PrivatePtr));
912
913 getActionDefinitionsBuilder(Opcode: G_BLOCK_ADDR).legalFor(Types: {CodePtr});
914
915 auto &FPOpActions = getActionDefinitionsBuilder(
916 Opcodes: { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
917 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
918 .legalFor(Types: {S32, S64});
919 auto &TrigActions = getActionDefinitionsBuilder(Opcodes: {G_FSIN, G_FCOS})
920 .customFor(Types: {S32, S64});
921 auto &FDIVActions = getActionDefinitionsBuilder(Opcode: G_FDIV)
922 .customFor(Types: {S32, S64});
923
924 if (ST.has16BitInsts()) {
925 if (ST.hasVOP3PInsts())
926 FPOpActions.legalFor(Types: {S16, V2S16});
927 else
928 FPOpActions.legalFor(Types: {S16});
929
930 TrigActions.customFor(Types: {S16});
931 FDIVActions.customFor(Types: {S16});
932 }
933
934 if (ST.hasPackedFP32Ops()) {
935 FPOpActions.legalFor(Types: {V2S32});
936 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S32, NumElts: 2);
937 }
938
939 auto &MinNumMaxNum = getActionDefinitionsBuilder(Opcodes: {
940 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
941
942 if (ST.hasVOP3PInsts()) {
943 MinNumMaxNum.customFor(Types: FPTypesPK16)
944 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
945 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
946 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
947 .scalarize(TypeIdx: 0);
948 } else if (ST.has16BitInsts()) {
949 MinNumMaxNum.customFor(Types: FPTypes16)
950 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
951 .scalarize(TypeIdx: 0);
952 } else {
953 MinNumMaxNum.customFor(Types: FPTypesBase)
954 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
955 .scalarize(TypeIdx: 0);
956 }
957
958 if (ST.hasVOP3PInsts())
959 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
960
961 FPOpActions
962 .scalarize(TypeIdx: 0)
963 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
964
965 TrigActions
966 .scalarize(TypeIdx: 0)
967 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
968
969 FDIVActions
970 .scalarize(TypeIdx: 0)
971 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
972
973 getActionDefinitionsBuilder(Opcodes: {G_FNEG, G_FABS})
974 .legalFor(Types: FPTypesPK16)
975 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
976 .scalarize(TypeIdx: 0)
977 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
978
979 if (ST.has16BitInsts()) {
980 getActionDefinitionsBuilder(Opcode: G_FSQRT)
981 .legalFor(Types: {S16})
982 .customFor(Types: {S32, S64})
983 .scalarize(TypeIdx: 0)
984 .unsupported();
985 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
986 .legalFor(Types: {S32, S64, S16})
987 .scalarize(TypeIdx: 0)
988 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
989
990 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
991 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}})
992 .scalarize(TypeIdx: 0)
993 .maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16)
994 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
995 .lower();
996
997 getActionDefinitionsBuilder(Opcode: G_FFREXP)
998 .customFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
999 .scalarize(TypeIdx: 0)
1000 .lower();
1001 } else {
1002 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1003 .customFor(Types: {S32, S64, S16})
1004 .scalarize(TypeIdx: 0)
1005 .unsupported();
1006
1007
1008 if (ST.hasFractBug()) {
1009 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1010 .customFor(Types: {S64})
1011 .legalFor(Types: {S32, S64})
1012 .scalarize(TypeIdx: 0)
1013 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1014 } else {
1015 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1016 .legalFor(Types: {S32, S64})
1017 .scalarize(TypeIdx: 0)
1018 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1019 }
1020
1021 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1022 .legalFor(Types: {{S32, S32}, {S64, S32}})
1023 .scalarize(TypeIdx: 0)
1024 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1025 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1026 .lower();
1027
1028 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1029 .customFor(Types: {{S32, S32}, {S64, S32}})
1030 .scalarize(TypeIdx: 0)
1031 .minScalar(TypeIdx: 0, Ty: S32)
1032 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1033 .lower();
1034 }
1035
1036 getActionDefinitionsBuilder(Opcode: G_FPTRUNC)
1037 .legalFor(Types: {{S32, S64}, {S16, S32}})
1038 .scalarize(TypeIdx: 0)
1039 .lower();
1040
1041 getActionDefinitionsBuilder(Opcode: G_FPEXT)
1042 .legalFor(Types: {{S64, S32}, {S32, S16}})
1043 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32))
1044 .scalarize(TypeIdx: 0);
1045
1046 auto &FSubActions = getActionDefinitionsBuilder(Opcodes: {G_FSUB, G_STRICT_FSUB});
1047 if (ST.has16BitInsts()) {
1048 FSubActions
1049 // Use actual fsub instruction
1050 .legalFor(Types: {S32, S16})
1051 // Must use fadd + fneg
1052 .lowerFor(Types: {S64, V2S16});
1053 } else {
1054 FSubActions
1055 // Use actual fsub instruction
1056 .legalFor(Types: {S32})
1057 // Must use fadd + fneg
1058 .lowerFor(Types: {S64, S16, V2S16});
1059 }
1060
1061 FSubActions
1062 .scalarize(TypeIdx: 0)
1063 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1064
1065 // Whether this is legal depends on the floating point mode for the function.
1066 auto &FMad = getActionDefinitionsBuilder(Opcode: G_FMAD);
1067 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1068 FMad.customFor(Types: {S32, S16});
1069 else if (ST.hasMadMacF32Insts())
1070 FMad.customFor(Types: {S32});
1071 else if (ST.hasMadF16())
1072 FMad.customFor(Types: {S16});
1073 FMad.scalarize(TypeIdx: 0)
1074 .lower();
1075
1076 auto &FRem = getActionDefinitionsBuilder(Opcode: G_FREM);
1077 if (ST.has16BitInsts()) {
1078 FRem.customFor(Types: {S16, S32, S64});
1079 } else {
1080 FRem.minScalar(TypeIdx: 0, Ty: S32)
1081 .customFor(Types: {S32, S64});
1082 }
1083 FRem.scalarize(TypeIdx: 0);
1084
1085 // TODO: Do we need to clamp maximum bitwidth?
1086 getActionDefinitionsBuilder(Opcode: G_TRUNC)
1087 .legalIf(Predicate: isScalar(TypeIdx: 0))
1088 .legalFor(Types: {{V2S16, V2S32}})
1089 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1090 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1091 // situations (like an invalid implicit use), we don't want to loop forever
1092 // in the legalizer.
1093 .fewerElementsIf(Predicate: elementTypeIsLegal(TypeIdx: 0), Mutation: LegalizeMutations::scalarize(TypeIdx: 0))
1094 .alwaysLegal();
1095
1096 getActionDefinitionsBuilder(Opcodes: {G_SEXT, G_ZEXT, G_ANYEXT})
1097 .legalFor(Types: {{S64, S32}, {S32, S16}, {S64, S16},
1098 {S32, S1}, {S64, S1}, {S16, S1}})
1099 .scalarize(TypeIdx: 0)
1100 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1101 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1102
1103 // TODO: Split s1->s64 during regbankselect for VALU.
1104 auto &IToFP = getActionDefinitionsBuilder(Opcodes: {G_SITOFP, G_UITOFP})
1105 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S32}})
1106 .lowerIf(Predicate: typeIs(TypeIdx: 1, TypesInit: S1))
1107 .customFor(Types: {{S32, S64}, {S64, S64}});
1108 if (ST.has16BitInsts())
1109 IToFP.legalFor(Types: {{S16, S16}});
1110 IToFP.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1111 .minScalar(TypeIdx: 0, Ty: S32)
1112 .scalarize(TypeIdx: 0)
1113 .widenScalarToNextPow2(TypeIdx: 1);
1114
1115 auto &FPToI = getActionDefinitionsBuilder(Opcodes: {G_FPTOSI, G_FPTOUI})
1116 .legalFor(Types: {{S32, S32}, {S32, S64}, {S32, S16}})
1117 .customFor(Types: {{S64, S32}, {S64, S64}})
1118 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32));
1119 if (ST.has16BitInsts())
1120 FPToI.legalFor(Types: {{S16, S16}});
1121 else
1122 FPToI.minScalar(TypeIdx: 1, Ty: S32);
1123
1124 FPToI.minScalar(TypeIdx: 0, Ty: S32)
1125 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1126 .scalarize(TypeIdx: 0)
1127 .lower();
1128
1129 getActionDefinitionsBuilder(Opcode: G_INTRINSIC_FPTRUNC_ROUND)
1130 .customFor(Types: {S16, S32})
1131 .scalarize(TypeIdx: 0)
1132 .lower();
1133
1134 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1135 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1136 .scalarize(TypeIdx: 0)
1137 .lower();
1138
1139 if (ST.has16BitInsts()) {
1140 getActionDefinitionsBuilder(
1141 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1142 .legalFor(Types: {S16, S32, S64})
1143 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1144 .scalarize(TypeIdx: 0);
1145 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1146 getActionDefinitionsBuilder(
1147 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1148 .legalFor(Types: {S32, S64})
1149 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1150 .scalarize(TypeIdx: 0);
1151 } else {
1152 getActionDefinitionsBuilder(
1153 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1154 .legalFor(Types: {S32})
1155 .customFor(Types: {S64})
1156 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1157 .scalarize(TypeIdx: 0);
1158 }
1159
1160 getActionDefinitionsBuilder(Opcode: G_PTR_ADD)
1161 .unsupportedFor(Types: {BufferFatPtr, BufferStridedPtr, RsrcPtr})
1162 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: sameSize(TypeIdx0: 0, TypeIdx1: 1)))
1163 .scalarize(TypeIdx: 0)
1164 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0);
1165
1166 getActionDefinitionsBuilder(Opcode: G_PTRMASK)
1167 .legalIf(Predicate: all(P0: sameSize(TypeIdx0: 0, TypeIdx1: 1), P1: typeInSet(TypeIdx: 1, TypesInit: {S64, S32})))
1168 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0)
1169 .scalarize(TypeIdx: 0);
1170
1171 auto &CmpBuilder =
1172 getActionDefinitionsBuilder(Opcode: G_ICMP)
1173 // The compare output type differs based on the register bank of the output,
1174 // so make both s1 and s32 legal.
1175 //
1176 // Scalar compares producing output in scc will be promoted to s32, as that
1177 // is the allocatable register type that will be needed for the copy from
1178 // scc. This will be promoted during RegBankSelect, and we assume something
1179 // before that won't try to use s32 result types.
1180 //
1181 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1182 // bank.
1183 .legalForCartesianProduct(
1184 Types0: {S1}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1185 .legalForCartesianProduct(
1186 Types0: {S32}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1187 if (ST.has16BitInsts()) {
1188 CmpBuilder.legalFor(Types: {{S1, S16}});
1189 }
1190
1191 CmpBuilder
1192 .widenScalarToNextPow2(TypeIdx: 1)
1193 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1194 .scalarize(TypeIdx: 0)
1195 .legalIf(Predicate: all(P0: typeInSet(TypeIdx: 0, TypesInit: {S1, S32}), P1: isPointer(TypeIdx: 1)));
1196
1197 auto &FCmpBuilder =
1198 getActionDefinitionsBuilder(Opcode: G_FCMP).legalForCartesianProduct(
1199 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1200
1201 if (ST.hasSALUFloatInsts())
1202 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1203
1204 FCmpBuilder
1205 .widenScalarToNextPow2(1)
1206 .clampScalar(1, S32, S64)
1207 .scalarize(0);
1208
1209 // FIXME: fpow has a selection pattern that should move to custom lowering.
1210 auto &ExpOps = getActionDefinitionsBuilder(Opcode: G_FPOW);
1211 if (ST.has16BitInsts())
1212 ExpOps.customFor(Types: {{S32}, {S16}});
1213 else
1214 ExpOps.customFor(Types: {S32});
1215 ExpOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1216 .scalarize(TypeIdx: 0);
1217
1218 getActionDefinitionsBuilder(Opcode: G_FPOWI)
1219 .clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1220 .lower();
1221
1222 auto &Log2Ops = getActionDefinitionsBuilder(Opcodes: {G_FLOG2, G_FEXP2});
1223 Log2Ops.customFor(Types: {S32});
1224 if (ST.has16BitInsts())
1225 Log2Ops.legalFor(Types: {S16});
1226 else
1227 Log2Ops.customFor(Types: {S16});
1228 Log2Ops.scalarize(TypeIdx: 0)
1229 .lower();
1230
1231 auto &LogOps =
1232 getActionDefinitionsBuilder(Opcodes: {G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1233 LogOps.customFor(Types: {S32, S16});
1234 LogOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1235 .scalarize(TypeIdx: 0);
1236
1237 // The 64-bit versions produce 32-bit results, but only on the SALU.
1238 getActionDefinitionsBuilder(Opcode: G_CTPOP)
1239 .legalFor(Types: {{S32, S32}, {S32, S64}})
1240 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1241 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1242 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1243 .scalarize(TypeIdx: 0)
1244 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1245
1246 // If no 16-bit instruction is available, lower into different instructions.
1247 if (ST.has16BitInsts())
1248 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1249 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypes16)
1250 .widenScalarToNextPow2(TypeIdx: 1)
1251 .scalarize(TypeIdx: 0)
1252 .lower();
1253 else
1254 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1255 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypesBase)
1256 .lowerFor(Types: {S1, S16})
1257 .widenScalarToNextPow2(TypeIdx: 1)
1258 .scalarize(TypeIdx: 0)
1259 .lower();
1260
1261 // The hardware instructions return a different result on 0 than the generic
1262 // instructions expect. The hardware produces -1, but these produce the
1263 // bitwidth.
1264 getActionDefinitionsBuilder(Opcodes: {G_CTLZ, G_CTTZ})
1265 .scalarize(TypeIdx: 0)
1266 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1267 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1268 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1269 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1270 .custom();
1271
1272 // The 64-bit versions produce 32-bit results, but only on the SALU.
1273 getActionDefinitionsBuilder(Opcodes: {G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1274 .legalFor(Types: {{S32, S32}, {S32, S64}})
1275 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1276 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1277 .scalarize(TypeIdx: 0)
1278 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1279 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1280
1281 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1282 // RegBankSelect.
1283 getActionDefinitionsBuilder(Opcode: G_BITREVERSE)
1284 .legalFor(Types: {S32, S64})
1285 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1286 .scalarize(TypeIdx: 0)
1287 .widenScalarToNextPow2(TypeIdx: 0);
1288
1289 if (ST.has16BitInsts()) {
1290 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1291 .legalFor(Types: {S16, S32, V2S16})
1292 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1293 // FIXME: Fixing non-power-of-2 before clamp is a workaround for
1294 // narrowScalar limitation.
1295 .widenScalarToNextPow2(TypeIdx: 0)
1296 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S32)
1297 .scalarize(TypeIdx: 0);
1298
1299 if (ST.hasVOP3PInsts()) {
1300 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1301 .legalFor(Types: {S32, S16, V2S16})
1302 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1303 .minScalar(TypeIdx: 0, Ty: S16)
1304 .widenScalarToNextPow2(TypeIdx: 0)
1305 .scalarize(TypeIdx: 0)
1306 .lower();
1307 } else {
1308 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1309 .legalFor(Types: {S32, S16})
1310 .widenScalarToNextPow2(TypeIdx: 0)
1311 .minScalar(TypeIdx: 0, Ty: S16)
1312 .scalarize(TypeIdx: 0)
1313 .lower();
1314 }
1315 } else {
1316 // TODO: Should have same legality without v_perm_b32
1317 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1318 .legalFor(Types: {S32})
1319 .lowerIf(Predicate: scalarNarrowerThan(TypeIdx: 0, Size: 32))
1320 // FIXME: Fixing non-power-of-2 before clamp is a workaround for
1321 // narrowScalar limitation.
1322 .widenScalarToNextPow2(TypeIdx: 0)
1323 .maxScalar(TypeIdx: 0, Ty: S32)
1324 .scalarize(TypeIdx: 0)
1325 .lower();
1326
1327 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1328 .legalFor(Types: {S32})
1329 .minScalar(TypeIdx: 0, Ty: S32)
1330 .widenScalarToNextPow2(TypeIdx: 0)
1331 .scalarize(TypeIdx: 0)
1332 .lower();
1333 }
1334
1335 getActionDefinitionsBuilder(Opcode: G_INTTOPTR)
1336 // List the common cases
1337 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1338 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1339 .scalarize(TypeIdx: 0)
1340 // Accept any address space as long as the size matches
1341 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1342 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 1, TypeIdx1: 0),
1343 Mutation: [](const LegalityQuery &Query) {
1344 return std::pair(
1345 1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1346 })
1347 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 1, TypeIdx1: 0), Mutation: [](const LegalityQuery &Query) {
1348 return std::pair(1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1349 });
1350
1351 getActionDefinitionsBuilder(Opcode: G_PTRTOINT)
1352 // List the common cases
1353 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1354 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1355 .scalarize(TypeIdx: 0)
1356 // Accept any address space as long as the size matches
1357 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1358 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 0, TypeIdx1: 1),
1359 Mutation: [](const LegalityQuery &Query) {
1360 return std::pair(
1361 0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1362 })
1363 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 0, TypeIdx1: 1), Mutation: [](const LegalityQuery &Query) {
1364 return std::pair(0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1365 });
1366
1367 getActionDefinitionsBuilder(Opcode: G_ADDRSPACE_CAST)
1368 .scalarize(TypeIdx: 0)
1369 .custom();
1370
1371 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1372 bool IsLoad) -> bool {
1373 const LLT DstTy = Query.Types[0];
1374
1375 // Split vector extloads.
1376 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1377
1378 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1379 return true;
1380
1381 const LLT PtrTy = Query.Types[1];
1382 unsigned AS = PtrTy.getAddressSpace();
1383 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1384 IsAtomic: Query.MMODescrs[0].Ordering !=
1385 AtomicOrdering::NotAtomic))
1386 return true;
1387
1388 // Catch weird-sized loads that don't evenly divide into the access sizes.
1389 // TODO: May be able to widen depending on alignment etc.
1390 unsigned NumRegs = (MemSize + 31) / 32;
1391 if (NumRegs == 3) {
1392 if (!ST.hasDwordx3LoadStores())
1393 return true;
1394 } else {
1395 // If the alignment allows, these should have been widened.
1396 if (!isPowerOf2_32(Value: NumRegs))
1397 return true;
1398 }
1399
1400 return false;
1401 };
1402
1403 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1404 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1405 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1406
1407 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1408 // LDS
1409 // TODO: Unsupported flat for SI.
1410
1411 for (unsigned Op : {G_LOAD, G_STORE}) {
1412 const bool IsStore = Op == G_STORE;
1413
1414 auto &Actions = getActionDefinitionsBuilder(Opcode: Op);
1415 // Explicitly list some common cases.
1416 // TODO: Does this help compile time at all?
1417 Actions.legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S32, .Align: GlobalAlign32},
1418 {.Type0: V2S32, .Type1: GlobalPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1419 {.Type0: V4S32, .Type1: GlobalPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1420 {.Type0: S64, .Type1: GlobalPtr, .MemTy: S64, .Align: GlobalAlign32},
1421 {.Type0: V2S64, .Type1: GlobalPtr, .MemTy: V2S64, .Align: GlobalAlign32},
1422 {.Type0: V2S16, .Type1: GlobalPtr, .MemTy: V2S16, .Align: GlobalAlign32},
1423 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: GlobalAlign8},
1424 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: GlobalAlign16},
1425
1426 {.Type0: S32, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1427 {.Type0: S64, .Type1: LocalPtr, .MemTy: S64, .Align: 32},
1428 {.Type0: V2S32, .Type1: LocalPtr, .MemTy: V2S32, .Align: 32},
1429 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1430 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1431 {.Type0: V2S16, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1432
1433 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1434 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1435 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1436 {.Type0: V2S16, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1437
1438 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S32, .Align: GlobalAlign32},
1439 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1440 {.Type0: V4S32, .Type1: ConstantPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1441 {.Type0: S64, .Type1: ConstantPtr, .MemTy: S64, .Align: GlobalAlign32},
1442 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32}});
1443 Actions.legalIf(
1444 Predicate: [=](const LegalityQuery &Query) -> bool {
1445 return isLoadStoreLegal(ST, Query);
1446 });
1447
1448 // The custom pointers (fat pointers, buffer resources) don't work with load
1449 // and store at this level. Fat pointers should have been lowered to
1450 // intrinsics before the translation to MIR.
1451 Actions.unsupportedIf(
1452 Predicate: typeInSet(TypeIdx: 1, TypesInit: {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1453
1454 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1455 // ptrtoint. This is needed to account for the fact that we can't have i128
1456 // as a register class for SelectionDAG reasons.
1457 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1458 return hasBufferRsrcWorkaround(Ty: Query.Types[0]);
1459 });
1460
1461 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1462 // 64-bits.
1463 //
1464 // TODO: Should generalize bitcast action into coerce, which will also cover
1465 // inserting addrspacecasts.
1466 Actions.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1467
1468 // Turn any illegal element vectors into something easier to deal
1469 // with. These will ultimately produce 32-bit scalar shifts to extract the
1470 // parts anyway.
1471 //
1472 // For odd 16-bit element vectors, prefer to split those into pieces with
1473 // 16-bit vector parts.
1474 Actions.bitcastIf(
1475 Predicate: [=](const LegalityQuery &Query) -> bool {
1476 return shouldBitcastLoadStoreType(ST, Ty: Query.Types[0],
1477 MemTy: Query.MMODescrs[0].MemoryTy);
1478 }, Mutation: bitcastToRegisterType(TypeIdx: 0));
1479
1480 if (!IsStore) {
1481 // Widen suitably aligned loads by loading extra bytes. The standard
1482 // legalization actions can't properly express widening memory operands.
1483 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1484 return shouldWidenLoad(ST, Query, Opcode: G_LOAD);
1485 });
1486 }
1487
1488 // FIXME: load/store narrowing should be moved to lower action
1489 Actions
1490 .narrowScalarIf(
1491 Predicate: [=](const LegalityQuery &Query) -> bool {
1492 return !Query.Types[0].isVector() &&
1493 needToSplitMemOp(Query, Op == G_LOAD);
1494 },
1495 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1496 const LLT DstTy = Query.Types[0];
1497 const LLT PtrTy = Query.Types[1];
1498
1499 const unsigned DstSize = DstTy.getSizeInBits();
1500 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1501
1502 // Split extloads.
1503 if (DstSize > MemSize)
1504 return std::pair(0, LLT::scalar(SizeInBits: MemSize));
1505
1506 unsigned MaxSize = maxSizeForAddrSpace(
1507 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1508 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1509 if (MemSize > MaxSize)
1510 return std::pair(0, LLT::scalar(SizeInBits: MaxSize));
1511
1512 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1513 return std::pair(0, LLT::scalar(SizeInBits: Align));
1514 })
1515 .fewerElementsIf(
1516 Predicate: [=](const LegalityQuery &Query) -> bool {
1517 return Query.Types[0].isVector() &&
1518 needToSplitMemOp(Query, Op == G_LOAD);
1519 },
1520 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1521 const LLT DstTy = Query.Types[0];
1522 const LLT PtrTy = Query.Types[1];
1523
1524 LLT EltTy = DstTy.getElementType();
1525 unsigned MaxSize = maxSizeForAddrSpace(
1526 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1527 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1528
1529 // FIXME: Handle widened to power of 2 results better. This ends
1530 // up scalarizing.
1531 // FIXME: 3 element stores scalarized on SI
1532
1533 // Split if it's too large for the address space.
1534 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1535 if (MemSize > MaxSize) {
1536 unsigned NumElts = DstTy.getNumElements();
1537 unsigned EltSize = EltTy.getSizeInBits();
1538
1539 if (MaxSize % EltSize == 0) {
1540 return std::pair(
1541 0, LLT::scalarOrVector(
1542 EC: ElementCount::getFixed(MinVal: MaxSize / EltSize), ScalarTy: EltTy));
1543 }
1544
1545 unsigned NumPieces = MemSize / MaxSize;
1546
1547 // FIXME: Refine when odd breakdowns handled
1548 // The scalars will need to be re-legalized.
1549 if (NumPieces == 1 || NumPieces >= NumElts ||
1550 NumElts % NumPieces != 0)
1551 return std::pair(0, EltTy);
1552
1553 return std::pair(0,
1554 LLT::fixed_vector(NumElements: NumElts / NumPieces, ScalarTy: EltTy));
1555 }
1556
1557 // FIXME: We could probably handle weird extending loads better.
1558 if (DstTy.getSizeInBits() > MemSize)
1559 return std::pair(0, EltTy);
1560
1561 unsigned EltSize = EltTy.getSizeInBits();
1562 unsigned DstSize = DstTy.getSizeInBits();
1563 if (!isPowerOf2_32(Value: DstSize)) {
1564 // We're probably decomposing an odd sized store. Try to split
1565 // to the widest type. TODO: Account for alignment. As-is it
1566 // should be OK, since the new parts will be further legalized.
1567 unsigned FloorSize = llvm::bit_floor(Value: DstSize);
1568 return std::pair(
1569 0, LLT::scalarOrVector(
1570 EC: ElementCount::getFixed(MinVal: FloorSize / EltSize), ScalarTy: EltTy));
1571 }
1572
1573 // May need relegalization for the scalars.
1574 return std::pair(0, EltTy);
1575 })
1576 .minScalar(TypeIdx: 0, Ty: S32)
1577 .narrowScalarIf(Predicate: isWideScalarExtLoadTruncStore(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: S32))
1578 .widenScalarToNextPow2(TypeIdx: 0)
1579 .moreElementsIf(Predicate: vectorSmallerThan(TypeIdx: 0, Size: 32), Mutation: moreEltsToNext32Bit(TypeIdx: 0))
1580 .lower();
1581 }
1582
1583 // FIXME: Unaligned accesses not lowered.
1584 auto &ExtLoads = getActionDefinitionsBuilder(Opcodes: {G_SEXTLOAD, G_ZEXTLOAD})
1585 .legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: 8},
1586 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: 2 * 8},
1587 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1588 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1589 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1590 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1591 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S8, .Align: 8},
1592 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S16, .Align: 2 * 8}})
1593 .legalIf(
1594 Predicate: [=](const LegalityQuery &Query) -> bool {
1595 return isLoadStoreLegal(ST, Query);
1596 });
1597
1598 if (ST.hasFlatAddressSpace()) {
1599 ExtLoads.legalForTypesWithMemDesc(
1600 TypesAndMemDesc: {{.Type0: S32, .Type1: FlatPtr, .MemTy: S8, .Align: 8}, {.Type0: S32, .Type1: FlatPtr, .MemTy: S16, .Align: 16}});
1601 }
1602
1603 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1604 // 64-bits.
1605 //
1606 // TODO: Should generalize bitcast action into coerce, which will also cover
1607 // inserting addrspacecasts.
1608 ExtLoads.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1609
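// Anything else is clamped to a 32-bit result and lowered; for an extending
// load the lowering is expected (roughly) to be a plain load of the memory
// type followed by the matching sign/zero extension, which is then
// re-legalized.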
1610 ExtLoads.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1611 .widenScalarToNextPow2(TypeIdx: 0)
1612 .lower();
1613
1614 auto &Atomics = getActionDefinitionsBuilder(
1615 Opcodes: {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1616 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1617 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1618 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1619 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr},
1620 {S64, GlobalPtr}, {S64, LocalPtr},
1621 {S32, RegionPtr}, {S64, RegionPtr}});
1622 if (ST.hasFlatAddressSpace()) {
1623 Atomics.legalFor(Types: {{S32, FlatPtr}, {S64, FlatPtr}});
1624 }
1625
1626 auto &Atomic = getActionDefinitionsBuilder(Opcode: G_ATOMICRMW_FADD);
1627 if (ST.hasLDSFPAtomicAddF32()) {
1628 Atomic.legalFor(Types: {{S32, LocalPtr}, {S32, RegionPtr}});
1629 if (ST.hasLdsAtomicAddF64())
1630 Atomic.legalFor(Types: {{S64, LocalPtr}});
1631 if (ST.hasAtomicDsPkAdd16Insts())
1632 Atomic.legalFor(Types: {{V2S16, LocalPtr}});
1633 }
1634 if (ST.hasAtomicFaddInsts())
1635 Atomic.legalFor(Types: {{S32, GlobalPtr}});
1636 if (ST.hasFlatAtomicFaddF32Inst())
1637 Atomic.legalFor(Types: {{S32, FlatPtr}});
1638
1639 if (ST.hasGFX90AInsts()) {
1640 // These are legal with some caveats, and should have undergone expansion in
1641 // the IR in most situations
1642 // TODO: Move atomic expansion into legalizer
1643 Atomic.legalFor(Types: {
1644 {S32, GlobalPtr},
1645 {S64, GlobalPtr},
1646 {S64, FlatPtr}
1647 });
1648 }
1649
1650 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
1651 // output demarshalling.
1652 getActionDefinitionsBuilder(Opcode: G_ATOMIC_CMPXCHG)
1653 .customFor(Types: {{S32, GlobalPtr}, {S64, GlobalPtr},
1654 {S32, FlatPtr}, {S64, FlatPtr}})
1655 .legalFor(Types: {{S32, LocalPtr}, {S64, LocalPtr},
1656 {S32, RegionPtr}, {S64, RegionPtr}});
1657 // TODO: Pointer types, any 32-bit or 64-bit vector
1658
1659 // Condition should be s32 for scalar, s1 for vector.
1660 getActionDefinitionsBuilder(Opcode: G_SELECT)
1661 .legalForCartesianProduct(Types0: {S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1662 LocalPtr, FlatPtr, PrivatePtr,
1663 LLT::fixed_vector(NumElements: 2, ScalarTy: LocalPtr),
1664 LLT::fixed_vector(NumElements: 2, ScalarTy: PrivatePtr)},
1665 Types1: {S1, S32})
1666 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1667 .scalarize(TypeIdx: 1)
1668 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1669 .fewerElementsIf(Predicate: numElementsNotEven(TypeIdx: 0), Mutation: scalarize(TypeIdx: 0))
1670 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 2)
1671 .clampMaxNumElements(TypeIdx: 0, EltTy: LocalPtr, MaxElements: 2)
1672 .clampMaxNumElements(TypeIdx: 0, EltTy: PrivatePtr, MaxElements: 2)
1673 .scalarize(TypeIdx: 0)
1674 .widenScalarToNextPow2(TypeIdx: 0)
1675 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: typeInSet(TypeIdx: 1, TypesInit: {S1, S32})));
1676
1677 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1678 // be more flexible with the shift amount type.
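// (For example, a 32-bit shift only observes the low 5 bits of the amount, so
// shifting by 33 behaves like shifting by 1; a 16-bit amount type would
// therefore be wide enough for every value type.)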
1679 auto &Shifts = getActionDefinitionsBuilder(Opcodes: {G_SHL, G_LSHR, G_ASHR})
1680 .legalFor(Types: {{S32, S32}, {S64, S32}});
1681 if (ST.has16BitInsts()) {
1682 if (ST.hasVOP3PInsts()) {
1683 Shifts.legalFor(Types: {{S16, S16}, {V2S16, V2S16}})
1684 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1685 } else
1686 Shifts.legalFor(Types: {{S16, S16}});
1687
1688 // TODO: Support 16-bit shift amounts for all types
1689 Shifts.widenScalarIf(
1690 Predicate: [=](const LegalityQuery &Query) {
1691 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1692 // 32-bit amount.
1693 const LLT ValTy = Query.Types[0];
1694 const LLT AmountTy = Query.Types[1];
1695 return ValTy.getSizeInBits() <= 16 &&
1696 AmountTy.getSizeInBits() < 16;
1697 }, Mutation: changeTo(TypeIdx: 1, Ty: S16));
1698 Shifts.maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16);
1699 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1700 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 16);
1701 Shifts.clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1702
1703 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1704 .minScalar(TypeIdx: 0, Ty: S16)
1705 .scalarize(TypeIdx: 0)
1706 .lower();
1707 } else {
1708 // Make sure we legalize the shift amount type first, as the general
1709 // expansion for the shifted type will produce much worse code if it hasn't
1710 // been truncated already.
1711 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1712 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1713 Shifts.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1714
1715 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1716 .minScalar(TypeIdx: 0, Ty: S32)
1717 .scalarize(TypeIdx: 0)
1718 .lower();
1719 }
1720 Shifts.scalarize(TypeIdx: 0);
1721
1722 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1723 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1724 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1725 unsigned IdxTypeIdx = 2;
1726
1727 getActionDefinitionsBuilder(Opcode: Op)
1728 .customIf(Predicate: [=](const LegalityQuery &Query) {
1729 const LLT EltTy = Query.Types[EltTypeIdx];
1730 const LLT VecTy = Query.Types[VecTypeIdx];
1731 const LLT IdxTy = Query.Types[IdxTypeIdx];
1732 const unsigned EltSize = EltTy.getSizeInBits();
1733 const bool isLegalVecType =
1734 !!SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: VecTy.getSizeInBits());
1735 // Address space 8 pointers are 128-bit wide values, but the logic
1736 // below will try to bitcast them to 2N x s64, which will fail.
1737 // Therefore, as an intermediate step, wrap the extract/insert by
1738 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1739 // extraction result) in order to produce a vector operation that can
1740 // be handled by the logic below.
1741 if (EltTy.isPointer() && EltSize > 64)
1742 return true;
1743 return (EltSize == 32 || EltSize == 64) &&
1744 VecTy.getSizeInBits() % 32 == 0 &&
1745 VecTy.getSizeInBits() <= MaxRegisterSize &&
1746 IdxTy.getSizeInBits() == 32 &&
1747 isLegalVecType;
1748 })
1749 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx), P1: scalarOrEltNarrowerThan(TypeIdx: VecTypeIdx, Size: 32)),
1750 Mutation: bitcastToVectorElement32(TypeIdx: VecTypeIdx))
1751 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1752 .bitcastIf(
1753 Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx), P1: scalarOrEltWiderThan(TypeIdx: VecTypeIdx, Size: 64)),
1754 Mutation: [=](const LegalityQuery &Query) {
1755 // For > 64-bit element types, try to turn this into a 64-bit
1756 // element vector since we may be able to do better indexing
1757 // if this is scalar. If not, fall back to 32.
1758 const LLT EltTy = Query.Types[EltTypeIdx];
1759 const LLT VecTy = Query.Types[VecTypeIdx];
1760 const unsigned DstEltSize = EltTy.getSizeInBits();
1761 const unsigned VecSize = VecTy.getSizeInBits();
1762
1763 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1764 return std::pair(
1765 VecTypeIdx,
1766 LLT::fixed_vector(NumElements: VecSize / TargetEltSize, ScalarSizeInBits: TargetEltSize));
1767 })
1768 .clampScalar(TypeIdx: EltTypeIdx, MinTy: S32, MaxTy: S64)
1769 .clampScalar(TypeIdx: VecTypeIdx, MinTy: S32, MaxTy: S64)
1770 .clampScalar(TypeIdx: IdxTypeIdx, MinTy: S32, MaxTy: S32)
1771 .clampMaxNumElements(TypeIdx: VecTypeIdx, EltTy: S32, MaxElements: 32)
1772 // TODO: Clamp elements for 64-bit vectors?
1773 .moreElementsIf(
1774 Predicate: isIllegalRegisterType(TypeIdx: VecTypeIdx),
1775 Mutation: moreElementsToNextExistingRegClass(TypeIdx: VecTypeIdx))
1776 // It should only be necessary with variable indexes.
1777 // As a last resort, lower to the stack
1778 .lower();
1779 }
1780
1781 getActionDefinitionsBuilder(Opcode: G_EXTRACT_VECTOR_ELT)
1782 .unsupportedIf(Predicate: [=](const LegalityQuery &Query) {
1783 const LLT &EltTy = Query.Types[1].getElementType();
1784 return Query.Types[0] != EltTy;
1785 });
1786
1787 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1788 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1789 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1790
1791 // FIXME: Doesn't handle extract of illegal sizes.
1792 getActionDefinitionsBuilder(Opcode: Op)
1793 .lowerIf(Predicate: all(P0: typeIs(TypeIdx: LitTyIdx, TypesInit: S16), P1: sizeIs(TypeIdx: BigTyIdx, Size: 32)))
1794 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
1795 // Sub-vector (or single element) insert and extract.
1796 // TODO: verify immediate offset here since lower only works with
1797 // whole elements.
1798 const LLT BigTy = Query.Types[BigTyIdx];
1799 return BigTy.isVector();
1800 })
1801 // FIXME: Multiples of 16 should not be legal.
1802 .legalIf(Predicate: [=](const LegalityQuery &Query) {
1803 const LLT BigTy = Query.Types[BigTyIdx];
1804 const LLT LitTy = Query.Types[LitTyIdx];
1805 return (BigTy.getSizeInBits() % 32 == 0) &&
1806 (LitTy.getSizeInBits() % 16 == 0);
1807 })
1808 .widenScalarIf(
1809 Predicate: [=](const LegalityQuery &Query) {
1810 const LLT BigTy = Query.Types[BigTyIdx];
1811 return (BigTy.getScalarSizeInBits() < 16);
1812 },
1813 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: BigTyIdx, Min: 16))
1814 .widenScalarIf(
1815 Predicate: [=](const LegalityQuery &Query) {
1816 const LLT LitTy = Query.Types[LitTyIdx];
1817 return (LitTy.getScalarSizeInBits() < 16);
1818 },
1819 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: LitTyIdx, Min: 16))
1820 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1821 .widenScalarToNextPow2(TypeIdx: BigTyIdx, MinSize: 32);
1822
1823 }
1824
1825 auto &BuildVector = getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR)
1826 .legalForCartesianProduct(Types0: AllS32Vectors, Types1: {S32})
1827 .legalForCartesianProduct(Types0: AllS64Vectors, Types1: {S64})
1828 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
1829 .clampNumElements(TypeIdx: 0, MinTy: V2S64, MaxTy: V16S64)
1830 .fewerElementsIf(Predicate: isWideVec16(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: V2S16))
1831 .moreElementsIf(
1832 Predicate: isIllegalRegisterType(TypeIdx: 0),
1833 Mutation: moreElementsToNextExistingRegClass(TypeIdx: 0));
1834
1835 if (ST.hasScalarPackInsts()) {
1836 BuildVector
1837 // FIXME: Should probably widen s1 vectors straight to s32
1838 .minScalarOrElt(TypeIdx: 0, Ty: S16)
1839 .minScalar(TypeIdx: 1, Ty: S16);
1840
1841 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1842 .legalFor(Types: {V2S16, S32})
1843 .lower();
1844 } else {
1845 BuildVector.customFor(Types: {V2S16, S16});
1846 BuildVector.minScalarOrElt(TypeIdx: 0, Ty: S32);
1847
1848 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1849 .customFor(Types: {V2S16, S32})
1850 .lower();
1851 }
1852
1853 BuildVector.legalIf(Predicate: isRegisterType(TypeIdx: 0));
1854
1855 // FIXME: Clamp maximum size
1856 getActionDefinitionsBuilder(Opcode: G_CONCAT_VECTORS)
1857 .legalIf(Predicate: all(P0: isRegisterType(TypeIdx: 0), P1: isRegisterType(TypeIdx: 1)))
1858 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 32)
1859 .clampMaxNumElements(TypeIdx: 1, EltTy: S16, MaxElements: 2) // TODO: Make 4?
1860 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 64);
1861
1862 getActionDefinitionsBuilder(Opcode: G_SHUFFLE_VECTOR).lower();
1863
1864 // Merge/Unmerge
1865 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1866 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1867 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1868
1869 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1870 const LLT Ty = Query.Types[TypeIdx];
1871 if (Ty.isVector()) {
1872 const LLT &EltTy = Ty.getElementType();
1873 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1874 return true;
1875 if (!llvm::has_single_bit<uint32_t>(Value: EltTy.getSizeInBits()))
1876 return true;
1877 }
1878 return false;
1879 };
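// (That is, vector element types must have power-of-2 sizes between 8 and 512
// bits; anything else is scalarized by the rules below.)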
1880
1881 auto &Builder = getActionDefinitionsBuilder(Opcode: Op)
1882 .legalIf(Predicate: all(P0: isRegisterType(TypeIdx: 0), P1: isRegisterType(TypeIdx: 1)))
1883 .lowerFor(Types: {{S16, V2S16}})
1884 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
1885 const LLT BigTy = Query.Types[BigTyIdx];
1886 return BigTy.getSizeInBits() == 32;
1887 })
1888 // Try to widen to s16 first for small types.
1889 // TODO: Only do this on targets with legal s16 shifts
1890 .minScalarOrEltIf(Predicate: scalarNarrowerThan(TypeIdx: LitTyIdx, Size: 16), TypeIdx: LitTyIdx, Ty: S16)
1891 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 16)
1892 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1893 .fewerElementsIf(Predicate: all(P0: typeIs(TypeIdx: 0, TypesInit: S16), P1: vectorWiderThan(TypeIdx: 1, Size: 32),
1894 args: elementTypeIs(TypeIdx: 1, EltTy: S16)),
1895 Mutation: changeTo(TypeIdx: 1, Ty: V2S16))
1896 // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1897 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1898 // valid.
1899 .clampScalar(TypeIdx: LitTyIdx, MinTy: S32, MaxTy: S512)
1900 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 32)
1901 // Break up vectors with weird elements into scalars
1902 .fewerElementsIf(
1903 Predicate: [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1904 Mutation: scalarize(TypeIdx: 0))
1905 .fewerElementsIf(
1906 Predicate: [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1907 Mutation: scalarize(TypeIdx: 1))
1908 .clampScalar(TypeIdx: BigTyIdx, MinTy: S32, MaxTy: MaxScalar);
1909
1910 if (Op == G_MERGE_VALUES) {
1911 Builder.widenScalarIf(
1912 // TODO: Use 16-bit shifts if legal for 8-bit values?
1913 Predicate: [=](const LegalityQuery &Query) {
1914 const LLT Ty = Query.Types[LitTyIdx];
1915 return Ty.getSizeInBits() < 32;
1916 },
1917 Mutation: changeTo(TypeIdx: LitTyIdx, Ty: S32));
1918 }
1919
1920 Builder.widenScalarIf(
1921 Predicate: [=](const LegalityQuery &Query) {
1922 const LLT Ty = Query.Types[BigTyIdx];
1923 return Ty.getSizeInBits() % 16 != 0;
1924 },
1925 Mutation: [=](const LegalityQuery &Query) {
1926 // Pick the next power of 2, or a multiple of 64 once past 128,
1927 // whichever is smaller.
1928 const LLT &Ty = Query.Types[BigTyIdx];
1929 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Value: Ty.getSizeInBits() + 1);
1930 if (NewSizeInBits >= 256) {
1931 unsigned RoundedTo = alignTo<64>(Value: Ty.getSizeInBits() + 1);
1932 if (RoundedTo < NewSizeInBits)
1933 NewSizeInBits = RoundedTo;
1934 }
1935 return std::pair(BigTyIdx, LLT::scalar(SizeInBits: NewSizeInBits));
1936 })
1937 // Any vectors left are the wrong size. Scalarize them.
1938 .scalarize(TypeIdx: 0)
1939 .scalarize(TypeIdx: 1);
1940 }
1941
1942 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1943 // RegBankSelect.
1944 auto &SextInReg = getActionDefinitionsBuilder(Opcode: G_SEXT_INREG)
1945 .legalFor(Types: {{S32}, {S64}});
1946
1947 if (ST.hasVOP3PInsts()) {
1948 SextInReg.lowerFor(Types: {{V2S16}})
1949 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1950 // get more vector shift opportunities, since we'll get those when
1951 // expanded.
1952 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
1953 } else if (ST.has16BitInsts()) {
1954 SextInReg.lowerFor(Types: {{S32}, {S64}, {S16}});
1955 } else {
1956 // Prefer to promote to s32 before lowering if we don't have 16-bit
1957 // shifts. This avoids a lot of intermediate truncate and extend operations.
1958 SextInReg.lowerFor(Types: {{S32}, {S64}});
1959 }
1960
1961 SextInReg
1962 .scalarize(TypeIdx: 0)
1963 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1964 .lower();
1965
1966 getActionDefinitionsBuilder(Opcodes: {G_ROTR, G_ROTL})
1967 .scalarize(TypeIdx: 0)
1968 .lower();
1969
1970 // TODO: Only try to form v2s16 with legal packed instructions.
1971 getActionDefinitionsBuilder(Opcode: G_FSHR)
1972 .legalFor(Types: {{S32, S32}})
1973 .lowerFor(Types: {{V2S16, V2S16}})
1974 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1975 .scalarize(TypeIdx: 0)
1976 .lower();
1977
1978 if (ST.hasVOP3PInsts()) {
1979 getActionDefinitionsBuilder(Opcode: G_FSHL)
1980 .lowerFor(Types: {{V2S16, V2S16}})
1981 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1982 .scalarize(TypeIdx: 0)
1983 .lower();
1984 } else {
1985 getActionDefinitionsBuilder(Opcode: G_FSHL)
1986 .scalarize(TypeIdx: 0)
1987 .lower();
1988 }
1989
1990 getActionDefinitionsBuilder(Opcode: G_READCYCLECOUNTER)
1991 .legalFor(Types: {S64});
1992
1993 getActionDefinitionsBuilder(Opcode: G_READSTEADYCOUNTER).legalFor(Types: {S64});
1994
1995 getActionDefinitionsBuilder(Opcode: G_FENCE)
1996 .alwaysLegal();
1997
1998 getActionDefinitionsBuilder(Opcodes: {G_SMULO, G_UMULO})
1999 .scalarize(TypeIdx: 0)
2000 .minScalar(TypeIdx: 0, Ty: S32)
2001 .lower();
2002
2003 getActionDefinitionsBuilder(Opcodes: {G_SBFX, G_UBFX})
2004 .legalFor(Types: {{S32, S32}, {S64, S32}})
2005 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
2006 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2007 .widenScalarToNextPow2(TypeIdx: 0)
2008 .scalarize(TypeIdx: 0);
2009
2010 getActionDefinitionsBuilder(
2011 Opcodes: {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2012 G_FCOPYSIGN,
2013
2014 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2015 G_READ_REGISTER, G_WRITE_REGISTER,
2016
2017 G_SADDO, G_SSUBO})
2018 .lower();
2019
2020 if (ST.hasIEEEMinMax()) {
2021 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2022 .legalFor(Types: FPTypesPK16)
2023 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
2024 .scalarize(TypeIdx: 0);
2025 } else {
2026 // TODO: Implement
2027 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM}).lower();
2028 }
2029
2030 getActionDefinitionsBuilder(Opcodes: {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2031 .lower();
2032
2033 getActionDefinitionsBuilder(Opcodes: {G_TRAP, G_DEBUGTRAP}).custom();
2034
2035 getActionDefinitionsBuilder(Opcodes: {G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2036 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2037 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2038 .unsupported();
2039
2040 getActionDefinitionsBuilder(Opcode: G_PREFETCH).alwaysLegal();
2041
2042 getLegacyLegalizerInfo().computeTables();
2043 verify(*ST.getInstrInfo());
2044}
2045
2046bool AMDGPULegalizerInfo::legalizeCustom(
2047 LegalizerHelper &Helper, MachineInstr &MI,
2048 LostDebugLocObserver &LocObserver) const {
2049 MachineIRBuilder &B = Helper.MIRBuilder;
2050 MachineRegisterInfo &MRI = *B.getMRI();
2051
2052 switch (MI.getOpcode()) {
2053 case TargetOpcode::G_ADDRSPACE_CAST:
2054 return legalizeAddrSpaceCast(MI, MRI, B);
2055 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2056 return legalizeFroundeven(MI, MRI, B);
2057 case TargetOpcode::G_FCEIL:
2058 return legalizeFceil(MI, MRI, B);
2059 case TargetOpcode::G_FREM:
2060 return legalizeFrem(MI, MRI, B);
2061 case TargetOpcode::G_INTRINSIC_TRUNC:
2062 return legalizeIntrinsicTrunc(MI, MRI, B);
2063 case TargetOpcode::G_SITOFP:
2064 return legalizeITOFP(MI, MRI, B, Signed: true);
2065 case TargetOpcode::G_UITOFP:
2066 return legalizeITOFP(MI, MRI, B, Signed: false);
2067 case TargetOpcode::G_FPTOSI:
2068 return legalizeFPTOI(MI, MRI, B, Signed: true);
2069 case TargetOpcode::G_FPTOUI:
2070 return legalizeFPTOI(MI, MRI, B, Signed: false);
2071 case TargetOpcode::G_FMINNUM:
2072 case TargetOpcode::G_FMAXNUM:
2073 case TargetOpcode::G_FMINNUM_IEEE:
2074 case TargetOpcode::G_FMAXNUM_IEEE:
2075 return legalizeMinNumMaxNum(Helper, MI);
2076 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2077 return legalizeExtractVectorElt(MI, MRI, B);
2078 case TargetOpcode::G_INSERT_VECTOR_ELT:
2079 return legalizeInsertVectorElt(MI, MRI, B);
2080 case TargetOpcode::G_FSIN:
2081 case TargetOpcode::G_FCOS:
2082 return legalizeSinCos(MI, MRI, B);
2083 case TargetOpcode::G_GLOBAL_VALUE:
2084 return legalizeGlobalValue(MI, MRI, B);
2085 case TargetOpcode::G_LOAD:
2086 case TargetOpcode::G_SEXTLOAD:
2087 case TargetOpcode::G_ZEXTLOAD:
2088 return legalizeLoad(Helper, MI);
2089 case TargetOpcode::G_STORE:
2090 return legalizeStore(Helper, MI);
2091 case TargetOpcode::G_FMAD:
2092 return legalizeFMad(MI, MRI, B);
2093 case TargetOpcode::G_FDIV:
2094 return legalizeFDIV(MI, MRI, B);
2095 case TargetOpcode::G_FFREXP:
2096 return legalizeFFREXP(MI, MRI, B);
2097 case TargetOpcode::G_FSQRT:
2098 return legalizeFSQRT(MI, MRI, B);
2099 case TargetOpcode::G_UDIV:
2100 case TargetOpcode::G_UREM:
2101 case TargetOpcode::G_UDIVREM:
2102 return legalizeUnsignedDIV_REM(MI, MRI, B);
2103 case TargetOpcode::G_SDIV:
2104 case TargetOpcode::G_SREM:
2105 case TargetOpcode::G_SDIVREM:
2106 return legalizeSignedDIV_REM(MI, MRI, B);
2107 case TargetOpcode::G_ATOMIC_CMPXCHG:
2108 return legalizeAtomicCmpXChg(MI, MRI, B);
2109 case TargetOpcode::G_FLOG2:
2110 return legalizeFlog2(MI, B);
2111 case TargetOpcode::G_FLOG:
2112 case TargetOpcode::G_FLOG10:
2113 return legalizeFlogCommon(MI, B);
2114 case TargetOpcode::G_FEXP2:
2115 return legalizeFExp2(MI, B);
2116 case TargetOpcode::G_FEXP:
2117 case TargetOpcode::G_FEXP10:
2118 return legalizeFExp(MI, B);
2119 case TargetOpcode::G_FPOW:
2120 return legalizeFPow(MI, B);
2121 case TargetOpcode::G_FFLOOR:
2122 return legalizeFFloor(MI, MRI, B);
2123 case TargetOpcode::G_BUILD_VECTOR:
2124 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2125 return legalizeBuildVector(MI, MRI, B);
2126 case TargetOpcode::G_MUL:
2127 return legalizeMul(Helper, MI);
2128 case TargetOpcode::G_CTLZ:
2129 case TargetOpcode::G_CTTZ:
2130 return legalizeCTLZ_CTTZ(MI, MRI, B);
2131 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2132 return legalizeFPTruncRound(MI, B);
2133 case TargetOpcode::G_STACKSAVE:
2134 return legalizeStackSave(MI, B);
2135 case TargetOpcode::G_GET_FPENV:
2136 return legalizeGetFPEnv(MI, MRI, B);
2137 case TargetOpcode::G_SET_FPENV:
2138 return legalizeSetFPEnv(MI, MRI, B);
2139 case TargetOpcode::G_TRAP:
2140 return legalizeTrap(MI, MRI, B);
2141 case TargetOpcode::G_DEBUGTRAP:
2142 return legalizeDebugTrap(MI, MRI, B);
2143 default:
2144 return false;
2145 }
2146
2147 llvm_unreachable("expected switch to return");
2148}
2149
2150Register AMDGPULegalizerInfo::getSegmentAperture(
2151 unsigned AS,
2152 MachineRegisterInfo &MRI,
2153 MachineIRBuilder &B) const {
2154 MachineFunction &MF = B.getMF();
2155 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2156 const LLT S32 = LLT::scalar(SizeInBits: 32);
2157 const LLT S64 = LLT::scalar(SizeInBits: 64);
2158
2159 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2160
2161 if (ST.hasApertureRegs()) {
2162 // Note: this register is somewhat broken. When used as a 32-bit operand,
2163 // it only returns zeroes. The real value is in the upper 32 bits.
2164 // Thus, we must emit an extract of the high 32 bits.
2165 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2166 ? AMDGPU::SRC_SHARED_BASE
2167 : AMDGPU::SRC_PRIVATE_BASE;
2168 // FIXME: It would be more natural to emit a COPY here, but then copy
2169 // coalescing would kick in and it would think it's okay to use the "HI"
2170 // subregister (instead of extracting the HI 32 bits) which is an artificial
2171 // (unusable) register.
2172 // Register TableGen definitions would need an overhaul to get rid of the
2173 // artificial "HI" aperture registers and prevent this kind of issue from
2174 // happening.
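// Hence the sequence below: materialize the full 64-bit aperture register with
// s_mov_b64 into a fresh SReg_64, then unmerge it and keep only the high
// 32-bit half.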
2175 Register Dst = MRI.createGenericVirtualRegister(Ty: S64);
2176 MRI.setRegClass(Reg: Dst, RC: &AMDGPU::SReg_64RegClass);
2177 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2178 return B.buildUnmerge(Res: S32, Op: Dst).getReg(Idx: 1);
2179 }
2180
2181 // TODO: can we be smarter about machine pointer info?
2182 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2183 Register LoadAddr = MRI.createGenericVirtualRegister(
2184 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2185 // For code object version 5, private_base and shared_base are passed through
2186 // implicit kernargs.
2187 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
2188 AMDGPU::AMDHSA_COV5) {
2189 AMDGPUTargetLowering::ImplicitParameter Param =
2190 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2191 : AMDGPUTargetLowering::PRIVATE_BASE;
2192 uint64_t Offset =
2193 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
2194
2195 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2196 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2197
2198 if (!loadInputValue(DstReg: KernargPtrReg, B,
2199 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2200 return Register();
2201
2202 MachineMemOperand *MMO = MF.getMachineMemOperand(
2203 PtrInfo,
2204 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2205 MachineMemOperand::MOInvariant,
2206 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset));
2207
2208 // Pointer address
2209 B.buildPtrAdd(Res: LoadAddr, Op0: KernargPtrReg,
2210 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
2211 // Load address
2212 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2213 }
2214
2215 Register QueuePtr = MRI.createGenericVirtualRegister(
2216 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2217
2218 if (!loadInputValue(DstReg: QueuePtr, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
2219 return Register();
2220
2221 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2222 // private_segment_aperture_base_hi.
2223 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2224
2225 MachineMemOperand *MMO = MF.getMachineMemOperand(
2226 PtrInfo,
2227 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2228 MachineMemOperand::MOInvariant,
2229 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset: StructOffset));
2230
2231 B.buildPtrAdd(Res: LoadAddr, Op0: QueuePtr,
2232 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: StructOffset).getReg(Idx: 0));
2233 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2234}
2235
2236/// Return true if the value is a known valid address, such that a null check is
2237/// not necessary.
2238static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2239 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2240 MachineInstr *Def = MRI.getVRegDef(Reg: Val);
2241 switch (Def->getOpcode()) {
2242 case AMDGPU::G_FRAME_INDEX:
2243 case AMDGPU::G_GLOBAL_VALUE:
2244 case AMDGPU::G_BLOCK_ADDR:
2245 return true;
2246 case AMDGPU::G_CONSTANT: {
2247 const ConstantInt *CI = Def->getOperand(i: 1).getCImm();
2248 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2249 }
2250 default:
2251 return false;
2252 }
2253
2254 return false;
2255}
2256
2257bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2258 MachineInstr &MI, MachineRegisterInfo &MRI,
2259 MachineIRBuilder &B) const {
2260 MachineFunction &MF = B.getMF();
2261
2262 // MI can either be a G_ADDRSPACE_CAST or a
2263 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2264 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2265 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2266 Intrinsic::amdgcn_addrspacecast_nonnull));
2267
2268 const LLT S32 = LLT::scalar(SizeInBits: 32);
2269 Register Dst = MI.getOperand(i: 0).getReg();
2270 Register Src = isa<GIntrinsic>(Val: MI) ? MI.getOperand(i: 2).getReg()
2271 : MI.getOperand(i: 1).getReg();
2272 LLT DstTy = MRI.getType(Reg: Dst);
2273 LLT SrcTy = MRI.getType(Reg: Src);
2274 unsigned DestAS = DstTy.getAddressSpace();
2275 unsigned SrcAS = SrcTy.getAddressSpace();
2276
2277 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2278 // vector element.
2279 assert(!DstTy.isVector());
2280
2281 const AMDGPUTargetMachine &TM
2282 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2283
2284 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2285 MI.setDesc(B.getTII().get(Opcode: TargetOpcode::G_BITCAST));
2286 return true;
2287 }
2288
2289 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2290 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2291 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2292 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2293 // G_ADDRSPACE_CAST we need to guess.
2294 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2295 // Extract low 32-bits of the pointer.
2296 B.buildExtract(Res: Dst, Src, Index: 0);
2297 MI.eraseFromParent();
2298 return true;
2299 }
2300
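// Otherwise, guard against the flat null pointer: take the low 32 bits of the
// source, but select the destination segment's null value whenever the source
// compares equal to the flat null pointer.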
2301 unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);
2302
2303 auto SegmentNull = B.buildConstant(Res: DstTy, Val: NullVal);
2304 auto FlatNull = B.buildConstant(Res: SrcTy, Val: 0);
2305
2306 // Extract low 32-bits of the pointer.
2307 auto PtrLo32 = B.buildExtract(Res: DstTy, Src, Index: 0);
2308
2309 auto CmpRes =
2310 B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: FlatNull.getReg(Idx: 0));
2311 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: PtrLo32, Op1: SegmentNull.getReg(Idx: 0));
2312
2313 MI.eraseFromParent();
2314 return true;
2315 }
2316
2317 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2318 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2319 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2320 Register ApertureReg = getSegmentAperture(AS: SrcAS, MRI, B);
2321 if (!ApertureReg.isValid())
2322 return false;
2323
2324 // Coerce the type of the low half of the result so we can use merge_values.
2325 Register SrcAsInt = B.buildPtrToInt(Dst: S32, Src).getReg(Idx: 0);
2326
2327 // TODO: Should we allow mismatched types but matching sizes in merges to
2328 // avoid the ptrtoint?
2329 auto BuildPtr = B.buildMergeLikeInstr(Res: DstTy, Ops: {SrcAsInt, ApertureReg});
2330
2331 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2332 // G_ADDRSPACE_CAST we need to guess.
2333 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2334 B.buildCopy(Res: Dst, Op: BuildPtr);
2335 MI.eraseFromParent();
2336 return true;
2337 }
2338
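// Otherwise, compare the source against the segment's null value and select
// between the constructed flat pointer and the flat null pointer.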
2339 auto SegmentNull = B.buildConstant(Res: SrcTy, Val: TM.getNullPointerValue(AddrSpace: SrcAS));
2340 auto FlatNull = B.buildConstant(Res: DstTy, Val: TM.getNullPointerValue(AddrSpace: DestAS));
2341
2342 auto CmpRes = B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
2343 Op1: SegmentNull.getReg(Idx: 0));
2344
2345 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: BuildPtr, Op1: FlatNull);
2346
2347 MI.eraseFromParent();
2348 return true;
2349 }
2350
2351 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2352 SrcTy.getSizeInBits() == 64) {
2353 // Truncate.
2354 B.buildExtract(Res: Dst, Src, Index: 0);
2355 MI.eraseFromParent();
2356 return true;
2357 }
2358
2359 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2360 DstTy.getSizeInBits() == 64) {
2361 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2362 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2363 auto PtrLo = B.buildPtrToInt(Dst: S32, Src);
2364 auto HighAddr = B.buildConstant(Res: S32, Val: AddrHiVal);
2365 B.buildMergeLikeInstr(Res: Dst, Ops: {PtrLo, HighAddr});
2366 MI.eraseFromParent();
2367 return true;
2368 }
2369
2370 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2371 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2372
2373 LLVMContext &Ctx = MF.getFunction().getContext();
2374 Ctx.diagnose(DI: InvalidAddrSpaceCast);
2375 B.buildUndef(Res: Dst);
2376 MI.eraseFromParent();
2377 return true;
2378}
2379
2380bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2381 MachineRegisterInfo &MRI,
2382 MachineIRBuilder &B) const {
2383 Register Src = MI.getOperand(i: 1).getReg();
2384 LLT Ty = MRI.getType(Reg: Src);
2385 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2386
2387 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2388 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
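// C1 is 2^52 and C2 is the largest double below 2^52 (2^52 - 0.5). Adding and
// then subtracting 2^52, carrying the sign of the input, rounds to the nearest
// even integer, since doubles of that magnitude have no fractional bits; the
// final select passes inputs with |x| > C2 through unchanged because they are
// already integral. (The usual round-via-2^52 trick.)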
2389
2390 auto C1 = B.buildFConstant(Res: Ty, Val: C1Val);
2391 auto CopySign = B.buildFCopysign(Dst: Ty, Src0: C1, Src1: Src);
2392
2393 // TODO: Should this propagate fast-math-flags?
2394 auto Tmp1 = B.buildFAdd(Dst: Ty, Src0: Src, Src1: CopySign);
2395 auto Tmp2 = B.buildFSub(Dst: Ty, Src0: Tmp1, Src1: CopySign);
2396
2397 auto C2 = B.buildFConstant(Res: Ty, Val: C2Val);
2398 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Src);
2399
2400 auto Cond = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: C2);
2401 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: Cond, Op0: Src, Op1: Tmp2);
2402 MI.eraseFromParent();
2403 return true;
2404}
2405
2406bool AMDGPULegalizerInfo::legalizeFceil(
2407 MachineInstr &MI, MachineRegisterInfo &MRI,
2408 MachineIRBuilder &B) const {
2409
2410 const LLT S1 = LLT::scalar(SizeInBits: 1);
2411 const LLT S64 = LLT::scalar(SizeInBits: 64);
2412
2413 Register Src = MI.getOperand(i: 1).getReg();
2414 assert(MRI.getType(Src) == S64);
2415
2416 // result = trunc(src)
2417 // if (src > 0.0 && src != result)
2418 // result += 1.0
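//
// For example: ceil(1.5): trunc = 1.0, and since 1.5 > 0 and 1.5 != 1.0 the
// result becomes 2.0; ceil(-1.5): trunc = -1.0, the condition fails, and -1.0
// is already the ceiling.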
2419
2420 auto Trunc = B.buildIntrinsicTrunc(Dst: S64, Src0: Src);
2421
2422 const auto Zero = B.buildFConstant(Res: S64, Val: 0.0);
2423 const auto One = B.buildFConstant(Res: S64, Val: 1.0);
2424 auto Lt0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Src, Op1: Zero);
2425 auto NeTrunc = B.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: S1, Op0: Src, Op1: Trunc);
2426 auto And = B.buildAnd(Dst: S1, Src0: Lt0, Src1: NeTrunc);
2427 auto Add = B.buildSelect(Res: S64, Tst: And, Op0: One, Op1: Zero);
2428
2429 // TODO: Should this propagate fast-math-flags?
2430 B.buildFAdd(Dst: MI.getOperand(i: 0).getReg(), Src0: Trunc, Src1: Add);
2431 MI.eraseFromParent();
2432 return true;
2433}
2434
2435bool AMDGPULegalizerInfo::legalizeFrem(
2436 MachineInstr &MI, MachineRegisterInfo &MRI,
2437 MachineIRBuilder &B) const {
2438 Register DstReg = MI.getOperand(i: 0).getReg();
2439 Register Src0Reg = MI.getOperand(i: 1).getReg();
2440 Register Src1Reg = MI.getOperand(i: 2).getReg();
2441 auto Flags = MI.getFlags();
2442 LLT Ty = MRI.getType(Reg: DstReg);
2443
2444 auto Div = B.buildFDiv(Dst: Ty, Src0: Src0Reg, Src1: Src1Reg, Flags);
2445 auto Trunc = B.buildIntrinsicTrunc(Dst: Ty, Src0: Div, Flags);
2446 auto Neg = B.buildFNeg(Dst: Ty, Src0: Trunc, Flags);
2447 B.buildFMA(Dst: DstReg, Src0: Neg, Src1: Src1Reg, Src2: Src0Reg, Flags);
2448 MI.eraseFromParent();
2449 return true;
2450}
2451
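// Compute the unbiased exponent of an f64 given its high 32 bits: the 11
// exponent bits sit just above the 52 fraction bits, i.e. at bit 20 of the
// high word, so extract them with ubfe and subtract the exponent bias (1023).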
2452static MachineInstrBuilder extractF64Exponent(Register Hi,
2453 MachineIRBuilder &B) {
2454 const unsigned FractBits = 52;
2455 const unsigned ExpBits = 11;
2456 LLT S32 = LLT::scalar(SizeInBits: 32);
2457
2458 auto Const0 = B.buildConstant(Res: S32, Val: FractBits - 32);
2459 auto Const1 = B.buildConstant(Res: S32, Val: ExpBits);
2460
2461 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2462 .addUse(Hi)
2463 .addUse(Const0.getReg(0))
2464 .addUse(Const1.getReg(0));
2465
2466 return B.buildSub(Dst: S32, Src0: ExpPart, Src1: B.buildConstant(Res: S32, Val: 1023));
2467}
2468
2469bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2470 MachineInstr &MI, MachineRegisterInfo &MRI,
2471 MachineIRBuilder &B) const {
2472 const LLT S1 = LLT::scalar(SizeInBits: 1);
2473 const LLT S32 = LLT::scalar(SizeInBits: 32);
2474 const LLT S64 = LLT::scalar(SizeInBits: 64);
2475
2476 Register Src = MI.getOperand(i: 1).getReg();
2477 assert(MRI.getType(Src) == S64);
2478
2479 // TODO: Should this use extract since the low half is unused?
2480 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2481 Register Hi = Unmerge.getReg(Idx: 1);
2482
2483 // Extract the upper half, since this is where we will find the sign and
2484 // exponent.
2485 auto Exp = extractF64Exponent(Hi, B);
2486
2487 const unsigned FractBits = 52;
2488
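// Roughly: build a mask of the fraction bits that lie below the binary point
// (the low 52 - Exp bits) and clear them from the input; if Exp < 0 the
// magnitude is below 1 and only the sign bit survives, and if Exp > 51 the
// value is already integral, so the input is returned unchanged.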
2489 // Extract the sign bit.
2490 const auto SignBitMask = B.buildConstant(Res: S32, UINT32_C(1) << 31);
2491 auto SignBit = B.buildAnd(Dst: S32, Src0: Hi, Src1: SignBitMask);
2492
2493 const auto FractMask = B.buildConstant(Res: S64, Val: (UINT64_C(1) << FractBits) - 1);
2494
2495 const auto Zero32 = B.buildConstant(Res: S32, Val: 0);
2496
2497 // Extend back to 64-bits.
2498 auto SignBit64 = B.buildMergeLikeInstr(Res: S64, Ops: {Zero32, SignBit});
2499
2500 auto Shr = B.buildAShr(Dst: S64, Src0: FractMask, Src1: Exp);
2501 auto Not = B.buildNot(Dst: S64, Src0: Shr);
2502 auto Tmp0 = B.buildAnd(Dst: S64, Src0: Src, Src1: Not);
2503 auto FiftyOne = B.buildConstant(Res: S32, Val: FractBits - 1);
2504
2505 auto ExpLt0 = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: Exp, Op1: Zero32);
2506 auto ExpGt51 = B.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: Exp, Op1: FiftyOne);
2507
2508 auto Tmp1 = B.buildSelect(Res: S64, Tst: ExpLt0, Op0: SignBit64, Op1: Tmp0);
2509 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: ExpGt51, Op0: Src, Op1: Tmp1);
2510 MI.eraseFromParent();
2511 return true;
2512}
2513
2514bool AMDGPULegalizerInfo::legalizeITOFP(
2515 MachineInstr &MI, MachineRegisterInfo &MRI,
2516 MachineIRBuilder &B, bool Signed) const {
2517
2518 Register Dst = MI.getOperand(i: 0).getReg();
2519 Register Src = MI.getOperand(i: 1).getReg();
2520
2521 const LLT S64 = LLT::scalar(SizeInBits: 64);
2522 const LLT S32 = LLT::scalar(SizeInBits: 32);
2523
2524 assert(MRI.getType(Src) == S64);
2525
2526 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2527 auto ThirtyTwo = B.buildConstant(Res: S32, Val: 32);
2528
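// The 64-bit result is built as fp(hi) * 2^32 + fp(lo), with the low half
// treated as unsigned: convert each 32-bit half separately, scale the high
// conversion by 2^32 via ldexp, and add the two.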
2529 if (MRI.getType(Reg: Dst) == S64) {
2530 auto CvtHi = Signed ? B.buildSITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1))
2531 : B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1));
2532
2533 auto CvtLo = B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 0));
2534 auto LdExp = B.buildFLdexp(Dst: S64, Src0: CvtHi, Src1: ThirtyTwo);
2535
2536 // TODO: Should this propagate fast-math-flags?
2537 B.buildFAdd(Dst, Src0: LdExp, Src1: CvtLo);
2538 MI.eraseFromParent();
2539 return true;
2540 }
2541
2542 assert(MRI.getType(Dst) == S32);
2543
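// For a 32-bit result the input is normalized first (a rough outline): shift
// the 64-bit value left so its significant bits land in the high word (the
// signed path reserves a bit for the sign), OR a sticky bit from the low word
// into the high word, convert that word, and rescale via ldexp by
// (32 - shift amount).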
2544 auto One = B.buildConstant(Res: S32, Val: 1);
2545
2546 MachineInstrBuilder ShAmt;
2547 if (Signed) {
2548 auto ThirtyOne = B.buildConstant(Res: S32, Val: 31);
2549 auto X = B.buildXor(Dst: S32, Src0: Unmerge.getReg(Idx: 0), Src1: Unmerge.getReg(Idx: 1));
2550 auto OppositeSign = B.buildAShr(Dst: S32, Src0: X, Src1: ThirtyOne);
2551 auto MaxShAmt = B.buildAdd(Dst: S32, Src0: ThirtyTwo, Src1: OppositeSign);
2552 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2553 .addUse(Unmerge.getReg(1));
2554 auto LS2 = B.buildSub(Dst: S32, Src0: LS, Src1: One);
2555 ShAmt = B.buildUMin(Dst: S32, Src0: LS2, Src1: MaxShAmt);
2556 } else
2557 ShAmt = B.buildCTLZ(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
2558 auto Norm = B.buildShl(Dst: S64, Src0: Src, Src1: ShAmt);
2559 auto Unmerge2 = B.buildUnmerge(Res: {S32, S32}, Op: Norm);
2560 auto Adjust = B.buildUMin(Dst: S32, Src0: One, Src1: Unmerge2.getReg(Idx: 0));
2561 auto Norm2 = B.buildOr(Dst: S32, Src0: Unmerge2.getReg(Idx: 1), Src1: Adjust);
2562 auto FVal = Signed ? B.buildSITOFP(Dst: S32, Src0: Norm2) : B.buildUITOFP(Dst: S32, Src0: Norm2);
2563 auto Scale = B.buildSub(Dst: S32, Src0: ThirtyTwo, Src1: ShAmt);
2564 B.buildFLdexp(Dst, Src0: FVal, Src1: Scale);
2565 MI.eraseFromParent();
2566 return true;
2567}
2568
2569// TODO: Copied from DAG implementation. Verify logic and document how this
2570// actually works.
2571bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2572 MachineRegisterInfo &MRI,
2573 MachineIRBuilder &B,
2574 bool Signed) const {
2575
2576 Register Dst = MI.getOperand(i: 0).getReg();
2577 Register Src = MI.getOperand(i: 1).getReg();
2578
2579 const LLT S64 = LLT::scalar(SizeInBits: 64);
2580 const LLT S32 = LLT::scalar(SizeInBits: 32);
2581
2582 const LLT SrcLT = MRI.getType(Reg: Src);
2583 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2584
2585 unsigned Flags = MI.getFlags();
2586
2587 // The basic idea of converting a floating point number into a pair of 32-bit
2588 // integers is illustrated as follows:
2589 //
2590 // tf := trunc(val);
2591 // hif := floor(tf * 2^-32);
2592 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2593 // hi := fptoi(hif);
2594 // lo := fptoi(lof);
2595 //
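// As a concrete illustration (double input): for val = 2^40 + 7, tf = 2^40 + 7,
// hif = floor(tf * 2^-32) = 256 and lof = 7, so hi = 256, lo = 7, and
// (hi << 32) + lo reassembles 2^40 + 7.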
2596 auto Trunc = B.buildIntrinsicTrunc(Dst: SrcLT, Src0: Src, Flags);
2597 MachineInstrBuilder Sign;
2598 if (Signed && SrcLT == S32) {
2599 // However, a 32-bit floating point number has only a 23-bit mantissa,
2600 // which is not enough to hold all the significant bits of `lof` if val is
2601 // negative. To avoid the loss of precision, we need to take the absolute
2602 // value after truncating and flip the result back based on the original
2603 // signedness.
2604 Sign = B.buildAShr(Dst: S32, Src0: Src, Src1: B.buildConstant(Res: S32, Val: 31));
2605 Trunc = B.buildFAbs(Dst: S32, Src0: Trunc, Flags);
2606 }
2607 MachineInstrBuilder K0, K1;
2608 if (SrcLT == S64) {
2609 K0 = B.buildFConstant(
2610 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2611 K1 = B.buildFConstant(
2612 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2613 } else {
2614 K0 = B.buildFConstant(
2615 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2616 K1 = B.buildFConstant(
2617 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2618 }
2619
2620 auto Mul = B.buildFMul(Dst: SrcLT, Src0: Trunc, Src1: K0, Flags);
2621 auto FloorMul = B.buildFFloor(Dst: SrcLT, Src0: Mul, Flags);
2622 auto Fma = B.buildFMA(Dst: SrcLT, Src0: FloorMul, Src1: K1, Src2: Trunc, Flags);
2623
2624 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(Dst: S32, Src0: FloorMul)
2625 : B.buildFPTOUI(Dst: S32, Src0: FloorMul);
2626 auto Lo = B.buildFPTOUI(Dst: S32, Src0: Fma);
2627
2628 if (Signed && SrcLT == S32) {
2629 // Flip the result based on the signedness, which is either all 0s or 1s.
2630 Sign = B.buildMergeLikeInstr(Res: S64, Ops: {Sign, Sign});
2631 // r := xor({lo, hi}, sign) - sign;
2632 B.buildSub(Dst, Src0: B.buildXor(Dst: S64, Src0: B.buildMergeLikeInstr(Res: S64, Ops: {Lo, Hi}), Src1: Sign),
2633 Src1: Sign);
2634 } else
2635 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
2636 MI.eraseFromParent();
2637
2638 return true;
2639}
2640
2641bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2642 MachineInstr &MI) const {
2643 MachineFunction &MF = Helper.MIRBuilder.getMF();
2644 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2645
2646 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2647 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2648
2649 // With ieee_mode disabled, the instructions have the correct behavior
2650 // already for G_FMINNUM/G_FMAXNUM
2651 if (!MFI->getMode().IEEE)
2652 return !IsIEEEOp;
2653
2654 if (IsIEEEOp)
2655 return true;
2656
2657 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2658}
2659
2660bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2661 MachineInstr &MI, MachineRegisterInfo &MRI,
2662 MachineIRBuilder &B) const {
2663 // TODO: Should move some of this into LegalizerHelper.
2664
2665 // TODO: Promote dynamic indexing of s16 to s32
2666
2667 Register Dst = MI.getOperand(i: 0).getReg();
2668 Register Vec = MI.getOperand(i: 1).getReg();
2669
2670 LLT VecTy = MRI.getType(Reg: Vec);
2671 LLT EltTy = VecTy.getElementType();
2672 assert(EltTy == MRI.getType(Dst));
2673
2674 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2675 // but we can't go directly to that logic because you can't bitcast a vector
2676 // of pointers to a vector of integers. Therefore, introduce an intermediate
2677 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2678 // drive the legalization forward.
2679 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2680 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
2681 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
2682
2683 auto IntVec = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
2684 auto IntElt = B.buildExtractVectorElement(Res: IntTy, Val: IntVec, Idx: MI.getOperand(i: 2));
2685 B.buildIntToPtr(Dst, Src: IntElt);
2686
2687 MI.eraseFromParent();
2688 return true;
2689 }
2690
2691 // FIXME: Artifact combiner probably should have replaced the truncated
2692 // constant before this, so we shouldn't need
2693 // getIConstantVRegValWithLookThrough.
2694 std::optional<ValueAndVReg> MaybeIdxVal =
2695 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 2).getReg(), MRI);
2696 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2697 return true;
2698 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2699
2700 if (IdxVal < VecTy.getNumElements()) {
2701 auto Unmerge = B.buildUnmerge(Res: EltTy, Op: Vec);
2702 B.buildCopy(Res: Dst, Op: Unmerge.getReg(Idx: IdxVal));
2703 } else {
2704 B.buildUndef(Res: Dst);
2705 }
2706
2707 MI.eraseFromParent();
2708 return true;
2709}
2710
2711bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2712 MachineInstr &MI, MachineRegisterInfo &MRI,
2713 MachineIRBuilder &B) const {
2714 // TODO: Should move some of this into LegalizerHelper.
2715
2716 // TODO: Promote dynamic indexing of s16 to s32
2717
2718 Register Dst = MI.getOperand(i: 0).getReg();
2719 Register Vec = MI.getOperand(i: 1).getReg();
2720 Register Ins = MI.getOperand(i: 2).getReg();
2721
2722 LLT VecTy = MRI.getType(Reg: Vec);
2723 LLT EltTy = VecTy.getElementType();
2724 assert(EltTy == MRI.getType(Ins));
2725
2726 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2727 // but we can't go directly to that logic because you can't bitcast a vector
2728 // of pointers to a vector of integers. Therefore, make the pointer vector
2729 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2730 // new value, and then inttoptr the result vector back. This will then allow
2731 // the rest of legalization to take over.
2732 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2733 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
2734 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
2735
2736 auto IntVecSource = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
2737 auto IntIns = B.buildPtrToInt(Dst: IntTy, Src: Ins);
2738 auto IntVecDest = B.buildInsertVectorElement(Res: IntVecTy, Val: IntVecSource, Elt: IntIns,
2739 Idx: MI.getOperand(i: 3));
2740 B.buildIntToPtr(Dst, Src: IntVecDest);
2741 MI.eraseFromParent();
2742 return true;
2743 }
2744
2745 // FIXME: Artifact combiner probably should have replaced the truncated
2746 // constant before this, so we shouldn't need
2747 // getIConstantVRegValWithLookThrough.
2748 std::optional<ValueAndVReg> MaybeIdxVal =
2749 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 3).getReg(), MRI);
2750 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2751 return true;
2752
2753 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2754
2755 unsigned NumElts = VecTy.getNumElements();
2756 if (IdxVal < NumElts) {
2757 SmallVector<Register, 8> SrcRegs;
2758 for (unsigned i = 0; i < NumElts; ++i)
2759 SrcRegs.push_back(Elt: MRI.createGenericVirtualRegister(Ty: EltTy));
2760 B.buildUnmerge(Res: SrcRegs, Op: Vec);
2761
2762 SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
2763 B.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
2764 } else {
2765 B.buildUndef(Res: Dst);
2766 }
2767
2768 MI.eraseFromParent();
2769 return true;
2770}
2771
2772bool AMDGPULegalizerInfo::legalizeSinCos(
2773 MachineInstr &MI, MachineRegisterInfo &MRI,
2774 MachineIRBuilder &B) const {
2775
2776 Register DstReg = MI.getOperand(i: 0).getReg();
2777 Register SrcReg = MI.getOperand(i: 1).getReg();
2778 LLT Ty = MRI.getType(Reg: DstReg);
2779 unsigned Flags = MI.getFlags();
2780
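// The hardware sin/cos expect their input pre-scaled by 1/(2*pi), i.e. in
// revolutions, so multiply first; on subtargets with a reduced trig input
// range, the fract intrinsic additionally brings the scaled operand into
// [0, 1).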
2781 Register TrigVal;
2782 auto OneOver2Pi = B.buildFConstant(Res: Ty, Val: 0.5 * numbers::inv_pi);
2783 if (ST.hasTrigReducedRange()) {
2784 auto MulVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags);
2785 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2786 .addUse(MulVal.getReg(Idx: 0))
2787 .setMIFlags(Flags)
2788 .getReg(0);
2789 } else
2790 TrigVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags).getReg(Idx: 0);
2791
2792 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2793 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2794 B.buildIntrinsic(ID: TrigIntrin, Res: ArrayRef<Register>(DstReg))
2795 .addUse(RegNo: TrigVal)
2796 .setMIFlags(Flags);
2797 MI.eraseFromParent();
2798 return true;
2799}
2800
2801bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2802 MachineIRBuilder &B,
2803 const GlobalValue *GV,
2804 int64_t Offset,
2805 unsigned GAFlags) const {
2806 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2807 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2808 // to the following code sequence:
2809 //
2810 // For constant address space:
2811 // s_getpc_b64 s[0:1]
2812 // s_add_u32 s0, s0, $symbol
2813 // s_addc_u32 s1, s1, 0
2814 //
2815 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2816 // a fixup or relocation is emitted to replace $symbol with a literal
2817 // constant, which is a pc-relative offset from the encoding of the $symbol
2818 // operand to the global variable.
2819 //
2820 // For global address space:
2821 // s_getpc_b64 s[0:1]
2822 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2823 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2824 //
2825 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2826 // fixups or relocations are emitted to replace $symbol@*@lo and
2827 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2828 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2829 // operand to the global variable.
2830
2831 LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
2832
2833 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2834 B.getMRI()->createGenericVirtualRegister(Ty: ConstPtrTy);
2835
2836 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2837 .addDef(PCReg);
2838
2839 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags);
2840 if (GAFlags == SIInstrInfo::MO_NONE)
2841 MIB.addImm(Val: 0);
2842 else
2843 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 1);
2844
2845 if (!B.getMRI()->getRegClassOrNull(PCReg))
2846 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2847
2848 if (PtrTy.getSizeInBits() == 32)
2849 B.buildExtract(Res: DstReg, Src: PCReg, Index: 0);
2850 return true;
2851}
2852
2853 // Emit an ABS32_LO / ABS32_HI relocation stub.
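// The emitted sequence is roughly (for a 64-bit pointer, register numbers
// illustrative):
//   s_mov_b32 s0, $symbol@abs32@lo
//   s_mov_b32 s1, $symbol@abs32@hi
// with fixups/relocations filling in the two 32-bit halves of the symbol's
// absolute address.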
2854void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2855 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2856 MachineRegisterInfo &MRI) const {
2857 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2858
2859 LLT S32 = LLT::scalar(SizeInBits: 32);
2860
2861 // Use the destination register directly, if and only if we only need the
2862 // lower half of the address and no register class has been set on it.
2863 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(Reg: DstReg)
2864 ? DstReg
2865 : MRI.createGenericVirtualRegister(Ty: S32);
2866
2867 if (!MRI.getRegClassOrNull(AddrLo))
2868 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2869
2870 // Write the lower half.
2871 B.buildInstr(AMDGPU::S_MOV_B32)
2872 .addDef(AddrLo)
2873 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2874
2875 // If required, write the upper half as well.
2876 if (RequiresHighHalf) {
2877 assert(PtrTy.getSizeInBits() == 64 &&
2878 "Must provide a 64-bit pointer type!");
2879
2880 Register AddrHi = MRI.createGenericVirtualRegister(Ty: S32);
2881 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2882
2883 B.buildInstr(AMDGPU::S_MOV_B32)
2884 .addDef(AddrHi)
2885 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2886
2887 // Use the destination register directly, if and only if no register class
2888 // has been set on it.
2889 Register AddrDst = !MRI.getRegClassOrNull(Reg: DstReg)
2890 ? DstReg
2891 : MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2892
2893 if (!MRI.getRegClassOrNull(AddrDst))
2894 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2895
2896 B.buildMergeValues(Res: AddrDst, Ops: {AddrLo, AddrHi});
2897
2898 // If we created a new register for the destination, cast the result into
2899 // the final output.
2900 if (AddrDst != DstReg)
2901 B.buildCast(Dst: DstReg, Src: AddrDst);
2902 } else if (AddrLo != DstReg) {
2903 // If we created a new register for the destination, cast the result into
2904 // the final output.
2905 B.buildCast(Dst: DstReg, Src: AddrLo);
2906 }
2907}
2908
2909bool AMDGPULegalizerInfo::legalizeGlobalValue(
2910 MachineInstr &MI, MachineRegisterInfo &MRI,
2911 MachineIRBuilder &B) const {
2912 Register DstReg = MI.getOperand(i: 0).getReg();
2913 LLT Ty = MRI.getType(Reg: DstReg);
2914 unsigned AS = Ty.getAddressSpace();
2915
2916 const GlobalValue *GV = MI.getOperand(i: 1).getGlobal();
2917 MachineFunction &MF = B.getMF();
2918 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2919
2920 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2921 if (!MFI->isModuleEntryFunction() &&
2922 !GV->getName().equals(RHS: "llvm.amdgcn.module.lds")) {
2923 const Function &Fn = MF.getFunction();
2924 DiagnosticInfoUnsupported BadLDSDecl(
2925 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2926 DS_Warning);
2927 Fn.getContext().diagnose(DI: BadLDSDecl);
2928
2929 // We currently don't have a way to correctly allocate LDS objects that
2930 // aren't directly associated with a kernel. We do force inlining of
2931 // functions that use local objects. However, if these dead functions are
2932 // not eliminated, we don't want a compile time error. Just emit a warning
2933 // and a trap, since there should be no callable path here.
2934 B.buildTrap();
2935 B.buildUndef(Res: DstReg);
2936 MI.eraseFromParent();
2937 return true;
2938 }
2939
2940 // TODO: We could emit code to handle the initialization somewhere.
2941 // We ignore the initializer for now and legalize it to allow selection.
2942 // The initializer will be diagnosed as an error during assembly emission anyway.
2943 const SITargetLowering *TLI = ST.getTargetLowering();
2944 if (!TLI->shouldUseLDSConstAddress(GV)) {
2945 MI.getOperand(i: 1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2946 return true; // Leave in place;
2947 }
2948
2949 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2950 Type *Ty = GV->getValueType();
2951 // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
2952 // zero-sized type in other languages, to declare dynamic shared memory
2953 // whose size is not known at compile time. These variables are allocated
2954 // by the runtime and placed directly after the statically allocated ones.
2955 // They all share the same offset.
2956 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2957 // Adjust alignment for that dynamic shared memory array.
2958 MFI->setDynLDSAlign(F: MF.getFunction(), GV: *cast<GlobalVariable>(Val: GV));
2959 LLT S32 = LLT::scalar(SizeInBits: 32);
2960 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
2961 B.buildIntToPtr(Dst: DstReg, Src: Sz);
2962 MI.eraseFromParent();
2963 return true;
2964 }
2965 }
2966
2967 B.buildConstant(Res: DstReg, Val: MFI->allocateLDSGlobal(DL: B.getDataLayout(),
2968 GV: *cast<GlobalVariable>(Val: GV)));
2969 MI.eraseFromParent();
2970 return true;
2971 }
2972
2973 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
2974 buildAbsGlobalAddress(DstReg, PtrTy: Ty, B, GV, MRI);
2975 MI.eraseFromParent();
2976 return true;
2977 }
2978
2979 const SITargetLowering *TLI = ST.getTargetLowering();
2980
2981 if (TLI->shouldEmitFixup(GV)) {
2982 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0);
2983 MI.eraseFromParent();
2984 return true;
2985 }
2986
2987 if (TLI->shouldEmitPCReloc(GV)) {
2988 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_REL32);
2989 MI.eraseFromParent();
2990 return true;
2991 }
2992
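 // Otherwise the address is loaded from the GOT: the pc-relative sequence
 // below computes the address of the GOT entry, and an invariant 64-bit load
 // fetches the actual address of the global.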
2993 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
2994 Register GOTAddr = MRI.createGenericVirtualRegister(Ty: PtrTy);
2995
2996 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2997 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2998 PtrInfo: MachinePointerInfo::getGOT(MF),
2999 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3000 MachineMemOperand::MOInvariant,
3001 MemTy: LoadTy, base_alignment: Align(8));
3002
3003 buildPCRelGlobalAddress(DstReg: GOTAddr, PtrTy, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_GOTPCREL32);
3004
3005 if (Ty.getSizeInBits() == 32) {
3006 // Truncate if this is a 32-bit constant address.
3007 auto Load = B.buildLoad(Res: PtrTy, Addr: GOTAddr, MMO&: *GOTMMO);
3008 B.buildExtract(Res: DstReg, Src: Load, Index: 0);
3009 } else
3010 B.buildLoad(Res: DstReg, Addr: GOTAddr, MMO&: *GOTMMO);
3011
3012 MI.eraseFromParent();
3013 return true;
3014}
3015
3016static LLT widenToNextPowerOf2(LLT Ty) {
3017 if (Ty.isVector())
3018 return Ty.changeElementCount(
3019 EC: ElementCount::getFixed(MinVal: PowerOf2Ceil(A: Ty.getNumElements())));
3020 return LLT::scalar(SizeInBits: PowerOf2Ceil(A: Ty.getSizeInBits()));
3021}
3022
3023bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3024 MachineInstr &MI) const {
3025 MachineIRBuilder &B = Helper.MIRBuilder;
3026 MachineRegisterInfo &MRI = *B.getMRI();
3027 GISelChangeObserver &Observer = Helper.Observer;
3028
3029 Register PtrReg = MI.getOperand(i: 1).getReg();
3030 LLT PtrTy = MRI.getType(Reg: PtrReg);
3031 unsigned AddrSpace = PtrTy.getAddressSpace();
3032
3033 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3034 LLT ConstPtr = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3035 auto Cast = B.buildAddrSpaceCast(Dst: ConstPtr, Src: PtrReg);
3036 Observer.changingInstr(MI);
3037 MI.getOperand(i: 1).setReg(Cast.getReg(Idx: 0));
3038 Observer.changedInstr(MI);
3039 return true;
3040 }
3041
3042 if (MI.getOpcode() != AMDGPU::G_LOAD)
3043 return false;
3044
3045 Register ValReg = MI.getOperand(i: 0).getReg();
3046 LLT ValTy = MRI.getType(Reg: ValReg);
3047
3048 if (hasBufferRsrcWorkaround(Ty: ValTy)) {
3049 Observer.changingInstr(MI);
3050 castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
3051 Observer.changedInstr(MI);
3052 return true;
3053 }
3054
3055 MachineMemOperand *MMO = *MI.memoperands_begin();
3056 const unsigned ValSize = ValTy.getSizeInBits();
3057 const LLT MemTy = MMO->getMemoryType();
3058 const Align MemAlign = MMO->getAlign();
3059 const unsigned MemSize = MemTy.getSizeInBits();
3060 const uint64_t AlignInBits = 8 * MemAlign.value();
3061
3062 // Widen non-power-of-2 loads to the alignment if needed
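 // For example (assuming the alignment allows it), a 96-bit <3 x s32> load is
 // widened to a 128-bit load and the extra bits are dropped again below.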
3063 if (shouldWidenLoad(ST, MemoryTy: MemTy, AlignInBits, AddrSpace, Opcode: MI.getOpcode())) {
3064 const unsigned WideMemSize = PowerOf2Ceil(A: MemSize);
3065
3066 // This was already the correct extending load result type, so just adjust
3067 // the memory type.
3068 if (WideMemSize == ValSize) {
3069 MachineFunction &MF = B.getMF();
3070
3071 MachineMemOperand *WideMMO =
3072 MF.getMachineMemOperand(MMO, Offset: 0, Size: WideMemSize / 8);
3073 Observer.changingInstr(MI);
3074 MI.setMemRefs(MF, MemRefs: {WideMMO});
3075 Observer.changedInstr(MI);
3076 return true;
3077 }
3078
3079 // Don't bother handling this edge case, which should probably never be produced.
3080 if (ValSize > WideMemSize)
3081 return false;
3082
3083 LLT WideTy = widenToNextPowerOf2(Ty: ValTy);
3084
3085 Register WideLoad;
3086 if (!WideTy.isVector()) {
3087 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3088 B.buildTrunc(Res: ValReg, Op: WideLoad).getReg(Idx: 0);
3089 } else {
3090 // Extract the subvector.
3091
3092 if (isRegisterType(Ty: ValTy)) {
3093 // If this is a case where G_EXTRACT is legal, use it.
3094 // (e.g. <3 x s32> -> <4 x s32>)
3095 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3096 B.buildExtract(Res: ValReg, Src: WideLoad, Index: 0);
3097 } else {
3098 // For cases where the widened type isn't a nice register value, unmerge
3099 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3100 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3101 B.buildDeleteTrailingVectorElements(Res: ValReg, Op0: WideLoad);
3102 }
3103 }
3104
3105 MI.eraseFromParent();
3106 return true;
3107 }
3108
3109 return false;
3110}
3111
3112bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3113 MachineInstr &MI) const {
3114 MachineIRBuilder &B = Helper.MIRBuilder;
3115 MachineRegisterInfo &MRI = *B.getMRI();
3116 GISelChangeObserver &Observer = Helper.Observer;
3117
3118 Register DataReg = MI.getOperand(i: 0).getReg();
3119 LLT DataTy = MRI.getType(Reg: DataReg);
3120
3121 if (hasBufferRsrcWorkaround(Ty: DataTy)) {
3122 Observer.changingInstr(MI);
3123 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
3124 Observer.changedInstr(MI);
3125 return true;
3126 }
3127 return false;
3128}
3129
3130bool AMDGPULegalizerInfo::legalizeFMad(
3131 MachineInstr &MI, MachineRegisterInfo &MRI,
3132 MachineIRBuilder &B) const {
3133 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3134 assert(Ty.isScalar());
3135
3136 MachineFunction &MF = B.getMF();
3137 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3138
3139 // TODO: Always legal with future ftz flag.
3140 // FIXME: Do we need just output?
3141 if (Ty == LLT::float32() &&
3142 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3143 return true;
3144 if (Ty == LLT::float16() &&
3145 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3146 return true;
3147
3148 MachineIRBuilder HelperBuilder(MI);
3149 GISelObserverWrapper DummyObserver;
3150 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3151 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3152}
3153
3154bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3155 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3156 Register DstReg = MI.getOperand(i: 0).getReg();
3157 Register PtrReg = MI.getOperand(i: 1).getReg();
3158 Register CmpVal = MI.getOperand(i: 2).getReg();
3159 Register NewVal = MI.getOperand(i: 3).getReg();
3160
3161 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3162 "this should not have been custom lowered");
3163
3164 LLT ValTy = MRI.getType(Reg: CmpVal);
3165 LLT VecTy = LLT::fixed_vector(NumElements: 2, ScalarTy: ValTy);
3166
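 // The target cmpxchg pseudo takes the new value and the compare value packed
 // into a single vector operand, with the new value in element 0.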
3167 Register PackedVal = B.buildBuildVector(Res: VecTy, Ops: { NewVal, CmpVal }).getReg(Idx: 0);
3168
3169 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3170 .addDef(DstReg)
3171 .addUse(PtrReg)
3172 .addUse(PackedVal)
3173 .setMemRefs(MI.memoperands());
3174
3175 MI.eraseFromParent();
3176 return true;
3177}
3178
3179/// Return true if it's known that \p Src can never be an f32 denormal value.
3180static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3181 Register Src) {
3182 const MachineInstr *DefMI = MRI.getVRegDef(Reg: Src);
3183 switch (DefMI->getOpcode()) {
3184 case TargetOpcode::G_INTRINSIC: {
3185 switch (cast<GIntrinsic>(Val: DefMI)->getIntrinsicID()) {
3186 case Intrinsic::amdgcn_frexp_mant:
3187 return true;
3188 default:
3189 break;
3190 }
3191
3192 break;
3193 }
3194 case TargetOpcode::G_FFREXP: {
3195 if (DefMI->getOperand(i: 0).getReg() == Src)
3196 return true;
3197 break;
3198 }
3199 case TargetOpcode::G_FPEXT: {
3200 return MRI.getType(Reg: DefMI->getOperand(i: 1).getReg()) == LLT::scalar(SizeInBits: 16);
3201 }
3202 default:
3203 return false;
3204 }
3205
3206 return false;
3207}
3208
3209static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3210 if (Flags & MachineInstr::FmAfn)
3211 return true;
3212 const auto &Options = MF.getTarget().Options;
3213 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3214}
3215
3216static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3217 unsigned Flags) {
3218 return !valueIsKnownNeverF32Denorm(MRI: MF.getRegInfo(), Src) &&
3219 MF.getDenormalMode(FPType: APFloat::IEEEsingle()).Input !=
3220 DenormalMode::PreserveSign;
3221}
3222
3223std::pair<Register, Register>
3224AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3225 unsigned Flags) const {
3226 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags))
3227 return {};
3228
3229 const LLT F32 = LLT::scalar(SizeInBits: 32);
3230 auto SmallestNormal = B.buildFConstant(
3231 Res: F32, Val: APFloat::getSmallestNormalized(Sem: APFloat::IEEEsingle()));
3232 auto IsLtSmallestNormal =
3233 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: SmallestNormal);
3234
3235 auto Scale32 = B.buildFConstant(Res: F32, Val: 0x1.0p+32);
3236 auto One = B.buildFConstant(Res: F32, Val: 1.0);
3237 auto ScaleFactor =
3238 B.buildSelect(Res: F32, Tst: IsLtSmallestNormal, Op0: Scale32, Op1: One, Flags);
3239 auto ScaledInput = B.buildFMul(Dst: F32, Src0: Src, Src1: ScaleFactor, Flags);
3240
3241 return {ScaledInput.getReg(Idx: 0), IsLtSmallestNormal.getReg(Idx: 0)};
3242}
3243
3244bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3245 MachineIRBuilder &B) const {
3246 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3247 // If we have to handle denormals, scale up the input and adjust the result.
3248
3249 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3250 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
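 // (Since log2(x * 2^32) == log2(x) + 32, scaling a denormal input by 2^32
 // makes it a normal value without losing bits, and the extra 32 is
 // subtracted back out of the result below.)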
3251
3252 Register Dst = MI.getOperand(i: 0).getReg();
3253 Register Src = MI.getOperand(i: 1).getReg();
3254 LLT Ty = B.getMRI()->getType(Reg: Dst);
3255 unsigned Flags = MI.getFlags();
3256
3257 if (Ty == LLT::scalar(SizeInBits: 16)) {
3258 const LLT F32 = LLT::scalar(SizeInBits: 32);
3259 // Nothing in half is a denormal when promoted to f32.
3260 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3261 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3262 .addUse(Ext.getReg(0))
3263 .setMIFlags(Flags);
3264 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3265 MI.eraseFromParent();
3266 return true;
3267 }
3268
3269 assert(Ty == LLT::scalar(32));
3270
3271 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3272 if (!ScaledInput) {
3273 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3274 .addUse(Src)
3275 .setMIFlags(Flags);
3276 MI.eraseFromParent();
3277 return true;
3278 }
3279
3280 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3281 .addUse(ScaledInput)
3282 .setMIFlags(Flags);
3283
3284 auto ThirtyTwo = B.buildFConstant(Res: Ty, Val: 32.0);
3285 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3286 auto ResultOffset =
3287 B.buildSelect(Res: Ty, Tst: IsLtSmallestNormal, Op0: ThirtyTwo, Op1: Zero, Flags);
3288 B.buildFSub(Dst, Src0: Log2, Src1: ResultOffset, Flags);
3289
3290 MI.eraseFromParent();
3291 return true;
3292}
3293
3294static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3295 Register Z, unsigned Flags) {
3296 auto FMul = B.buildFMul(Dst: Ty, Src0: X, Src1: Y, Flags);
3297 return B.buildFAdd(Dst: Ty, Src0: FMul, Src1: Z, Flags).getReg(Idx: 0);
3298}
3299
3300bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3301 MachineIRBuilder &B) const {
3302 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3303 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3304
3305 MachineRegisterInfo &MRI = *B.getMRI();
3306 Register Dst = MI.getOperand(i: 0).getReg();
3307 Register X = MI.getOperand(i: 1).getReg();
3308 unsigned Flags = MI.getFlags();
3309 const LLT Ty = MRI.getType(Reg: X);
3310 MachineFunction &MF = B.getMF();
3311
3312 const LLT F32 = LLT::scalar(SizeInBits: 32);
3313 const LLT F16 = LLT::scalar(SizeInBits: 16);
3314
3315 const AMDGPUTargetMachine &TM =
3316 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3317
3318 if (Ty == F16 || MI.getFlag(Flag: MachineInstr::FmAfn) ||
3319 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3320 if (Ty == F16 && !ST.has16BitInsts()) {
3321 Register LogVal = MRI.createGenericVirtualRegister(Ty: F32);
3322 auto PromoteSrc = B.buildFPExt(Res: F32, Op: X);
3323 legalizeFlogUnsafe(B, Dst: LogVal, Src: PromoteSrc.getReg(Idx: 0), IsLog10, Flags);
3324 B.buildFPTrunc(Res: Dst, Op: LogVal);
3325 } else {
3326 legalizeFlogUnsafe(B, Dst, Src: X, IsLog10, Flags);
3327 }
3328
3329 MI.eraseFromParent();
3330 return true;
3331 }
3332
3333 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src: X, Flags);
3334 if (ScaledInput)
3335 X = ScaledInput;
3336
3337 auto Y =
3338 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3339
3340 Register R;
3341 if (ST.hasFastFMAF32()) {
3342 // c + cc is ln(2)/ln(10) to more than 49 bits
3343 const float c_log10 = 0x1.344134p-2f;
3344 const float cc_log10 = 0x1.09f79ep-26f;
3345
3346 // c + cc is ln(2) to more than 49 bits
3347 const float c_log = 0x1.62e42ep-1f;
3348 const float cc_log = 0x1.efa39ep-25f;
3349
3350 auto C = B.buildFConstant(Res: Ty, Val: IsLog10 ? c_log10 : c_log);
3351 auto CC = B.buildFConstant(Res: Ty, Val: IsLog10 ? cc_log10 : cc_log);
3352
3353 R = B.buildFMul(Dst: Ty, Src0: Y, Src1: C, Flags).getReg(0);
3354 auto NegR = B.buildFNeg(Dst: Ty, Src0: R, Flags);
3355 auto FMA0 = B.buildFMA(Dst: Ty, Src0: Y, Src1: C, Src2: NegR, Flags);
3356 auto FMA1 = B.buildFMA(Dst: Ty, Src0: Y, Src1: CC, Src2: FMA0, Flags);
3357 R = B.buildFAdd(Dst: Ty, Src0: R, Src1: FMA1, Flags).getReg(0);
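 // The sequence above evaluates R = Y * (c + cc) in extended precision:
 // FMA0 recovers the rounding error of Y * c, and FMA1 folds in the
 // low-order term Y * cc before the final add.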
3358 } else {
3359 // ch+ct is ln(2)/ln(10) to more than 36 bits
3360 const float ch_log10 = 0x1.344000p-2f;
3361 const float ct_log10 = 0x1.3509f6p-18f;
3362
3363 // ch + ct is ln(2) to more than 36 bits
3364 const float ch_log = 0x1.62e000p-1f;
3365 const float ct_log = 0x1.0bfbe8p-15f;
3366
3367 auto CH = B.buildFConstant(Res: Ty, Val: IsLog10 ? ch_log10 : ch_log);
3368 auto CT = B.buildFConstant(Res: Ty, Val: IsLog10 ? ct_log10 : ct_log);
3369
3370 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3371 auto YH = B.buildAnd(Dst: Ty, Src0: Y, Src1: MaskConst);
3372 auto YT = B.buildFSub(Dst: Ty, Src0: Y, Src1: YH, Flags);
3373 auto YTCT = B.buildFMul(Dst: Ty, Src0: YT, Src1: CT, Flags);
3374
3375 Register Mad0 =
3376 getMad(B, Ty, YH.getReg(0), CT.getReg(Idx: 0), YTCT.getReg(0), Flags);
3377 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(Idx: 0), Mad0, Flags);
3378 R = getMad(B, Ty, YH.getReg(0), CH.getReg(Idx: 0), Mad1, Flags);
3379 }
3380
3381 const bool IsFiniteOnly =
3382 (MI.getFlag(Flag: MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3383 (MI.getFlag(Flag: MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3384
3385 if (!IsFiniteOnly) {
3386 // Expand isfinite(x) => fabs(x) < inf
3387 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3388 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Y);
3389 auto IsFinite =
3390 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
3391 R = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: R, Op1: Y, Flags).getReg(0);
3392 }
3393
3394 if (ScaledInput) {
3395 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3396 auto ShiftK =
3397 B.buildFConstant(Res: Ty, Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3398 auto Shift = B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ShiftK, Op1: Zero, Flags);
3399 B.buildFSub(Dst, Src0: R, Src1: Shift, Flags);
3400 } else {
3401 B.buildCopy(Res: Dst, Op: R);
3402 }
3403
3404 MI.eraseFromParent();
3405 return true;
3406}
3407
3408bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3409 Register Src, bool IsLog10,
3410 unsigned Flags) const {
3411 const double Log2BaseInverted =
3412 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3413
3414 LLT Ty = B.getMRI()->getType(Reg: Dst);
3415
3416 if (Ty == LLT::scalar(SizeInBits: 32)) {
3417 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3418 if (ScaledInput) {
3419 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3420 .addUse(Src)
3421 .setMIFlags(Flags);
3422 auto ScaledResultOffset = B.buildFConstant(Res: Ty, Val: -32.0 * Log2BaseInverted);
3423 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3424 auto ResultOffset =
3425 B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ScaledResultOffset, Op1: Zero, Flags);
3426 auto Log2Inv = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3427
3428 if (ST.hasFastFMAF32())
3429 B.buildFMA(Dst, Src0: LogSrc, Src1: Log2Inv, Src2: ResultOffset, Flags);
3430 else {
3431 auto Mul = B.buildFMul(Dst: Ty, Src0: LogSrc, Src1: Log2Inv, Flags);
3432 B.buildFAdd(Dst, Src0: Mul, Src1: ResultOffset, Flags);
3433 }
3434
3435 return true;
3436 }
3437 }
3438
3439 auto Log2Operand = Ty == LLT::scalar(16)
3440 ? B.buildFLog2(Ty, Src, Flags)
3441 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3442 .addUse(Src)
3443 .setMIFlags(Flags);
3444 auto Log2BaseInvertedOperand = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3445 B.buildFMul(Dst, Src0: Log2Operand, Src1: Log2BaseInvertedOperand, Flags);
3446 return true;
3447}
3448
3449bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3450 MachineIRBuilder &B) const {
3451 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3452 // If we have to handle denormals, scale up the input and adjust the result.
3453
3454 Register Dst = MI.getOperand(i: 0).getReg();
3455 Register Src = MI.getOperand(i: 1).getReg();
3456 unsigned Flags = MI.getFlags();
3457 LLT Ty = B.getMRI()->getType(Reg: Dst);
3458 const LLT F16 = LLT::scalar(SizeInBits: 16);
3459 const LLT F32 = LLT::scalar(SizeInBits: 32);
3460
3461 if (Ty == F16) {
3462 // Nothing in half is a denormal when promoted to f32.
3463 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3464 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3465 .addUse(Ext.getReg(0))
3466 .setMIFlags(Flags);
3467 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3468 MI.eraseFromParent();
3469 return true;
3470 }
3471
3472 assert(Ty == F32);
3473
3474 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags)) {
3475 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3476 .addUse(Src)
3477 .setMIFlags(Flags);
3478 MI.eraseFromParent();
3479 return true;
3480 }
3481
3482 // bool needs_scaling = x < -0x1.f80000p+6f;
3483 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
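 // (This relies on exp2(x + 64) * 2^-64 == exp2(x); the offset keeps the
 // intrinsic's input out of the range where its result would be denormal.)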
3484
3485 // -nextafter(128.0, -1)
3486 auto RangeCheckConst = B.buildFConstant(Res: Ty, Val: -0x1.f80000p+6f);
3487 auto NeedsScaling = B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
3488 Op1: RangeCheckConst, Flags);
3489
3490 auto SixtyFour = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3491 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3492 auto AddOffset = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: SixtyFour, Op1: Zero, Flags);
3493 auto AddInput = B.buildFAdd(Dst: F32, Src0: Src, Src1: AddOffset, Flags);
3494
3495 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3496 .addUse(AddInput.getReg(0))
3497 .setMIFlags(Flags);
3498
3499 auto TwoExpNeg64 = B.buildFConstant(Res: Ty, Val: 0x1.0p-64f);
3500 auto One = B.buildFConstant(Res: Ty, Val: 1.0);
3501 auto ResultScale = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: TwoExpNeg64, Op1: One, Flags);
3502 B.buildFMul(Dst, Src0: Exp2, Src1: ResultScale, Flags);
3503 MI.eraseFromParent();
3504 return true;
3505}
3506
3507bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3508 Register X, unsigned Flags) const {
3509 LLT Ty = B.getMRI()->getType(Reg: Dst);
3510 LLT F32 = LLT::scalar(SizeInBits: 32);
3511
3512 if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
3513 auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
3514 auto Mul = B.buildFMul(Dst: Ty, Src0: X, Src1: Log2E, Flags);
3515
3516 if (Ty == F32) {
3517 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3518 .addUse(Mul.getReg(0))
3519 .setMIFlags(Flags);
3520 } else {
3521 B.buildFExp2(Dst, Src: Mul.getReg(Idx: 0), Flags);
3522 }
3523
3524 return true;
3525 }
3526
3527 auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.5d58a0p+6f);
3528 auto NeedsScaling =
3529 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold, Flags);
3530 auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3531 auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
3532 auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X, Flags);
3533
3534 auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
3535 auto ExpInput = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: Log2E, Flags);
3536
3537 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3538 .addUse(ExpInput.getReg(0))
3539 .setMIFlags(Flags);
3540
3541 auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.969d48p-93f);
3542 auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: Exp2, Src1: ResultScaleFactor, Flags);
3543 B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: Exp2, Flags);
3544 return true;
3545}
3546
3547bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3548 MachineIRBuilder &B) const {
3549 Register Dst = MI.getOperand(i: 0).getReg();
3550 Register X = MI.getOperand(i: 1).getReg();
3551 const unsigned Flags = MI.getFlags();
3552 MachineFunction &MF = B.getMF();
3553 MachineRegisterInfo &MRI = *B.getMRI();
3554 LLT Ty = MRI.getType(Reg: Dst);
3555 const LLT F16 = LLT::scalar(SizeInBits: 16);
3556 const LLT F32 = LLT::scalar(SizeInBits: 32);
3557 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3558
3559 if (Ty == F16) {
3560 // v_exp_f16 (fmul x, log2e)
3561 if (allowApproxFunc(MF, Flags)) {
3562 // TODO: Does this really require fast?
3563 legalizeFExpUnsafe(B, Dst, X, Flags);
3564 MI.eraseFromParent();
3565 return true;
3566 }
3567
3568 // exp(f16 x) ->
3569 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3570
3571 // Nothing in half is a denormal when promoted to f32.
3572 auto Ext = B.buildFPExt(Res: F32, Op: X, Flags);
3573 Register Lowered = MRI.createGenericVirtualRegister(Ty: F32);
3574 legalizeFExpUnsafe(B, Dst: Lowered, X: Ext.getReg(Idx: 0), Flags);
3575 B.buildFPTrunc(Res: Dst, Op: Lowered, Flags);
3576 MI.eraseFromParent();
3577 return true;
3578 }
3579
3580 assert(Ty == F32);
3581
3582 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3583 // library behavior. Also, is known-not-daz source sufficient?
3584 if (allowApproxFunc(MF, Flags)) {
3585 legalizeFExpUnsafe(B, Dst, X, Flags);
3586 MI.eraseFromParent();
3587 return true;
3588 }
3589
3590 // Algorithm:
3591 //
3592 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3593 //
3594 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3595 // n = 64*m + j, 0 <= j < 64
3596 //
3597 // e^x = 2^((64*m + j + f)/64)
3598 // = (2^m) * (2^(j/64)) * 2^(f/64)
3599 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3600 //
3601 // f = x*(64/ln(2)) - n
3602 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3603 //
3604 // e^x = (2^m) * (2^(j/64)) * e^r
3605 //
3606 // (2^(j/64)) is precomputed
3607 //
3608 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3609 // e^r = 1 + q
3610 //
3611 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3612 //
3613 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
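 //
 // The code below roughly follows this outline with the 2^(j/64) table folded
 // away: PH + PL approximates x * log2(e) (or x * log2(10) for exp10) in
 // extended precision, E = roundeven(PH) plays the role of n, and the result
 // is computed as ldexp(exp2(PH - E + PL), E), followed by underflow/overflow
 // clamping.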
3614 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3615 Register PH, PL;
3616
3617 if (ST.hasFastFMAF32()) {
3618 const float c_exp = numbers::log2ef;
3619 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3620 const float c_exp10 = 0x1.a934f0p+1f;
3621 const float cc_exp10 = 0x1.2f346ep-24f;
3622
3623 auto C = B.buildFConstant(Res: Ty, Val: IsExp10 ? c_exp10 : c_exp);
3624 PH = B.buildFMul(Dst: Ty, Src0: X, Src1: C, Flags).getReg(Idx: 0);
3625 auto NegPH = B.buildFNeg(Dst: Ty, Src0: PH, Flags);
3626 auto FMA0 = B.buildFMA(Dst: Ty, Src0: X, Src1: C, Src2: NegPH, Flags);
3627
3628 auto CC = B.buildFConstant(Res: Ty, Val: IsExp10 ? cc_exp10 : cc_exp);
3629 PL = B.buildFMA(Dst: Ty, Src0: X, Src1: CC, Src2: FMA0, Flags).getReg(Idx: 0);
3630 } else {
3631 const float ch_exp = 0x1.714000p+0f;
3632 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3633
3634 const float ch_exp10 = 0x1.a92000p+1f;
3635 const float cl_exp10 = 0x1.4f0978p-11f;
3636
3637 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3638 auto XH = B.buildAnd(Dst: Ty, Src0: X, Src1: MaskConst);
3639 auto XL = B.buildFSub(Dst: Ty, Src0: X, Src1: XH, Flags);
3640
3641 auto CH = B.buildFConstant(Res: Ty, Val: IsExp10 ? ch_exp10 : ch_exp);
3642 PH = B.buildFMul(Dst: Ty, Src0: XH, Src1: CH, Flags).getReg(Idx: 0);
3643
3644 auto CL = B.buildFConstant(Res: Ty, Val: IsExp10 ? cl_exp10 : cl_exp);
3645 auto XLCL = B.buildFMul(Dst: Ty, Src0: XL, Src1: CL, Flags);
3646
3647 Register Mad0 =
3648 getMad(B, Ty, X: XL.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: XLCL.getReg(Idx: 0), Flags);
3649 PL = getMad(B, Ty, X: XH.getReg(Idx: 0), Y: CL.getReg(Idx: 0), Z: Mad0, Flags);
3650 }
3651
3652 auto E = B.buildIntrinsicRoundeven(Dst: Ty, Src0: PH, Flags);
3653
3654 // It is unsafe to contract this fsub into the PH multiply.
3655 auto PHSubE = B.buildFSub(Dst: Ty, Src0: PH, Src1: E, Flags: FlagsNoContract);
3656 auto A = B.buildFAdd(Dst: Ty, Src0: PHSubE, Src1: PL, Flags);
3657 auto IntE = B.buildFPTOSI(Dst: LLT::scalar(SizeInBits: 32), Src0: E);
3658
3659 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3660 .addUse(A.getReg(0))
3661 .setMIFlags(Flags);
3662 auto R = B.buildFLdexp(Dst: Ty, Src0: Exp2, Src1: IntE, Flags);
3663
3664 auto UnderflowCheckConst =
3665 B.buildFConstant(Res: Ty, Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3666 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3667 auto Underflow =
3668 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: UnderflowCheckConst);
3669
3670 R = B.buildSelect(Res: Ty, Tst: Underflow, Op0: Zero, Op1: R);
3671
3672 const auto &Options = MF.getTarget().Options;
3673
3674 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3675 auto OverflowCheckConst =
3676 B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3677
3678 auto Overflow =
3679 B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: OverflowCheckConst);
3680 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3681 R = B.buildSelect(Res: Ty, Tst: Overflow, Op0: Inf, Op1: R, Flags);
3682 }
3683
3684 B.buildCopy(Res: Dst, Op: R);
3685 MI.eraseFromParent();
3686 return true;
3687}
3688
3689bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3690 MachineIRBuilder &B) const {
3691 Register Dst = MI.getOperand(i: 0).getReg();
3692 Register Src0 = MI.getOperand(i: 1).getReg();
3693 Register Src1 = MI.getOperand(i: 2).getReg();
3694 unsigned Flags = MI.getFlags();
3695 LLT Ty = B.getMRI()->getType(Reg: Dst);
3696 const LLT F16 = LLT::float16();
3697 const LLT F32 = LLT::float32();
3698
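 // pow(x, y) is expanded as exp2(y * log2(x)). The legacy multiply is used so
 // that pow(x, 0) still yields 1 even when log2(x) is infinite, since
 // fmul_legacy treats 0 * inf as 0.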
3699 if (Ty == F32) {
3700 auto Log = B.buildFLog2(Dst: F32, Src: Src0, Flags);
3701 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3702 .addUse(Log.getReg(0))
3703 .addUse(Src1)
3704 .setMIFlags(Flags);
3705 B.buildFExp2(Dst, Src: Mul, Flags);
3706 } else if (Ty == F16) {
3707 // There's no f16 fmul_legacy, so we need to convert for it.
3708 auto Log = B.buildFLog2(Dst: F16, Src: Src0, Flags);
3709 auto Ext0 = B.buildFPExt(Res: F32, Op: Log, Flags);
3710 auto Ext1 = B.buildFPExt(Res: F32, Op: Src1, Flags);
3711 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3712 .addUse(Ext0.getReg(0))
3713 .addUse(Ext1.getReg(0))
3714 .setMIFlags(Flags);
3715 B.buildFExp2(Dst, Src: B.buildFPTrunc(Res: F16, Op: Mul), Flags);
3716 } else
3717 return false;
3718
3719 MI.eraseFromParent();
3720 return true;
3721}
3722
3723// Find a source register, ignoring any possible source modifiers.
3724static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3725 Register ModSrc = OrigSrc;
3726 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3727 ModSrc = SrcFNeg->getOperand(i: 1).getReg();
3728 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3729 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
3730 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3731 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
3732 return ModSrc;
3733}
3734
3735bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3736 MachineRegisterInfo &MRI,
3737 MachineIRBuilder &B) const {
3738
3739 const LLT S1 = LLT::scalar(SizeInBits: 1);
3740 const LLT F64 = LLT::float64();
3741 Register Dst = MI.getOperand(i: 0).getReg();
3742 Register OrigSrc = MI.getOperand(i: 1).getReg();
3743 unsigned Flags = MI.getFlags();
3744 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3745 "this should not have been custom lowered");
3746
3747 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3748 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3749 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3750 // V_FRACT bug is:
3751 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3752 //
3753 // Convert floor(x) to (x - fract(x))
3754
3755 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3756 .addUse(OrigSrc)
3757 .setMIFlags(Flags);
3758
3759 // Give source modifier matching some assistance before obscuring a foldable
3760 // pattern.
3761
3762 // TODO: We can avoid the neg on the fract? The input sign to fract
3763 // shouldn't matter?
3764 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3765
3766 auto Const =
3767 B.buildFConstant(Res: F64, Val: llvm::bit_cast<double>(from: 0x3fefffffffffffff));
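 // (0x3fefffffffffffff is the largest double strictly less than 1.0, i.e. the
 // 0.99999999999999999 clamp from the formula above.)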
3768
3769 Register Min = MRI.createGenericVirtualRegister(Ty: F64);
3770
3771 // We don't need to concern ourselves with the snan handling difference, so
3772 // use the one which will directly select.
3773 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3774 if (MFI->getMode().IEEE)
3775 B.buildFMinNumIEEE(Dst: Min, Src0: Fract, Src1: Const, Flags);
3776 else
3777 B.buildFMinNum(Dst: Min, Src0: Fract, Src1: Const, Flags);
3778
3779 Register CorrectedFract = Min;
3780 if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
3781 auto IsNan = B.buildFCmp(Pred: CmpInst::FCMP_ORD, Res: S1, Op0: ModSrc, Op1: ModSrc, Flags);
3782 CorrectedFract = B.buildSelect(Res: F64, Tst: IsNan, Op0: ModSrc, Op1: Min, Flags).getReg(Idx: 0);
3783 }
3784
3785 auto NegFract = B.buildFNeg(Dst: F64, Src0: CorrectedFract, Flags);
3786 B.buildFAdd(Dst, Src0: OrigSrc, Src1: NegFract, Flags);
3787
3788 MI.eraseFromParent();
3789 return true;
3790}
3791
3792// Turn an illegal packed v2s16 build vector into bit operations.
3793// TODO: This should probably be a bitcast action in LegalizerHelper.
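// The lowering is simply a bitcast of the 32-bit merge of the two 16-bit
// sources, e.g. (<2 x s16>) = G_BITCAST (s32 G_MERGE_VALUES lo16, hi16),
// truncating the sources first in the G_BUILD_VECTOR_TRUNC case.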
3794bool AMDGPULegalizerInfo::legalizeBuildVector(
3795 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3796 Register Dst = MI.getOperand(i: 0).getReg();
3797 const LLT S32 = LLT::scalar(SizeInBits: 32);
3798 const LLT S16 = LLT::scalar(SizeInBits: 16);
3799 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3800
3801 Register Src0 = MI.getOperand(i: 1).getReg();
3802 Register Src1 = MI.getOperand(i: 2).getReg();
3803
3804 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3805 assert(MRI.getType(Src0) == S32);
3806 Src0 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 1).getReg()).getReg(Idx: 0);
3807 Src1 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 2).getReg()).getReg(Idx: 0);
3808 }
3809
3810 auto Merge = B.buildMergeLikeInstr(Res: S32, Ops: {Src0, Src1});
3811 B.buildBitcast(Dst, Src: Merge);
3812
3813 MI.eraseFromParent();
3814 return true;
3815}
3816
3817// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3818//
3819// Source and accumulation registers must all be 32-bits.
3820//
3821// TODO: When the multiply is uniform, we should produce a code sequence
3822// that is better suited to instruction selection on the SALU. Instead of
3823// the outer loop going over parts of the result, the outer loop should go
3824// over parts of one of the factors. This should result in instruction
3825// selection that makes full use of S_ADDC_U32 instructions.
3826void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3827 MutableArrayRef<Register> Accum,
3828 ArrayRef<Register> Src0,
3829 ArrayRef<Register> Src1,
3830 bool UsePartialMad64_32,
3831 bool SeparateOddAlignedProducts) const {
3832 // Use (possibly empty) vectors of S1 registers to represent the set of
3833 // carries from one pair of positions to the next.
3834 using Carry = SmallVector<Register, 2>;
3835
3836 MachineIRBuilder &B = Helper.MIRBuilder;
3837 GISelKnownBits &KB = *Helper.getKnownBits();
3838
3839 const LLT S1 = LLT::scalar(SizeInBits: 1);
3840 const LLT S32 = LLT::scalar(SizeInBits: 32);
3841 const LLT S64 = LLT::scalar(SizeInBits: 64);
3842
3843 Register Zero32;
3844 Register Zero64;
3845
3846 auto getZero32 = [&]() -> Register {
3847 if (!Zero32)
3848 Zero32 = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
3849 return Zero32;
3850 };
3851 auto getZero64 = [&]() -> Register {
3852 if (!Zero64)
3853 Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
3854 return Zero64;
3855 };
3856
3857 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3858 for (unsigned i = 0; i < Src0.size(); ++i) {
3859 Src0KnownZeros.push_back(Elt: KB.getKnownBits(R: Src0[i]).isZero());
3860 Src1KnownZeros.push_back(Elt: KB.getKnownBits(R: Src1[i]).isZero());
3861 }
3862
3863 // Merge the given carries into the 32-bit LocalAccum, which is modified
3864 // in-place.
3865 //
3866 // Returns the carry-out, which is a single S1 register or null.
3867 auto mergeCarry =
3868 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3869 if (CarryIn.empty())
3870 return Register();
3871
3872 bool HaveCarryOut = true;
3873 Register CarryAccum;
3874 if (CarryIn.size() == 1) {
3875 if (!LocalAccum) {
3876 LocalAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
3877 return Register();
3878 }
3879
3880 CarryAccum = getZero32();
3881 } else {
3882 CarryAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
3883 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3884 CarryAccum =
3885 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: getZero32(), CarryIn: CarryIn[i])
3886 .getReg(Idx: 0);
3887 }
3888
3889 if (!LocalAccum) {
3890 LocalAccum = getZero32();
3891 HaveCarryOut = false;
3892 }
3893 }
3894
3895 auto Add =
3896 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: LocalAccum, CarryIn: CarryIn.back());
3897 LocalAccum = Add.getReg(Idx: 0);
3898 return HaveCarryOut ? Add.getReg(Idx: 1) : Register();
3899 };
3900
3901 // Build a multiply-add chain to compute
3902 //
3903 // LocalAccum + (partial products at DstIndex)
3904 // + (opportunistic subset of CarryIn)
3905 //
3906 // LocalAccum is an array of one or two 32-bit registers that are updated
3907 // in-place. The incoming registers may be null.
3908 //
3909 // In some edge cases, carry-ins can be consumed "for free". In that case,
3910 // the consumed carry bits are removed from CarryIn in-place.
3911 auto buildMadChain =
3912 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3913 -> Carry {
3914 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3915 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3916
3917 Carry CarryOut;
3918 unsigned j0 = 0;
3919
3920 // Use plain 32-bit multiplication for the most significant part of the
3921 // result by default.
3922 if (LocalAccum.size() == 1 &&
3923 (!UsePartialMad64_32 || !CarryIn.empty())) {
3924 do {
3925 // Skip multiplication if one of the operands is 0
3926 unsigned j1 = DstIndex - j0;
3927 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3928 ++j0;
3929 continue;
3930 }
3931 auto Mul = B.buildMul(Dst: S32, Src0: Src0[j0], Src1: Src1[j1]);
3932 if (!LocalAccum[0] || KB.getKnownBits(R: LocalAccum[0]).isZero()) {
3933 LocalAccum[0] = Mul.getReg(Idx: 0);
3934 } else {
3935 if (CarryIn.empty()) {
3936 LocalAccum[0] = B.buildAdd(Dst: S32, Src0: LocalAccum[0], Src1: Mul).getReg(Idx: 0);
3937 } else {
3938 LocalAccum[0] =
3939 B.buildUAdde(Res: S32, CarryOut: S1, Op0: LocalAccum[0], Op1: Mul, CarryIn: CarryIn.back())
3940 .getReg(Idx: 0);
3941 CarryIn.pop_back();
3942 }
3943 }
3944 ++j0;
3945 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3946 }
3947
3948 // Build full 64-bit multiplies.
3949 if (j0 <= DstIndex) {
3950 bool HaveSmallAccum = false;
3951 Register Tmp;
3952
3953 if (LocalAccum[0]) {
3954 if (LocalAccum.size() == 1) {
3955 Tmp = B.buildAnyExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
3956 HaveSmallAccum = true;
3957 } else if (LocalAccum[1]) {
3958 Tmp = B.buildMergeLikeInstr(Res: S64, Ops: LocalAccum).getReg(Idx: 0);
3959 HaveSmallAccum = false;
3960 } else {
3961 Tmp = B.buildZExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
3962 HaveSmallAccum = true;
3963 }
3964 } else {
3965 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3966 Tmp = getZero64();
3967 HaveSmallAccum = true;
3968 }
3969
3970 do {
3971 unsigned j1 = DstIndex - j0;
3972 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3973 ++j0;
3974 continue;
3975 }
3976 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3977 {Src0[j0], Src1[j1], Tmp});
3978 Tmp = Mad.getReg(0);
3979 if (!HaveSmallAccum)
3980 CarryOut.push_back(Elt: Mad.getReg(1));
3981 HaveSmallAccum = false;
3982
3983 ++j0;
3984 } while (j0 <= DstIndex);
3985
3986 auto Unmerge = B.buildUnmerge(Res: S32, Op: Tmp);
3987 LocalAccum[0] = Unmerge.getReg(Idx: 0);
3988 if (LocalAccum.size() > 1)
3989 LocalAccum[1] = Unmerge.getReg(Idx: 1);
3990 }
3991
3992 return CarryOut;
3993 };
3994
3995 // Outer multiply loop, iterating over destination parts from least
3996 // significant to most significant parts.
3997 //
3998 // The columns of the following diagram correspond to the destination parts
3999 // affected by one iteration of the outer loop (ignoring boundary
4000 // conditions).
4001 //
4002 //   Dest index relative to 2 * i:      1 0     -1
4003 //                                      ------
4004 //   Carries from previous iteration:       e o
4005 //   Even-aligned partial product sum:    E E .
4006 //   Odd-aligned partial product sum:      O O
4007 //
4008 // 'o' is OddCarry, 'e' is EvenCarry.
4009 // EE and OO are computed from partial products via buildMadChain and use
4010 // accumulation where possible and appropriate.
4011 //
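 // For example, for a 64-bit multiply (Accum.size() == 2) this roughly builds
 //   {Accum[1], Accum[0]} = mad_u64_u32(Src0[0], Src1[0], 0)          // i == 0
 //   Accum[1] += lo32(Src0[0] * Src1[1]) + lo32(Src0[1] * Src1[0])    // i == 1
 // with the exact instruction mix depending on UsePartialMad64_32 and
 // SeparateOddAlignedProducts.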
4012 Register SeparateOddCarry;
4013 Carry EvenCarry;
4014 Carry OddCarry;
4015
4016 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4017 Carry OddCarryIn = std::move(OddCarry);
4018 Carry EvenCarryIn = std::move(EvenCarry);
4019 OddCarry.clear();
4020 EvenCarry.clear();
4021
4022 // Partial products at offset 2 * i.
4023 if (2 * i < Accum.size()) {
4024 auto LocalAccum = Accum.drop_front(N: 2 * i).take_front(N: 2);
4025 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4026 }
4027
4028 // Partial products at offset 2 * i - 1.
4029 if (i > 0) {
4030 if (!SeparateOddAlignedProducts) {
4031 auto LocalAccum = Accum.drop_front(N: 2 * i - 1).take_front(N: 2);
4032 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4033 } else {
4034 bool IsHighest = 2 * i >= Accum.size();
4035 Register SeparateOddOut[2];
4036 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4037 .take_front(N: IsHighest ? 1 : 2);
4038 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4039
4040 MachineInstr *Lo;
4041
4042 if (i == 1) {
4043 if (!IsHighest)
4044 Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0]);
4045 else
4046 Lo = B.buildAdd(Dst: S32, Src0: Accum[2 * i - 1], Src1: SeparateOddOut[0]);
4047 } else {
4048 Lo = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0],
4049 CarryIn: SeparateOddCarry);
4050 }
4051 Accum[2 * i - 1] = Lo->getOperand(i: 0).getReg();
4052
4053 if (!IsHighest) {
4054 auto Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i], Op1: SeparateOddOut[1],
4055 CarryIn: Lo->getOperand(i: 1).getReg());
4056 Accum[2 * i] = Hi.getReg(Idx: 0);
4057 SeparateOddCarry = Hi.getReg(Idx: 1);
4058 }
4059 }
4060 }
4061
4062 // Add in the carries from the previous iteration
4063 if (i > 0) {
4064 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4065 EvenCarryIn.push_back(Elt: CarryOut);
4066
4067 if (2 * i < Accum.size()) {
4068 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4069 OddCarry.push_back(Elt: CarryOut);
4070 }
4071 }
4072 }
4073}
4074
4075// Custom narrowing of wide multiplies using wide multiply-add instructions.
4076//
4077// TODO: If the multiply is followed by an addition, we should attempt to
4078// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4079bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4080 MachineInstr &MI) const {
4081 assert(ST.hasMad64_32());
4082 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4083
4084 MachineIRBuilder &B = Helper.MIRBuilder;
4085 MachineRegisterInfo &MRI = *B.getMRI();
4086
4087 Register DstReg = MI.getOperand(i: 0).getReg();
4088 Register Src0 = MI.getOperand(i: 1).getReg();
4089 Register Src1 = MI.getOperand(i: 2).getReg();
4090
4091 LLT Ty = MRI.getType(Reg: DstReg);
4092 assert(Ty.isScalar());
4093
4094 unsigned Size = Ty.getSizeInBits();
4095 unsigned NumParts = Size / 32;
4096 assert((Size % 32) == 0);
4097 assert(NumParts >= 2);
4098
4099 // Whether to use MAD_64_32 for partial products whose high half is
4100 // discarded. This avoids some ADD instructions but risks false dependency
4101 // stalls on some subtargets in some cases.
4102 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4103
4104 // Whether to compute odd-aligned partial products separately. This is
4105 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4106 // in an even-aligned VGPR.
4107 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4108
4109 LLT S32 = LLT::scalar(SizeInBits: 32);
4110 SmallVector<Register, 2> Src0Parts, Src1Parts;
4111 for (unsigned i = 0; i < NumParts; ++i) {
4112 Src0Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4113 Src1Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4114 }
4115 B.buildUnmerge(Res: Src0Parts, Op: Src0);
4116 B.buildUnmerge(Res: Src1Parts, Op: Src1);
4117
4118 SmallVector<Register, 2> AccumRegs(NumParts);
4119 buildMultiply(Helper, Accum: AccumRegs, Src0: Src0Parts, Src1: Src1Parts, UsePartialMad64_32,
4120 SeparateOddAlignedProducts);
4121
4122 B.buildMergeLikeInstr(Res: DstReg, Ops: AccumRegs);
4123 MI.eraseFromParent();
4124 return true;
4125}
4126
4127// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4128// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4129// case with a single min instruction instead of a compare+select.
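// For example, G_CTLZ of an s32 value x becomes umin(ffbh_u32(x), 32): the
// ffbh/ffbl pseudos return -1 for a zero input, which the unsigned min clamps
// to the bit width.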
4130bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4131 MachineRegisterInfo &MRI,
4132 MachineIRBuilder &B) const {
4133 Register Dst = MI.getOperand(i: 0).getReg();
4134 Register Src = MI.getOperand(i: 1).getReg();
4135 LLT DstTy = MRI.getType(Reg: Dst);
4136 LLT SrcTy = MRI.getType(Reg: Src);
4137
4138 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4139 ? AMDGPU::G_AMDGPU_FFBH_U32
4140 : AMDGPU::G_AMDGPU_FFBL_B32;
4141 auto Tmp = B.buildInstr(Opc: NewOpc, DstOps: {DstTy}, SrcOps: {Src});
4142 B.buildUMin(Dst, Src0: Tmp, Src1: B.buildConstant(Res: DstTy, Val: SrcTy.getSizeInBits()));
4143
4144 MI.eraseFromParent();
4145 return true;
4146}
4147
4148// Check that this is a G_XOR x, -1
4149static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4150 if (MI.getOpcode() != TargetOpcode::G_XOR)
4151 return false;
4152 auto ConstVal = getIConstantVRegSExtVal(VReg: MI.getOperand(i: 2).getReg(), MRI);
4153 return ConstVal && *ConstVal == -1;
4154}
4155
4156 // Return the use branch instruction, or null if the usage is invalid.
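// The expected shape is roughly:
//   %c:_(s1) = G_INTRINSIC ...
//   G_BRCOND %c(s1), %bb.then
//   G_BR %bb.else
// optionally with a single G_XOR %c, -1 between the intrinsic and the
// G_BRCOND, and with the trailing G_BR possibly replaced by a fall-through
// into the next block.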
4157static MachineInstr *
4158verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4159 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4160 Register CondDef = MI.getOperand(i: 0).getReg();
4161 if (!MRI.hasOneNonDBGUse(RegNo: CondDef))
4162 return nullptr;
4163
4164 MachineBasicBlock *Parent = MI.getParent();
4165 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(RegNo: CondDef);
4166
4167 if (isNot(MRI, MI: *UseMI)) {
4168 Register NegatedCond = UseMI->getOperand(i: 0).getReg();
4169 if (!MRI.hasOneNonDBGUse(RegNo: NegatedCond))
4170 return nullptr;
4171
4172 // We're deleting the def of this value, so we need to remove it.
4173 eraseInstr(MI&: *UseMI, MRI);
4174
4175 UseMI = &*MRI.use_instr_nodbg_begin(RegNo: NegatedCond);
4176 Negated = true;
4177 }
4178
4179 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4180 return nullptr;
4181
4182 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4183 MachineBasicBlock::iterator Next = std::next(x: UseMI->getIterator());
4184 if (Next == Parent->end()) {
4185 MachineFunction::iterator NextMBB = std::next(x: Parent->getIterator());
4186 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4187 return nullptr;
4188 UncondBrTarget = &*NextMBB;
4189 } else {
4190 if (Next->getOpcode() != AMDGPU::G_BR)
4191 return nullptr;
4192 Br = &*Next;
4193 UncondBrTarget = Br->getOperand(i: 0).getMBB();
4194 }
4195
4196 return UseMI;
4197}
4198
4199bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4200 const ArgDescriptor *Arg,
4201 const TargetRegisterClass *ArgRC,
4202 LLT ArgTy) const {
4203 MCRegister SrcReg = Arg->getRegister();
4204 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4205 assert(DstReg.isVirtual() && "Virtual register expected");
4206
4207 Register LiveIn = getFunctionLiveInPhysReg(MF&: B.getMF(), TII: B.getTII(), PhysReg: SrcReg,
4208 RC: *ArgRC, DL: B.getDebugLoc(), RegTy: ArgTy);
4209 if (Arg->isMasked()) {
4210 // TODO: Should we try to emit this once in the entry block?
4211 const LLT S32 = LLT::scalar(SizeInBits: 32);
4212 const unsigned Mask = Arg->getMask();
4213 const unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
4214
4215 Register AndMaskSrc = LiveIn;
4216
4217 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4218 // 0.
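 // For example, with the workitem ID Y packed at bits [19:10] of the input
 // register, Mask == 0x000FFC00, Shift == 10, and the value is extracted as
 // (LiveIn >> 10) & 0x3ff.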
4219 if (Shift != 0) {
4220 auto ShiftAmt = B.buildConstant(Res: S32, Val: Shift);
4221 AndMaskSrc = B.buildLShr(Dst: S32, Src0: LiveIn, Src1: ShiftAmt).getReg(Idx: 0);
4222 }
4223
4224 B.buildAnd(Dst: DstReg, Src0: AndMaskSrc, Src1: B.buildConstant(Res: S32, Val: Mask >> Shift));
4225 } else {
4226 B.buildCopy(Res: DstReg, Op: LiveIn);
4227 }
4228
4229 return true;
4230}
4231
4232bool AMDGPULegalizerInfo::loadInputValue(
4233 Register DstReg, MachineIRBuilder &B,
4234 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4235 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4236 const ArgDescriptor *Arg = nullptr;
4237 const TargetRegisterClass *ArgRC;
4238 LLT ArgTy;
4239
4240 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4241 const ArgDescriptor WorkGroupIDX =
4242 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4243 // If GridZ is not programmed in an entry function then the hardware will set
4244 // it to all zeros, so there is no need to mask the GridY value in the low
4245 // order bits.
4246 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4247 AMDGPU::TTMP7,
4248 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4249 const ArgDescriptor WorkGroupIDZ =
4250 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4251 if (ST.hasArchitectedSGPRs() &&
4252 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4253 switch (ArgType) {
4254 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4255 Arg = &WorkGroupIDX;
4256 ArgRC = &AMDGPU::SReg_32RegClass;
4257 ArgTy = LLT::scalar(SizeInBits: 32);
4258 break;
4259 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4260 Arg = &WorkGroupIDY;
4261 ArgRC = &AMDGPU::SReg_32RegClass;
4262 ArgTy = LLT::scalar(SizeInBits: 32);
4263 break;
4264 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4265 Arg = &WorkGroupIDZ;
4266 ArgRC = &AMDGPU::SReg_32RegClass;
4267 ArgTy = LLT::scalar(SizeInBits: 32);
4268 break;
4269 default:
4270 break;
4271 }
4272 }
4273
4274 if (!Arg)
4275 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
4276
4277 if (!Arg) {
4278 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4279 // The intrinsic may appear when we have a zero-sized kernarg segment, in
4280 // which case the pointer argument may be missing and we use null.
4281 B.buildConstant(Res: DstReg, Val: 0);
4282 return true;
4283 }
4284
4285 // It's undefined behavior if a function marked with the amdgpu-no-*
4286 // attributes uses the corresponding intrinsic.
4287 B.buildUndef(Res: DstReg);
4288 return true;
4289 }
4290
4291 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4292 return false; // TODO: Handle these
4293 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4294}
4295
4296bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4297 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4298 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4299 if (!loadInputValue(DstReg: MI.getOperand(i: 0).getReg(), B, ArgType))
4300 return false;
4301
4302 MI.eraseFromParent();
4303 return true;
4304}
4305
4306static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4307 int64_t C) {
4308 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: C);
4309 MI.eraseFromParent();
4310 return true;
4311}
4312
4313bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4314 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4315 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4316 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4317 if (MaxID == 0)
4318 return replaceWithConstant(B, MI, C: 0);
4319
4320 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4321 const ArgDescriptor *Arg;
4322 const TargetRegisterClass *ArgRC;
4323 LLT ArgTy;
4324 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
4325
4326 Register DstReg = MI.getOperand(i: 0).getReg();
4327 if (!Arg) {
4328 // It's undefined behavior if a function marked with the amdgpu-no-*
4329 // attributes uses the corresponding intrinsic.
4330 B.buildUndef(Res: DstReg);
4331 MI.eraseFromParent();
4332 return true;
4333 }
4334
4335 if (Arg->isMasked()) {
4336 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4337 // masking operations anyway.
4338 //
4339 // TODO: We could assert the top bit is 0 for the source copy.
4340 if (!loadInputValue(DstReg, B, ArgType))
4341 return false;
4342 } else {
4343 Register TmpReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
4344 if (!loadInputValue(DstReg: TmpReg, B, ArgType))
4345 return false;
4346 B.buildAssertZExt(Res: DstReg, Op: TmpReg, Size: llvm::bit_width(Value: MaxID));
4347 }
4348
4349 MI.eraseFromParent();
4350 return true;
4351}
4352
4353Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4354 int64_t Offset) const {
4355 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
4356 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);
4357
4358 // TODO: If we passed in the base kernel offset we could have a better
4359 // alignment than 4, but we don't really need it.
4360 if (!loadInputValue(DstReg: KernArgReg, B,
4361 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4362 llvm_unreachable("failed to find kernarg segment ptr");
4363
4364 auto COffset = B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset);
4365 // TODO: Should get nuw
4366 return B.buildPtrAdd(Res: PtrTy, Op0: KernArgReg, Op1: COffset).getReg(Idx: 0);
4367}
4368
4369/// Legalize a value that's loaded from kernel arguments. This is only used by
4370/// legacy intrinsics.
4371bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4372 MachineIRBuilder &B,
4373 uint64_t Offset,
4374 Align Alignment) const {
4375 Register DstReg = MI.getOperand(i: 0).getReg();
4376
4377 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4378 "unexpected kernarg parameter type");
4379
4380 Register Ptr = getKernargParameterPtr(B, Offset);
4381 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4382 B.buildLoad(Res: DstReg, Addr: Ptr, PtrInfo, Alignment: Align(4),
4383 MMOFlags: MachineMemOperand::MODereferenceable |
4384 MachineMemOperand::MOInvariant);
4385 MI.eraseFromParent();
4386 return true;
4387}
4388
4389bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4390 MachineRegisterInfo &MRI,
4391 MachineIRBuilder &B) const {
4392 Register Dst = MI.getOperand(i: 0).getReg();
4393 LLT DstTy = MRI.getType(Reg: Dst);
4394 LLT S16 = LLT::scalar(SizeInBits: 16);
4395 LLT S32 = LLT::scalar(SizeInBits: 32);
4396 LLT S64 = LLT::scalar(SizeInBits: 64);
4397
4398 if (DstTy == S16)
4399 return legalizeFDIV16(MI, MRI, B);
4400 if (DstTy == S32)
4401 return legalizeFDIV32(MI, MRI, B);
4402 if (DstTy == S64)
4403 return legalizeFDIV64(MI, MRI, B);
4404
4405 return false;
4406}
4407
4408void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4409 Register DstDivReg,
4410 Register DstRemReg,
4411 Register X,
4412 Register Y) const {
4413 const LLT S1 = LLT::scalar(SizeInBits: 1);
4414 const LLT S32 = LLT::scalar(SizeInBits: 32);
4415
4416 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4417 // algorithm used here.
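  //
  // As an illustrative scalar sketch only (umulh() here stands for the high
  // 32 bits of the 64-bit product and is not a real helper in this file):
  //
  //   float rcp  = 1.0f / (float)y;          // V_RCP_IFLAG_F32 estimate
  //   uint32_t z = (uint32_t)(rcp * 0x1.fffffcp31f); // 0x4f7ffffe, ~2**32
  //   z += umulh(z, 0u - y * z);             // one Newton-Raphson step
  //   uint32_t q = umulh(x, z);              // quotient estimate
  //   uint32_t r = x - q * y;                // remainder estimate
  //   if (r >= y) { ++q; r -= y; }           // first refinement
  //   if (r >= y) { ++q; r -= y; }           // second refinement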
4418
4419 // Initial estimate of inv(y).
4420 auto FloatY = B.buildUITOFP(Dst: S32, Src0: Y);
4421 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4422 auto Scale = B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f7ffffe));
4423 auto ScaledY = B.buildFMul(Dst: S32, Src0: RcpIFlag, Src1: Scale);
4424 auto Z = B.buildFPTOUI(Dst: S32, Src0: ScaledY);
4425
4426 // One round of UNR.
4427 auto NegY = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 0), Src1: Y);
4428 auto NegYZ = B.buildMul(Dst: S32, Src0: NegY, Src1: Z);
4429 Z = B.buildAdd(Dst: S32, Src0: Z, Src1: B.buildUMulH(Dst: S32, Src0: Z, Src1: NegYZ));
4430
4431 // Quotient/remainder estimate.
4432 auto Q = B.buildUMulH(Dst: S32, Src0: X, Src1: Z);
4433 auto R = B.buildSub(Dst: S32, Src0: X, Src1: B.buildMul(Dst: S32, Src0: Q, Src1: Y));
4434
4435 // First quotient/remainder refinement.
4436 auto One = B.buildConstant(Res: S32, Val: 1);
4437 auto Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
4438 if (DstDivReg)
4439 Q = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
4440 R = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
4441
4442 // Second quotient/remainder refinement.
4443 Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
4444 if (DstDivReg)
4445 B.buildSelect(Res: DstDivReg, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
4446
4447 if (DstRemReg)
4448 B.buildSelect(Res: DstRemReg, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
4449}
4450
4451// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4452//
4453// Return lo, hi of result
4454//
4455// %cvt.lo = G_UITOFP Val.lo
4456// %cvt.hi = G_UITOFP Val.hi
4457// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4458// %rcp = G_AMDGPU_RCP_IFLAG %mad
4459// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4460// %mul2 = G_FMUL %mul1, 2**(-32)
4461// %trunc = G_INTRINSIC_TRUNC %mul2
4462// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4463// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
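//
// The returned pair is, in effect, a fixed-point approximation of
// 2**64 / Val, split into its low and high 32-bit halves.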
4464static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4465 Register Val) {
4466 const LLT S32 = LLT::scalar(SizeInBits: 32);
4467 auto Unmerge = B.buildUnmerge(Res: S32, Op: Val);
4468
4469 auto CvtLo = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 0));
4470 auto CvtHi = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
4471
4472 auto Mad = B.buildFMAD(
4473 Dst: S32, Src0: CvtHi, // 2**32
4474 Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f800000)), Src2: CvtLo);
4475
4476 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4477 auto Mul1 = B.buildFMul(
4478 Dst: S32, Src0: Rcp, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x5f7ffffc)));
4479
4480 // 2**(-32)
4481 auto Mul2 = B.buildFMul(
4482 Dst: S32, Src0: Mul1, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x2f800000)));
4483 auto Trunc = B.buildIntrinsicTrunc(Dst: S32, Src0: Mul2);
4484
4485 // -(2**32)
4486 auto Mad2 = B.buildFMAD(
4487 Dst: S32, Src0: Trunc, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0xcf800000)),
4488 Src2: Mul1);
4489
4490 auto ResultLo = B.buildFPTOUI(Dst: S32, Src0: Mad2);
4491 auto ResultHi = B.buildFPTOUI(Dst: S32, Src0: Trunc);
4492
4493 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4494}
4495
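// 64-bit unsigned division/remainder expansion. This mirrors the 32-bit
// expansion above: start from the fixed-point reciprocal produced by
// emitReciprocalU64, apply two Newton-Raphson style refinements using 64-bit
// multiply-high, form a quotient/remainder estimate, and then conditionally
// apply up to two +1 / -Denom corrections.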
4496void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4497 Register DstDivReg,
4498 Register DstRemReg,
4499 Register Numer,
4500 Register Denom) const {
4501 const LLT S32 = LLT::scalar(SizeInBits: 32);
4502 const LLT S64 = LLT::scalar(SizeInBits: 64);
4503 const LLT S1 = LLT::scalar(SizeInBits: 1);
4504 Register RcpLo, RcpHi;
4505
4506 std::tie(args&: RcpLo, args&: RcpHi) = emitReciprocalU64(B, Val: Denom);
4507
4508 auto Rcp = B.buildMergeLikeInstr(Res: S64, Ops: {RcpLo, RcpHi});
4509
4510 auto Zero64 = B.buildConstant(Res: S64, Val: 0);
4511 auto NegDenom = B.buildSub(Dst: S64, Src0: Zero64, Src1: Denom);
4512
4513 auto MulLo1 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Rcp);
4514 auto MulHi1 = B.buildUMulH(Dst: S64, Src0: Rcp, Src1: MulLo1);
4515
4516 auto UnmergeMulHi1 = B.buildUnmerge(Res: S32, Op: MulHi1);
4517 Register MulHi1_Lo = UnmergeMulHi1.getReg(Idx: 0);
4518 Register MulHi1_Hi = UnmergeMulHi1.getReg(Idx: 1);
4519
4520 auto Add1_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: RcpLo, Op1: MulHi1_Lo);
4521 auto Add1_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: RcpHi, Op1: MulHi1_Hi, CarryIn: Add1_Lo.getReg(Idx: 1));
4522 auto Add1 = B.buildMergeLikeInstr(Res: S64, Ops: {Add1_Lo, Add1_Hi});
4523
4524 auto MulLo2 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Add1);
4525 auto MulHi2 = B.buildUMulH(Dst: S64, Src0: Add1, Src1: MulLo2);
4526 auto UnmergeMulHi2 = B.buildUnmerge(Res: S32, Op: MulHi2);
4527 Register MulHi2_Lo = UnmergeMulHi2.getReg(Idx: 0);
4528 Register MulHi2_Hi = UnmergeMulHi2.getReg(Idx: 1);
4529
4530 auto Zero32 = B.buildConstant(Res: S32, Val: 0);
4531 auto Add2_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Add1_Lo, Op1: MulHi2_Lo);
4532 auto Add2_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Add1_Hi, Op1: MulHi2_Hi, CarryIn: Add2_Lo.getReg(Idx: 1));
4533 auto Add2 = B.buildMergeLikeInstr(Res: S64, Ops: {Add2_Lo, Add2_Hi});
4534
4535 auto UnmergeNumer = B.buildUnmerge(Res: S32, Op: Numer);
4536 Register NumerLo = UnmergeNumer.getReg(Idx: 0);
4537 Register NumerHi = UnmergeNumer.getReg(Idx: 1);
4538
4539 auto MulHi3 = B.buildUMulH(Dst: S64, Src0: Numer, Src1: Add2);
4540 auto Mul3 = B.buildMul(Dst: S64, Src0: Denom, Src1: MulHi3);
4541 auto UnmergeMul3 = B.buildUnmerge(Res: S32, Op: Mul3);
4542 Register Mul3_Lo = UnmergeMul3.getReg(Idx: 0);
4543 Register Mul3_Hi = UnmergeMul3.getReg(Idx: 1);
4544 auto Sub1_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: NumerLo, Op1: Mul3_Lo);
4545 auto Sub1_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: NumerHi, Op1: Mul3_Hi, CarryIn: Sub1_Lo.getReg(Idx: 1));
4546 auto Sub1_Mi = B.buildSub(Dst: S32, Src0: NumerHi, Src1: Mul3_Hi);
4547 auto Sub1 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub1_Lo, Sub1_Hi});
4548
4549 auto UnmergeDenom = B.buildUnmerge(Res: S32, Op: Denom);
4550 Register DenomLo = UnmergeDenom.getReg(Idx: 0);
4551 Register DenomHi = UnmergeDenom.getReg(Idx: 1);
4552
4553 auto CmpHi = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
4554 auto C1 = B.buildSExt(Res: S32, Op: CmpHi);
4555
4556 auto CmpLo = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Lo, Op1: DenomLo);
4557 auto C2 = B.buildSExt(Res: S32, Op: CmpLo);
4558
4559 auto CmpEq = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
4560 auto C3 = B.buildSelect(Res: S32, Tst: CmpEq, Op0: C2, Op1: C1);
4561
4562 // TODO: Here and below, portions of the code could be enclosed in if/endif
4563 // blocks. Currently the control flow is unconditional and we have 4 selects
4564 // after the potential endif to substitute for PHIs.
4565
4566 // if C3 != 0 ...
4567 auto Sub2_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub1_Lo, Op1: DenomLo);
4568 auto Sub2_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub1_Mi, Op1: DenomHi, CarryIn: Sub1_Lo.getReg(Idx: 1));
4569 auto Sub2_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: Zero32, CarryIn: Sub2_Lo.getReg(Idx: 1));
4570 auto Sub2 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub2_Lo, Sub2_Hi});
4571
4572 auto One64 = B.buildConstant(Res: S64, Val: 1);
4573 auto Add3 = B.buildAdd(Dst: S64, Src0: MulHi3, Src1: One64);
4574
4575 auto C4 =
4576 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Hi, Op1: DenomHi));
4577 auto C5 =
4578 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Lo, Op1: DenomLo));
4579 auto C6 = B.buildSelect(
4580 Res: S32, Tst: B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub2_Hi, Op1: DenomHi), Op0: C5, Op1: C4);
4581
4582 // if (C6 != 0)
4583 auto Add4 = B.buildAdd(Dst: S64, Src0: Add3, Src1: One64);
4584 auto Sub3_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub2_Lo, Op1: DenomLo);
4585
4586 auto Sub3_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: DenomHi, CarryIn: Sub2_Lo.getReg(Idx: 1));
4587 auto Sub3_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub3_Mi, Op1: Zero32, CarryIn: Sub3_Lo.getReg(Idx: 1));
4588 auto Sub3 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub3_Lo, Sub3_Hi});
4589
4590 // endif C6
4591 // endif C3
4592
4593 if (DstDivReg) {
4594 auto Sel1 = B.buildSelect(
4595 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Add4, Op1: Add3);
4596 B.buildSelect(Res: DstDivReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
4597 Op0: Sel1, Op1: MulHi3);
4598 }
4599
4600 if (DstRemReg) {
4601 auto Sel2 = B.buildSelect(
4602 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Sub3, Op1: Sub2);
4603 B.buildSelect(Res: DstRemReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
4604 Op0: Sel2, Op1: Sub1);
4605 }
4606}
4607
4608bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4609 MachineRegisterInfo &MRI,
4610 MachineIRBuilder &B) const {
4611 Register DstDivReg, DstRemReg;
4612 switch (MI.getOpcode()) {
4613 default:
4614 llvm_unreachable("Unexpected opcode!");
4615 case AMDGPU::G_UDIV: {
4616 DstDivReg = MI.getOperand(i: 0).getReg();
4617 break;
4618 }
4619 case AMDGPU::G_UREM: {
4620 DstRemReg = MI.getOperand(i: 0).getReg();
4621 break;
4622 }
4623 case AMDGPU::G_UDIVREM: {
4624 DstDivReg = MI.getOperand(i: 0).getReg();
4625 DstRemReg = MI.getOperand(i: 1).getReg();
4626 break;
4627 }
4628 }
4629
4630 const LLT S64 = LLT::scalar(SizeInBits: 64);
4631 const LLT S32 = LLT::scalar(SizeInBits: 32);
4632 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4633 Register Num = MI.getOperand(i: FirstSrcOpIdx).getReg();
4634 Register Den = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
4635 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4636
4637 if (Ty == S32)
4638 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, X: Num, Y: Den);
4639 else if (Ty == S64)
4640 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Numer: Num, Denom: Den);
4641 else
4642 return false;
4643
4644 MI.eraseFromParent();
4645 return true;
4646}
4647
4648bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4649 MachineRegisterInfo &MRI,
4650 MachineIRBuilder &B) const {
4651 const LLT S64 = LLT::scalar(SizeInBits: 64);
4652 const LLT S32 = LLT::scalar(SizeInBits: 32);
4653
4654 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4655 if (Ty != S32 && Ty != S64)
4656 return false;
4657
4658 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4659 Register LHS = MI.getOperand(i: FirstSrcOpIdx).getReg();
4660 Register RHS = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
4661
4662 auto SignBitOffset = B.buildConstant(Res: S32, Val: Ty.getSizeInBits() - 1);
4663 auto LHSign = B.buildAShr(Dst: Ty, Src0: LHS, Src1: SignBitOffset);
4664 auto RHSign = B.buildAShr(Dst: Ty, Src0: RHS, Src1: SignBitOffset);
4665
4666 LHS = B.buildAdd(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
4667 RHS = B.buildAdd(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
4668
4669 LHS = B.buildXor(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
4670 RHS = B.buildXor(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
4671
4672 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4673 switch (MI.getOpcode()) {
4674 default:
4675 llvm_unreachable("Unexpected opcode!");
4676 case AMDGPU::G_SDIV: {
4677 DstDivReg = MI.getOperand(i: 0).getReg();
4678 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4679 break;
4680 }
4681 case AMDGPU::G_SREM: {
4682 DstRemReg = MI.getOperand(i: 0).getReg();
4683 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4684 break;
4685 }
4686 case AMDGPU::G_SDIVREM: {
4687 DstDivReg = MI.getOperand(i: 0).getReg();
4688 DstRemReg = MI.getOperand(i: 1).getReg();
4689 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4690 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4691 break;
4692 }
4693 }
4694
4695 if (Ty == S32)
4696 legalizeUnsignedDIV_REM32Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, X: LHS, Y: RHS);
4697 else
4698 legalizeUnsignedDIV_REM64Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, Numer: LHS, Denom: RHS);
4699
4700 if (DstDivReg) {
4701 auto Sign = B.buildXor(Dst: Ty, Src0: LHSign, Src1: RHSign).getReg(Idx: 0);
4702 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpDivReg, Src1: Sign).getReg(Idx: 0);
4703 B.buildSub(Dst: DstDivReg, Src0: SignXor, Src1: Sign);
4704 }
4705
4706 if (DstRemReg) {
4707 auto Sign = LHSign.getReg(Idx: 0); // Remainder sign is the same as LHS
4708 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpRemReg, Src1: Sign).getReg(Idx: 0);
4709 B.buildSub(Dst: DstRemReg, Src0: SignXor, Src1: Sign);
4710 }
4711
4712 MI.eraseFromParent();
4713 return true;
4714}
4715
4716bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4717 MachineRegisterInfo &MRI,
4718 MachineIRBuilder &B) const {
4719 Register Res = MI.getOperand(i: 0).getReg();
4720 Register LHS = MI.getOperand(i: 1).getReg();
4721 Register RHS = MI.getOperand(i: 2).getReg();
4722 uint16_t Flags = MI.getFlags();
4723 LLT ResTy = MRI.getType(Reg: Res);
4724
4725 const MachineFunction &MF = B.getMF();
4726 bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn) ||
4727 MF.getTarget().Options.UnsafeFPMath;
4728
4729 if (auto CLHS = getConstantFPVRegVal(VReg: LHS, MRI)) {
4730 if (!AllowInaccurateRcp && ResTy != LLT::scalar(SizeInBits: 16))
4731 return false;
4732
4733 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4734 // the CI documentation they have a worst-case error of 1 ulp.
4735 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4736 // use them as long as we aren't trying to use denormals.
4737 //
4738 // v_rcp_f16 and v_rsq_f16 DO support denormals, with a 0.51 ulp error.
4739
4740 // 1 / x -> RCP(x)
4741 if (CLHS->isExactlyValue(V: 1.0)) {
4742 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4743 .addUse(RHS)
4744 .setMIFlags(Flags);
4745
4746 MI.eraseFromParent();
4747 return true;
4748 }
4749
4750 // -1 / x -> RCP( FNEG(x) )
4751 if (CLHS->isExactlyValue(V: -1.0)) {
4752 auto FNeg = B.buildFNeg(Dst: ResTy, Src0: RHS, Flags);
4753 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4754 .addUse(FNeg.getReg(0))
4755 .setMIFlags(Flags);
4756
4757 MI.eraseFromParent();
4758 return true;
4759 }
4760 }
4761
4762 // For f16 require afn or arcp.
4763 // For f32 require afn.
4764 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(SizeInBits: 16) ||
4765 !MI.getFlag(Flag: MachineInstr::FmArcp)))
4766 return false;
4767
4768 // x / y -> x * (1.0 / y)
4769 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4770 .addUse(RHS)
4771 .setMIFlags(Flags);
4772 B.buildFMul(Dst: Res, Src0: LHS, Src1: RCP, Flags);
4773
4774 MI.eraseFromParent();
4775 return true;
4776}
4777
4778bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4779 MachineRegisterInfo &MRI,
4780 MachineIRBuilder &B) const {
4781 Register Res = MI.getOperand(i: 0).getReg();
4782 Register X = MI.getOperand(i: 1).getReg();
4783 Register Y = MI.getOperand(i: 2).getReg();
4784 uint16_t Flags = MI.getFlags();
4785 LLT ResTy = MRI.getType(Reg: Res);
4786
4787 const MachineFunction &MF = B.getMF();
4788 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4789 MI.getFlag(Flag: MachineInstr::FmAfn);
4790
4791 if (!AllowInaccurateRcp)
4792 return false;
4793
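  // x / y is expanded as r = rcp(y) refined by two FMA-based Newton-Raphson
  // steps (r += r * (1 - y * r)), followed by ret = x * r and one final
  // correction ret += r * (x - y * ret).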
4794 auto NegY = B.buildFNeg(Dst: ResTy, Src0: Y);
4795 auto One = B.buildFConstant(Res: ResTy, Val: 1.0);
4796
4797 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4798 .addUse(Y)
4799 .setMIFlags(Flags);
4800
4801 auto Tmp0 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
4802 R = B.buildFMA(Dst: ResTy, Src0: Tmp0, Src1: R, Src2: R);
4803
4804 auto Tmp1 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
4805 R = B.buildFMA(Dst: ResTy, Src0: Tmp1, Src1: R, Src2: R);
4806
4807 auto Ret = B.buildFMul(Dst: ResTy, Src0: X, Src1: R);
4808 auto Tmp2 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: Ret, Src2: X);
4809
4810 B.buildFMA(Dst: Res, Src0: Tmp2, Src1: R, Src2: Ret);
4811 MI.eraseFromParent();
4812 return true;
4813}
4814
4815bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4816 MachineRegisterInfo &MRI,
4817 MachineIRBuilder &B) const {
4818 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4819 return true;
4820
4821 Register Res = MI.getOperand(i: 0).getReg();
4822 Register LHS = MI.getOperand(i: 1).getReg();
4823 Register RHS = MI.getOperand(i: 2).getReg();
4824
4825 uint16_t Flags = MI.getFlags();
4826
4827 LLT S16 = LLT::scalar(SizeInBits: 16);
4828 LLT S32 = LLT::scalar(SizeInBits: 32);
4829
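  // Expand at f32 precision: take the rcp of the extended RHS, multiply by
  // the extended LHS, truncate back to f16, and let amdgcn.div.fixup handle
  // the final rounding and special cases.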
4830 auto LHSExt = B.buildFPExt(Res: S32, Op: LHS, Flags);
4831 auto RHSExt = B.buildFPExt(Res: S32, Op: RHS, Flags);
4832
4833 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4834 .addUse(RHSExt.getReg(0))
4835 .setMIFlags(Flags);
4836
4837 auto QUOT = B.buildFMul(Dst: S32, Src0: LHSExt, Src1: RCP, Flags);
4838 auto RDst = B.buildFPTrunc(Res: S16, Op: QUOT, Flags);
4839
4840 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4841 .addUse(RDst.getReg(0))
4842 .addUse(RHS)
4843 .addUse(LHS)
4844 .setMIFlags(Flags);
4845
4846 MI.eraseFromParent();
4847 return true;
4848}
4849
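// Hardware register field for the FP32 denormal mode: bits [5:4] of the MODE
// register (the next two bits, [7:6], hold the FP64/FP16 denormal mode).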
4850static constexpr unsigned SPDenormModeBitField =
4851 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
4852
4853// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4854// to enable denorm mode; otherwise restore the function's default FP32 mode.
4855static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4856 const GCNSubtarget &ST,
4857 SIModeRegisterDefaults Mode) {
4858 // Set SP denorm mode to this value.
4859 unsigned SPDenormMode =
4860 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4861
4862 if (ST.hasDenormModeInst()) {
4863 // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
4864 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4865
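    // S_DENORM_MODE takes a 4-bit immediate: bits [1:0] select the FP32
    // denorm mode and bits [3:2] the FP64/FP16 mode, hence the shift by 2.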
4866 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4867 B.buildInstr(AMDGPU::S_DENORM_MODE)
4868 .addImm(NewDenormModeValue);
4869
4870 } else {
4871 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4872 .addImm(SPDenormMode)
4873 .addImm(SPDenormModeBitField);
4874 }
4875}
4876
4877bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4878 MachineRegisterInfo &MRI,
4879 MachineIRBuilder &B) const {
4880 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4881 return true;
4882
4883 Register Res = MI.getOperand(i: 0).getReg();
4884 Register LHS = MI.getOperand(i: 1).getReg();
4885 Register RHS = MI.getOperand(i: 2).getReg();
4886 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4887 SIModeRegisterDefaults Mode = MFI->getMode();
4888
4889 uint16_t Flags = MI.getFlags();
4890
4891 LLT S32 = LLT::scalar(SizeInBits: 32);
4892 LLT S1 = LLT::scalar(SizeInBits: 1);
4893
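  // Full-precision expansion: scale the operands with amdgcn.div.scale,
  // refine an rcp estimate with FMA-based iterations (temporarily enabling
  // FP32 denormals if the mode doesn't already preserve them), then combine
  // the pieces with amdgcn.div.fmas and amdgcn.div.fixup.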
4894 auto One = B.buildFConstant(Res: S32, Val: 1.0f);
4895
4896 auto DenominatorScaled =
4897 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4898 .addUse(LHS)
4899 .addUse(RHS)
4900 .addImm(0)
4901 .setMIFlags(Flags);
4902 auto NumeratorScaled =
4903 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4904 .addUse(LHS)
4905 .addUse(RHS)
4906 .addImm(1)
4907 .setMIFlags(Flags);
4908
4909 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4910 .addUse(DenominatorScaled.getReg(0))
4911 .setMIFlags(Flags);
4912 auto NegDivScale0 = B.buildFNeg(Dst: S32, Src0: DenominatorScaled, Flags);
4913
4914 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4915 const bool HasDynamicDenormals =
4916 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4917 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4918
4919 Register SavedSPDenormMode;
4920 if (!PreservesDenormals) {
4921 if (HasDynamicDenormals) {
4922 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4923 B.buildInstr(AMDGPU::S_GETREG_B32)
4924 .addDef(SavedSPDenormMode)
4925 .addImm(SPDenormModeBitField);
4926 }
4927 toggleSPDenormMode(Enable: true, B, ST, Mode);
4928 }
4929
4930 auto Fma0 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: ApproxRcp, Src2: One, Flags);
4931 auto Fma1 = B.buildFMA(Dst: S32, Src0: Fma0, Src1: ApproxRcp, Src2: ApproxRcp, Flags);
4932 auto Mul = B.buildFMul(Dst: S32, Src0: NumeratorScaled, Src1: Fma1, Flags);
4933 auto Fma2 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Mul, Src2: NumeratorScaled, Flags);
4934 auto Fma3 = B.buildFMA(Dst: S32, Src0: Fma2, Src1: Fma1, Src2: Mul, Flags);
4935 auto Fma4 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Fma3, Src2: NumeratorScaled, Flags);
4936
4937 if (!PreservesDenormals) {
4938 if (HasDynamicDenormals) {
4939 assert(SavedSPDenormMode);
4940 B.buildInstr(AMDGPU::S_SETREG_B32)
4941 .addReg(SavedSPDenormMode)
4942 .addImm(SPDenormModeBitField);
4943 } else
4944 toggleSPDenormMode(Enable: false, B, ST, Mode);
4945 }
4946
4947 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
4948 .addUse(Fma4.getReg(0))
4949 .addUse(Fma1.getReg(0))
4950 .addUse(Fma3.getReg(0))
4951 .addUse(NumeratorScaled.getReg(1))
4952 .setMIFlags(Flags);
4953
4954 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4955 .addUse(Fmas.getReg(0))
4956 .addUse(RHS)
4957 .addUse(LHS)
4958 .setMIFlags(Flags);
4959
4960 MI.eraseFromParent();
4961 return true;
4962}
4963
4964bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
4965 MachineRegisterInfo &MRI,
4966 MachineIRBuilder &B) const {
4967 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
4968 return true;
4969
4970 Register Res = MI.getOperand(i: 0).getReg();
4971 Register LHS = MI.getOperand(i: 1).getReg();
4972 Register RHS = MI.getOperand(i: 2).getReg();
4973
4974 uint16_t Flags = MI.getFlags();
4975
4976 LLT S64 = LLT::scalar(SizeInBits: 64);
4977 LLT S1 = LLT::scalar(SizeInBits: 1);
4978
4979 auto One = B.buildFConstant(Res: S64, Val: 1.0);
4980
4981 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4982 .addUse(LHS)
4983 .addUse(RHS)
4984 .addImm(0)
4985 .setMIFlags(Flags);
4986
4987 auto NegDivScale0 = B.buildFNeg(Dst: S64, Src0: DivScale0.getReg(0), Flags);
4988
4989 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
4990 .addUse(DivScale0.getReg(0))
4991 .setMIFlags(Flags);
4992
4993 auto Fma0 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Rcp, Src2: One, Flags);
4994 auto Fma1 = B.buildFMA(Dst: S64, Src0: Rcp, Src1: Fma0, Src2: Rcp, Flags);
4995 auto Fma2 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Fma1, Src2: One, Flags);
4996
4997 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4998 .addUse(LHS)
4999 .addUse(RHS)
5000 .addImm(1)
5001 .setMIFlags(Flags);
5002
5003 auto Fma3 = B.buildFMA(Dst: S64, Src0: Fma1, Src1: Fma2, Src2: Fma1, Flags);
5004 auto Mul = B.buildFMul(Dst: S64, Src0: DivScale1.getReg(0), Src1: Fma3, Flags);
5005 auto Fma4 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Mul, Src2: DivScale1.getReg(0), Flags);
5006
5007 Register Scale;
5008 if (!ST.hasUsableDivScaleConditionOutput()) {
5009 // Workaround a hardware bug on SI where the condition output from div_scale
5010 // is not usable.
5011
5012 LLT S32 = LLT::scalar(SizeInBits: 32);
5013
5014 auto NumUnmerge = B.buildUnmerge(Res: S32, Op: LHS);
5015 auto DenUnmerge = B.buildUnmerge(Res: S32, Op: RHS);
5016 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5017 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5018
5019 auto CmpNum = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: NumUnmerge.getReg(Idx: 1),
5020 Op1: Scale1Unmerge.getReg(1));
5021 auto CmpDen = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: DenUnmerge.getReg(Idx: 1),
5022 Op1: Scale0Unmerge.getReg(1));
5023 Scale = B.buildXor(Dst: S1, Src0: CmpNum, Src1: CmpDen).getReg(0);
5024 } else {
5025 Scale = DivScale1.getReg(1);
5026 }
5027
5028 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5029 .addUse(Fma4.getReg(0))
5030 .addUse(Fma3.getReg(0))
5031 .addUse(Mul.getReg(0))
5032 .addUse(Scale)
5033 .setMIFlags(Flags);
5034
5035 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5036 .addUse(Fmas.getReg(0))
5037 .addUse(RHS)
5038 .addUse(LHS)
5039 .setMIFlags(Flags);
5040
5041 MI.eraseFromParent();
5042 return true;
5043}
5044
5045bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5046 MachineRegisterInfo &MRI,
5047 MachineIRBuilder &B) const {
5048 Register Res0 = MI.getOperand(i: 0).getReg();
5049 Register Res1 = MI.getOperand(i: 1).getReg();
5050 Register Val = MI.getOperand(i: 2).getReg();
5051 uint16_t Flags = MI.getFlags();
5052
5053 LLT Ty = MRI.getType(Reg: Res0);
5054 LLT InstrExpTy = Ty == LLT::scalar(SizeInBits: 16) ? LLT::scalar(SizeInBits: 16) : LLT::scalar(SizeInBits: 32);
5055
5056 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5057 .addUse(Val)
5058 .setMIFlags(Flags);
5059 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5060 .addUse(Val)
5061 .setMIFlags(Flags);
5062
5063 if (ST.hasFractBug()) {
5064 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Val);
5065 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: getFltSemanticForLLT(Ty)));
5066 auto IsFinite =
5067 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
5068 auto Zero = B.buildConstant(Res: InstrExpTy, Val: 0);
5069 Exp = B.buildSelect(Res: InstrExpTy, Tst: IsFinite, Op0: Exp, Op1: Zero);
5070 Mant = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: Mant, Op1: Val);
5071 }
5072
5073 B.buildCopy(Res: Res0, Op: Mant);
5074 B.buildSExtOrTrunc(Res: Res1, Op: Exp);
5075
5076 MI.eraseFromParent();
5077 return true;
5078}
5079
5080bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5081 MachineRegisterInfo &MRI,
5082 MachineIRBuilder &B) const {
5083 Register Res = MI.getOperand(i: 0).getReg();
5084 Register LHS = MI.getOperand(i: 2).getReg();
5085 Register RHS = MI.getOperand(i: 3).getReg();
5086 uint16_t Flags = MI.getFlags();
5087
5088 LLT S32 = LLT::scalar(SizeInBits: 32);
5089 LLT S1 = LLT::scalar(SizeInBits: 1);
5090
5091 auto Abs = B.buildFAbs(Dst: S32, Src0: RHS, Flags);
5092 const APFloat C0Val(1.0f);
5093
5094 auto C0 = B.buildFConstant(Res: S32, Val: 0x1p+96f);
5095 auto C1 = B.buildFConstant(Res: S32, Val: 0x1p-32f);
5096 auto C2 = B.buildFConstant(Res: S32, Val: 1.0f);
5097
5098 auto CmpRes = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Abs, Op1: C0, Flags);
5099 auto Sel = B.buildSelect(Res: S32, Tst: CmpRes, Op0: C1, Op1: C2, Flags);
5100
5101 auto Mul0 = B.buildFMul(Dst: S32, Src0: RHS, Src1: Sel, Flags);
5102
5103 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5104 .addUse(Mul0.getReg(0))
5105 .setMIFlags(Flags);
5106
5107 auto Mul1 = B.buildFMul(Dst: S32, Src0: LHS, Src1: RCP, Flags);
5108
5109 B.buildFMul(Dst: Res, Src0: Sel, Src1: Mul1, Flags);
5110
5111 MI.eraseFromParent();
5112 return true;
5113}
5114
5115bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5116 MachineRegisterInfo &MRI,
5117 MachineIRBuilder &B) const {
5118 // Bypass the correct expansion that a standard promotion through G_FSQRT
5119 // would get. The f32 op is accurate enough for the f16 case.
5120 unsigned Flags = MI.getFlags();
5121 assert(!ST.has16BitInsts());
5122 const LLT F32 = LLT::scalar(SizeInBits: 32);
5123 auto Ext = B.buildFPExt(Res: F32, Op: MI.getOperand(i: 1), Flags);
5124 auto Sqrt = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5125 .addUse(Ext.getReg(0))
5126 .setMIFlags(Flags);
5127 B.buildFPTrunc(Res: MI.getOperand(i: 0), Op: Sqrt, Flags);
5128 MI.eraseFromParent();
5129 return true;
5130}
5131
5132bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5133 MachineRegisterInfo &MRI,
5134 MachineIRBuilder &B) const {
5135 MachineFunction &MF = B.getMF();
5136 Register Dst = MI.getOperand(i: 0).getReg();
5137 Register X = MI.getOperand(i: 1).getReg();
5138 const unsigned Flags = MI.getFlags();
5139 const LLT S1 = LLT::scalar(SizeInBits: 1);
5140 const LLT F32 = LLT::scalar(SizeInBits: 32);
5141 const LLT I32 = LLT::scalar(SizeInBits: 32);
5142
5143 if (allowApproxFunc(MF, Flags)) {
5144 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5145 .addUse(X)
5146 .setMIFlags(Flags);
5147 MI.eraseFromParent();
5148 return true;
5149 }
5150
5151 auto ScaleThreshold = B.buildFConstant(Res: F32, Val: 0x1.0p-96f);
5152 auto NeedScale = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: ScaleThreshold, Op1: X, Flags);
5153 auto ScaleUpFactor = B.buildFConstant(Res: F32, Val: 0x1.0p+32f);
5154 auto ScaledX = B.buildFMul(Dst: F32, Src0: X, Src1: ScaleUpFactor, Flags);
5155 auto SqrtX = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledX, Op1: X, Flags);
5156
5157 Register SqrtS = MRI.createGenericVirtualRegister(Ty: F32);
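  // When FP32 denormals must be handled, refine the hardware sqrt result by
  // forming the one-ulp-down and one-ulp-up neighbors (integer +/-1 on the
  // bit pattern), checking their residuals against SqrtX, and selecting the
  // best candidate; otherwise use an rsq-based Newton-Raphson refinement.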
5158 if (needsDenormHandlingF32(MF, Src: X, Flags)) {
5159 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5160 .addUse(SqrtX.getReg(0))
5161 .setMIFlags(Flags);
5162
5163 auto NegOne = B.buildConstant(Res: I32, Val: -1);
5164 auto SqrtSNextDown = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: NegOne);
5165
5166 auto NegSqrtSNextDown = B.buildFNeg(Dst: F32, Src0: SqrtSNextDown, Flags);
5167 auto SqrtVP = B.buildFMA(Dst: F32, Src0: NegSqrtSNextDown, Src1: SqrtS, Src2: SqrtX, Flags);
5168
5169 auto PosOne = B.buildConstant(Res: I32, Val: 1);
5170 auto SqrtSNextUp = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: PosOne);
5171
5172 auto NegSqrtSNextUp = B.buildFNeg(Dst: F32, Src0: SqrtSNextUp, Flags);
5173 auto SqrtVS = B.buildFMA(Dst: F32, Src0: NegSqrtSNextUp, Src1: SqrtS, Src2: SqrtX, Flags);
5174
5175 auto Zero = B.buildFConstant(Res: F32, Val: 0.0f);
5176 auto SqrtVPLE0 = B.buildFCmp(Pred: CmpInst::FCMP_OLE, Res: S1, Op0: SqrtVP, Op1: Zero, Flags);
5177
5178 SqrtS =
5179 B.buildSelect(Res: F32, Tst: SqrtVPLE0, Op0: SqrtSNextDown, Op1: SqrtS, Flags).getReg(Idx: 0);
5180
5181 auto SqrtVPVSGT0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: SqrtVS, Op1: Zero, Flags);
5182 SqrtS =
5183 B.buildSelect(Res: F32, Tst: SqrtVPVSGT0, Op0: SqrtSNextUp, Op1: SqrtS, Flags).getReg(Idx: 0);
5184 } else {
5185 auto SqrtR =
5186 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5187 B.buildFMul(Dst: SqrtS, Src0: SqrtX, Src1: SqrtR, Flags);
5188
5189 auto Half = B.buildFConstant(Res: F32, Val: 0.5f);
5190 auto SqrtH = B.buildFMul(Dst: F32, Src0: SqrtR, Src1: Half, Flags);
5191 auto NegSqrtH = B.buildFNeg(Dst: F32, Src0: SqrtH, Flags);
5192 auto SqrtE = B.buildFMA(Dst: F32, Src0: NegSqrtH, Src1: SqrtS, Src2: Half, Flags);
5193 SqrtH = B.buildFMA(Dst: F32, Src0: SqrtH, Src1: SqrtE, Src2: SqrtH, Flags);
5194 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtS, Src1: SqrtE, Src2: SqrtS, Flags).getReg(0);
5195 auto NegSqrtS = B.buildFNeg(Dst: F32, Src0: SqrtS, Flags);
5196 auto SqrtD = B.buildFMA(Dst: F32, Src0: NegSqrtS, Src1: SqrtS, Src2: SqrtX, Flags);
5197 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtD, Src1: SqrtH, Src2: SqrtS, Flags).getReg(0);
5198 }
5199
5200 auto ScaleDownFactor = B.buildFConstant(Res: F32, Val: 0x1.0p-16f);
5201
5202 auto ScaledDown = B.buildFMul(Dst: F32, Src0: SqrtS, Src1: ScaleDownFactor, Flags);
5203
5204 SqrtS = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledDown, Op1: SqrtS, Flags).getReg(Idx: 0);
5205
5206 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
5207 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtS, Flags);
5208
5209 MI.eraseFromParent();
5210 return true;
5211}
5212
5213bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5214 MachineRegisterInfo &MRI,
5215 MachineIRBuilder &B) const {
5216 // For double type, the SQRT and RSQ instructions don't have the required
5217 // precision, so we apply Goldschmidt's algorithm to improve the result:
5218 //
5219 // y0 = rsq(x)
5220 // g0 = x * y0
5221 // h0 = 0.5 * y0
5222 //
5223 // r0 = 0.5 - h0 * g0
5224 // g1 = g0 * r0 + g0
5225 // h1 = h0 * r0 + h0
5226 //
5227 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5228 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5229 // h2 = h1 * r1 + h1
5230 //
5231 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5232 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5233 //
5234 // sqrt(x) = g3
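  //
  // The expansion below additionally rescales very small inputs: values below
  // 2**-767 are multiplied by 2**256 before the iteration and the result is
  // multiplied by 2**-128 afterwards, since sqrt(x * 2**256) == sqrt(x) * 2**128.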
5235
5236 const LLT S1 = LLT::scalar(SizeInBits: 1);
5237 const LLT S32 = LLT::scalar(SizeInBits: 32);
5238 const LLT F64 = LLT::scalar(SizeInBits: 64);
5239
5240 Register Dst = MI.getOperand(i: 0).getReg();
5241 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5242
5243 Register X = MI.getOperand(i: 1).getReg();
5244 unsigned Flags = MI.getFlags();
5245
5246 auto ScaleConstant = B.buildFConstant(Res: F64, Val: 0x1.0p-767);
5247
5248 auto ZeroInt = B.buildConstant(Res: S32, Val: 0);
5249 auto Scaling = B.buildFCmp(Pred: FCmpInst::FCMP_OLT, Res: S1, Op0: X, Op1: ScaleConstant);
5250
5251 // Scale up input if it is too small.
5252 auto ScaleUpFactor = B.buildConstant(Res: S32, Val: 256);
5253 auto ScaleUp = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleUpFactor, Op1: ZeroInt);
5254 auto SqrtX = B.buildFLdexp(Dst: F64, Src0: X, Src1: ScaleUp, Flags);
5255
5256 auto SqrtY =
5257 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5258
5259 auto Half = B.buildFConstant(Res: F64, Val: 0.5);
5260 auto SqrtH0 = B.buildFMul(Dst: F64, Src0: SqrtY, Src1: Half);
5261 auto SqrtS0 = B.buildFMul(Dst: F64, Src0: SqrtX, Src1: SqrtY);
5262
5263 auto NegSqrtH0 = B.buildFNeg(Dst: F64, Src0: SqrtH0);
5264 auto SqrtR0 = B.buildFMA(Dst: F64, Src0: NegSqrtH0, Src1: SqrtS0, Src2: Half);
5265
5266 auto SqrtS1 = B.buildFMA(Dst: F64, Src0: SqrtS0, Src1: SqrtR0, Src2: SqrtS0);
5267 auto SqrtH1 = B.buildFMA(Dst: F64, Src0: SqrtH0, Src1: SqrtR0, Src2: SqrtH0);
5268
5269 auto NegSqrtS1 = B.buildFNeg(Dst: F64, Src0: SqrtS1);
5270 auto SqrtD0 = B.buildFMA(Dst: F64, Src0: NegSqrtS1, Src1: SqrtS1, Src2: SqrtX);
5271
5272 auto SqrtS2 = B.buildFMA(Dst: F64, Src0: SqrtD0, Src1: SqrtH1, Src2: SqrtS1);
5273
5274 auto NegSqrtS2 = B.buildFNeg(Dst: F64, Src0: SqrtS2);
5275 auto SqrtD1 = B.buildFMA(Dst: F64, Src0: NegSqrtS2, Src1: SqrtS2, Src2: SqrtX);
5276
5277 auto SqrtRet = B.buildFMA(Dst: F64, Src0: SqrtD1, Src1: SqrtH1, Src2: SqrtS2);
5278
5279 // Scale down the result.
5280 auto ScaleDownFactor = B.buildConstant(Res: S32, Val: -128);
5281 auto ScaleDown = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleDownFactor, Op1: ZeroInt);
5282 SqrtRet = B.buildFLdexp(Dst: F64, Src0: SqrtRet, Src1: ScaleDown, Flags);
5283
5284 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5285 // with finite only or nsz because rsq(+/-0) = +/-inf
5286
5287 // TODO: Check for DAZ and expand to subnormals
5288 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
5289
5290 // If x is +INF, +0, or -0, use its original value
5291 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtRet, Flags);
5292
5293 MI.eraseFromParent();
5294 return true;
5295}
5296
5297bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5298 MachineRegisterInfo &MRI,
5299 MachineIRBuilder &B) const {
5300 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5301 if (Ty == LLT::scalar(SizeInBits: 32))
5302 return legalizeFSQRTF32(MI, MRI, B);
5303 if (Ty == LLT::scalar(SizeInBits: 64))
5304 return legalizeFSQRTF64(MI, MRI, B);
5305 if (Ty == LLT::scalar(SizeInBits: 16))
5306 return legalizeFSQRTF16(MI, MRI, B);
5307 return false;
5308}
5309
5310// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5311// FIXME: Why do we handle this one but not other removed instructions?
5312//
5313// Reciprocal square root. The clamp prevents infinite results, clamping
5314// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5315// +-max_float.
5316bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5317 MachineRegisterInfo &MRI,
5318 MachineIRBuilder &B) const {
5319 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5320 return true;
5321
5322 Register Dst = MI.getOperand(i: 0).getReg();
5323 Register Src = MI.getOperand(i: 2).getReg();
5324 auto Flags = MI.getFlags();
5325
5326 LLT Ty = MRI.getType(Reg: Dst);
5327
5328 const fltSemantics *FltSemantics;
5329 if (Ty == LLT::scalar(SizeInBits: 32))
5330 FltSemantics = &APFloat::IEEEsingle();
5331 else if (Ty == LLT::scalar(SizeInBits: 64))
5332 FltSemantics = &APFloat::IEEEdouble();
5333 else
5334 return false;
5335
5336 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5337 .addUse(Src)
5338 .setMIFlags(Flags);
5339
5340 // We don't need to concern ourselves with the sNaN handling difference,
5341 // since rsq already quieted it (or not); use the form that directly selects.
5342 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5343 const bool UseIEEE = MFI->getMode().IEEE;
5344
5345 auto MaxFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics));
5346 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags) :
5347 B.buildFMinNum(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags);
5348
5349 auto MinFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics, Negative: true));
5350
5351 if (UseIEEE)
5352 B.buildFMaxNumIEEE(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5353 else
5354 B.buildFMaxNum(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5355 MI.eraseFromParent();
5356 return true;
5357}
5358
5359static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
5360 switch (IID) {
5361 case Intrinsic::amdgcn_ds_fadd:
5362 return AMDGPU::G_ATOMICRMW_FADD;
5363 case Intrinsic::amdgcn_ds_fmin:
5364 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
5365 case Intrinsic::amdgcn_ds_fmax:
5366 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
5367 default:
5368 llvm_unreachable("not a DS FP intrinsic");
5369 }
5370}
5371
5372bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
5373 MachineInstr &MI,
5374 Intrinsic::ID IID) const {
5375 GISelChangeObserver &Observer = Helper.Observer;
5376 Observer.changingInstr(MI);
5377
5378 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
5379
5380 // The remaining operands were used to set fields in the MemOperand on
5381 // construction.
5382 for (int I = 6; I > 3; --I)
5383 MI.removeOperand(OpNo: I);
5384
5385 MI.removeOperand(OpNo: 1); // Remove the intrinsic ID.
5386 Observer.changedInstr(MI);
5387 return true;
5388}
5389
5390bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5391 MachineRegisterInfo &MRI,
5392 MachineIRBuilder &B) const {
5393 uint64_t Offset =
5394 ST.getTargetLowering()->getImplicitParameterOffset(
5395 MF: B.getMF(), Param: AMDGPUTargetLowering::FIRST_IMPLICIT);
5396 LLT DstTy = MRI.getType(Reg: DstReg);
5397 LLT IdxTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
5398
5399 Register KernargPtrReg = MRI.createGenericVirtualRegister(Ty: DstTy);
5400 if (!loadInputValue(DstReg: KernargPtrReg, B,
5401 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5402 return false;
5403
5404 // FIXME: This should be nuw
5405 B.buildPtrAdd(Res: DstReg, Op0: KernargPtrReg, Op1: B.buildConstant(Res: IdxTy, Val: Offset).getReg(Idx: 0));
5406 return true;
5407}
5408
5409/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5410/// bits of the pointer and replace them with the stride argument, then
5411/// merge_values everything together. In the common case of a raw buffer (the
5412/// stride component is 0), we can just AND off the upper half.
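///
/// The resulting descriptor is, dword by dword (illustrative layout based on
/// the merge below):
///   { ptr[31:0], ptr[47:32] | (stride << 16), numRecords, flags }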
5413bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5414 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5415 Register Result = MI.getOperand(i: 0).getReg();
5416 Register Pointer = MI.getOperand(i: 2).getReg();
5417 Register Stride = MI.getOperand(i: 3).getReg();
5418 Register NumRecords = MI.getOperand(i: 4).getReg();
5419 Register Flags = MI.getOperand(i: 5).getReg();
5420
5421 LLT S32 = LLT::scalar(SizeInBits: 32);
5422
5423 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
5424 auto Unmerge = B.buildUnmerge(Res: S32, Op: Pointer);
5425 Register LowHalf = Unmerge.getReg(Idx: 0);
5426 Register HighHalf = Unmerge.getReg(Idx: 1);
5427
5428 auto AndMask = B.buildConstant(Res: S32, Val: 0x0000ffff);
5429 auto Masked = B.buildAnd(Dst: S32, Src0: HighHalf, Src1: AndMask);
5430
5431 MachineInstrBuilder NewHighHalf = Masked;
5432 std::optional<ValueAndVReg> StrideConst =
5433 getIConstantVRegValWithLookThrough(VReg: Stride, MRI);
5434 if (!StrideConst || !StrideConst->Value.isZero()) {
5435 MachineInstrBuilder ShiftedStride;
5436 if (StrideConst) {
5437 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5438 uint32_t ShiftedStrideVal = StrideVal << 16;
5439 ShiftedStride = B.buildConstant(Res: S32, Val: ShiftedStrideVal);
5440 } else {
5441 auto ExtStride = B.buildAnyExt(Res: S32, Op: Stride);
5442 auto ShiftConst = B.buildConstant(Res: S32, Val: 16);
5443 ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: ShiftConst);
5444 }
5445 NewHighHalf = B.buildOr(Dst: S32, Src0: Masked, Src1: ShiftedStride);
5446 }
5447 Register NewHighHalfReg = NewHighHalf.getReg(Idx: 0);
5448 B.buildMergeValues(Res: Result, Ops: {LowHalf, NewHighHalfReg, NumRecords, Flags});
5449 MI.eraseFromParent();
5450 return true;
5451}
5452
5453bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5454 MachineRegisterInfo &MRI,
5455 MachineIRBuilder &B) const {
5456 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5457 if (!MFI->isEntryFunction()) {
5458 return legalizePreloadedArgIntrin(MI, MRI, B,
5459 ArgType: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5460 }
5461
5462 Register DstReg = MI.getOperand(i: 0).getReg();
5463 if (!getImplicitArgPtr(DstReg, MRI, B))
5464 return false;
5465
5466 MI.eraseFromParent();
5467 return true;
5468}
5469
5470bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5471 MachineRegisterInfo &MRI,
5472 MachineIRBuilder &B) const {
5473 Function &F = B.getMF().getFunction();
5474 std::optional<uint32_t> KnownSize =
5475 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5476 if (KnownSize.has_value())
5477 B.buildConstant(Res: DstReg, Val: *KnownSize);
5478 return false;
5479}
5480
5481bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5482 MachineRegisterInfo &MRI,
5483 MachineIRBuilder &B) const {
5484
5485 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5486 if (!MFI->isEntryFunction()) {
5487 return legalizePreloadedArgIntrin(MI, MRI, B,
5488 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5489 }
5490
5491 Register DstReg = MI.getOperand(i: 0).getReg();
5492 if (!getLDSKernelId(DstReg, MRI, B))
5493 return false;
5494
5495 MI.eraseFromParent();
5496 return true;
5497}
5498
5499bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5500 MachineRegisterInfo &MRI,
5501 MachineIRBuilder &B,
5502 unsigned AddrSpace) const {
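  // A generic (flat) pointer addresses the given segment iff the high 32 bits
  // of the pointer value equal that segment's aperture base.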
5503 Register ApertureReg = getSegmentAperture(AS: AddrSpace, MRI, B);
5504 auto Unmerge = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: MI.getOperand(i: 2).getReg());
5505 Register Hi32 = Unmerge.getReg(Idx: 1);
5506
5507 B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: MI.getOperand(i: 0), Op0: Hi32, Op1: ApertureReg);
5508 MI.eraseFromParent();
5509 return true;
5510}
5511
5512// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5513// offset (the offset that is included in bounds checking and swizzling, to be
5514// split between the instruction's voffset and immoffset fields) and soffset
5515// (the offset that is excluded from bounds checking and swizzling, to go in
5516// the instruction's soffset field). This function takes the first kind of
5517// offset and figures out how to split it between voffset and immoffset.
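//
// Worked example (assuming a 4095-byte maximum immediate offset, the usual
// MUBUF limit; the actual value comes from getMaxMUBUFImmOffset):
//   an incoming offset of %base + 8195 is split into a voffset of
//   %base + 8192 and an immoffset of 3.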
5518std::pair<Register, unsigned>
5519AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5520 Register OrigOffset) const {
5521 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5522 Register BaseReg;
5523 unsigned ImmOffset;
5524 const LLT S32 = LLT::scalar(SizeInBits: 32);
5525 MachineRegisterInfo &MRI = *B.getMRI();
5526
5527 std::tie(args&: BaseReg, args&: ImmOffset) =
5528 AMDGPU::getBaseWithConstantOffset(MRI, Reg: OrigOffset);
5529
5530 // If BaseReg is a pointer, convert it to int.
5531 if (MRI.getType(Reg: BaseReg).isPointer())
5532 BaseReg = B.buildPtrToInt(Dst: MRI.getType(Reg: OrigOffset), Src: BaseReg).getReg(Idx: 0);
5533
5534 // If the immediate value is too big for the immoffset field, put only bits
5535 // that would normally fit in the immoffset field. The remaining value that
5536 // is copied/added for the voffset field is a large power of 2, and it
5537 // stands more chance of being CSEd with the copy/add for another similar
5538 // load/store.
5539 // However, do not do that rounding down if that is a negative
5540 // number, as it appears to be illegal to have a negative offset in the
5541 // vgpr, even if adding the immediate offset makes it positive.
5542 unsigned Overflow = ImmOffset & ~MaxImm;
5543 ImmOffset -= Overflow;
5544 if ((int32_t)Overflow < 0) {
5545 Overflow += ImmOffset;
5546 ImmOffset = 0;
5547 }
5548
5549 if (Overflow != 0) {
5550 if (!BaseReg) {
5551 BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
5552 } else {
5553 auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
5554 BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
5555 }
5556 }
5557
5558 if (!BaseReg)
5559 BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5560
5561 return std::pair(BaseReg, ImmOffset);
5562}
5563
5564/// Handle register layout difference for f16 images for some subtargets.
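/// For example, on subtargets with unpacked D16 VMEM, a <4 x s16> source is
/// any-extended element by element so the store operates on a <4 x s32>
/// value instead.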
5565Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5566 MachineRegisterInfo &MRI,
5567 Register Reg,
5568 bool ImageStore) const {
5569 const LLT S16 = LLT::scalar(SizeInBits: 16);
5570 const LLT S32 = LLT::scalar(SizeInBits: 32);
5571 LLT StoreVT = MRI.getType(Reg);
5572 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5573
5574 if (ST.hasUnpackedD16VMem()) {
5575 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
5576
5577 SmallVector<Register, 4> WideRegs;
5578 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5579 WideRegs.push_back(Elt: B.buildAnyExt(Res: S32, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
5580
5581 int NumElts = StoreVT.getNumElements();
5582
5583 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
5584 .getReg(Idx: 0);
5585 }
5586
5587 if (ImageStore && ST.hasImageStoreD16Bug()) {
5588 if (StoreVT.getNumElements() == 2) {
5589 SmallVector<Register, 4> PackedRegs;
5590 Reg = B.buildBitcast(Dst: S32, Src: Reg).getReg(Idx: 0);
5591 PackedRegs.push_back(Elt: Reg);
5592 PackedRegs.resize(N: 2, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
5593 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Ops: PackedRegs)
5594 .getReg(Idx: 0);
5595 }
5596
5597 if (StoreVT.getNumElements() == 3) {
5598 SmallVector<Register, 4> PackedRegs;
5599 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
5600 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5601 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
5602 PackedRegs.resize(N: 6, NV: B.buildUndef(Res: S16).getReg(Idx: 0));
5603 Reg = B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 6, ScalarTy: S16), Ops: PackedRegs).getReg(Idx: 0);
5604 return B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 3, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
5605 }
5606
5607 if (StoreVT.getNumElements() == 4) {
5608 SmallVector<Register, 4> PackedRegs;
5609 Reg = B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
5610 auto Unmerge = B.buildUnmerge(Res: S32, Op: Reg);
5611 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5612 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
5613 PackedRegs.resize(N: 4, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
5614 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S32), Ops: PackedRegs)
5615 .getReg(Idx: 0);
5616 }
5617
5618 llvm_unreachable("invalid data type");
5619 }
5620
5621 if (StoreVT == LLT::fixed_vector(NumElements: 3, ScalarTy: S16)) {
5622 Reg = B.buildPadVectorWithUndefElements(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S16), Op0: Reg)
5623 .getReg(Idx: 0);
5624 }
5625 return Reg;
5626}
5627
5628Register AMDGPULegalizerInfo::fixStoreSourceType(
5629 MachineIRBuilder &B, Register VData, bool IsFormat) const {
5630 MachineRegisterInfo *MRI = B.getMRI();
5631 LLT Ty = MRI->getType(Reg: VData);
5632
5633 const LLT S16 = LLT::scalar(SizeInBits: 16);
5634
5635 // Fixup buffer resources themselves, which need to be lowered to v4i32.
5636 if (hasBufferRsrcWorkaround(Ty))
5637 return castBufferRsrcToV4I32(Pointer: VData, B);
5638
5639 // Fixup illegal register types for 8-bit and 16-bit stores.
5640 if (Ty == LLT::scalar(SizeInBits: 8) || Ty == S16) {
5641 Register AnyExt = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: VData).getReg(Idx: 0);
5642 return AnyExt;
5643 }
5644
5645 if (Ty.isVector()) {
5646 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5647 if (IsFormat)
5648 return handleD16VData(B, MRI&: *MRI, Reg: VData);
5649 }
5650 }
5651
5652 return VData;
5653}
5654
5655bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5656 MachineRegisterInfo &MRI,
5657 MachineIRBuilder &B,
5658 bool IsTyped,
5659 bool IsFormat) const {
5660 Register VData = MI.getOperand(i: 1).getReg();
5661 LLT Ty = MRI.getType(Reg: VData);
5662 LLT EltTy = Ty.getScalarType();
5663 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5664 const LLT S32 = LLT::scalar(SizeInBits: 32);
5665
5666 VData = fixStoreSourceType(B, VData, IsFormat);
5667 castBufferRsrcArgToV4I32(MI, B, Idx: 2);
5668 Register RSrc = MI.getOperand(i: 2).getReg();
5669
5670 MachineMemOperand *MMO = *MI.memoperands_begin();
5671 const int MemSize = MMO->getSize().getValue();
5672
5673 unsigned ImmOffset;
5674
5675 // The typed intrinsics add an immediate after the registers.
5676 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5677
5678 // The struct intrinsic variants add one additional operand over raw.
5679 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5680 Register VIndex;
5681 int OpOffset = 0;
5682 if (HasVIndex) {
5683 VIndex = MI.getOperand(i: 3).getReg();
5684 OpOffset = 1;
5685 } else {
5686 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5687 }
5688
5689 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
5690 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
5691
5692 unsigned Format = 0;
5693 if (IsTyped) {
5694 Format = MI.getOperand(i: 5 + OpOffset).getImm();
5695 ++OpOffset;
5696 }
5697
5698 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
5699
5700 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
5701
5702 unsigned Opc;
5703 if (IsTyped) {
5704 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5705 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5706 } else if (IsFormat) {
5707 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5708 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5709 } else {
5710 switch (MemSize) {
5711 case 1:
5712 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5713 break;
5714 case 2:
5715 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5716 break;
5717 default:
5718 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5719 break;
5720 }
5721 }
5722
5723 auto MIB = B.buildInstr(Opcode: Opc)
5724 .addUse(RegNo: VData) // vdata
5725 .addUse(RegNo: RSrc) // rsrc
5726 .addUse(RegNo: VIndex) // vindex
5727 .addUse(RegNo: VOffset) // voffset
5728 .addUse(RegNo: SOffset) // soffset
5729 .addImm(Val: ImmOffset); // offset(imm)
5730
5731 if (IsTyped)
5732 MIB.addImm(Val: Format);
5733
5734 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5735 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
5736 .addMemOperand(MMO);
5737
5738 MI.eraseFromParent();
5739 return true;
5740}
5741
5742static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5743 Register VIndex, Register VOffset, Register SOffset,
5744 unsigned ImmOffset, unsigned Format,
5745 unsigned AuxiliaryData, MachineMemOperand *MMO,
5746 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5747 auto MIB = B.buildInstr(Opcode: Opc)
5748 .addDef(RegNo: LoadDstReg) // vdata
5749 .addUse(RegNo: RSrc) // rsrc
5750 .addUse(RegNo: VIndex) // vindex
5751 .addUse(RegNo: VOffset) // voffset
5752 .addUse(RegNo: SOffset) // soffset
5753 .addImm(Val: ImmOffset); // offset(imm)
5754
5755 if (IsTyped)
5756 MIB.addImm(Val: Format);
5757
5758 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5759 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
5760 .addMemOperand(MMO);
5761}
5762
5763bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5764 MachineRegisterInfo &MRI,
5765 MachineIRBuilder &B,
5766 bool IsFormat,
5767 bool IsTyped) const {
5768 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5769 MachineMemOperand *MMO = *MI.memoperands_begin();
5770 const LLT MemTy = MMO->getMemoryType();
5771 const LLT S32 = LLT::scalar(SizeInBits: 32);
5772
5773 Register Dst = MI.getOperand(i: 0).getReg();
5774
5775 Register StatusDst;
5776 int OpOffset = 0;
5777 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5778 bool IsTFE = MI.getNumExplicitDefs() == 2;
5779 if (IsTFE) {
5780 StatusDst = MI.getOperand(i: 1).getReg();
5781 ++OpOffset;
5782 }
5783
5784 castBufferRsrcArgToV4I32(MI, B, Idx: 2 + OpOffset);
5785 Register RSrc = MI.getOperand(i: 2 + OpOffset).getReg();
5786
5787 // The typed intrinsics add an immediate after the registers.
5788 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5789
5790 // The struct intrinsic variants add one additional operand over raw.
5791 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5792 Register VIndex;
5793 if (HasVIndex) {
5794 VIndex = MI.getOperand(i: 3 + OpOffset).getReg();
5795 ++OpOffset;
5796 } else {
5797 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5798 }
5799
5800 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
5801 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
5802
5803 unsigned Format = 0;
5804 if (IsTyped) {
5805 Format = MI.getOperand(i: 5 + OpOffset).getImm();
5806 ++OpOffset;
5807 }
5808
5809 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
5810 unsigned ImmOffset;
5811
5812 LLT Ty = MRI.getType(Reg: Dst);
5813  // Turn loads of addrspace 8 pointers into 4 x s32 loads here, so the rest of
5814  // the logic doesn't have to handle that case.
5815 if (hasBufferRsrcWorkaround(Ty)) {
5816 Ty = castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
5817 Dst = MI.getOperand(i: 0).getReg();
5818 }
5819 LLT EltTy = Ty.getScalarType();
5820 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5821 const bool Unpacked = ST.hasUnpackedD16VMem();
5822
5823 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
5824
5825 unsigned Opc;
5826
5827 // TODO: Support TFE for typed and narrow loads.
5828 if (IsTyped) {
5829 if (IsTFE)
5830 return false;
5831 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5832 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5833 } else if (IsFormat) {
5834 if (IsD16) {
5835 if (IsTFE)
5836 return false;
5837 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5838 } else {
5839 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5840 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5841 }
5842 } else {
5843 if (IsTFE)
5844 return false;
5845 switch (MemTy.getSizeInBits()) {
5846 case 8:
5847 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5848 break;
5849 case 16:
5850 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
5851 break;
5852 default:
5853 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
5854 break;
5855 }
5856 }
5857
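  // With TFE the hardware writes an extra status dword after the loaded data,
  // so load into a vector with one additional s32 element and then split the
  // status value off from the data.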
5858 if (IsTFE) {
5859 unsigned NumValueDWords = divideCeil(Numerator: Ty.getSizeInBits(), Denominator: 32);
5860 unsigned NumLoadDWords = NumValueDWords + 1;
5861 LLT LoadTy = LLT::fixed_vector(NumElements: NumLoadDWords, ScalarTy: S32);
5862 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: LoadTy);
5863 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5864 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5865 if (NumValueDWords == 1) {
5866 B.buildUnmerge(Res: {Dst, StatusDst}, Op: LoadDstReg);
5867 } else {
5868 SmallVector<Register, 5> LoadElts;
5869 for (unsigned I = 0; I != NumValueDWords; ++I)
5870 LoadElts.push_back(Elt: B.getMRI()->createGenericVirtualRegister(Ty: S32));
5871 LoadElts.push_back(Elt: StatusDst);
5872 B.buildUnmerge(Res: LoadElts, Op: LoadDstReg);
5873 LoadElts.truncate(N: NumValueDWords);
5874 B.buildMergeLikeInstr(Res: Dst, Ops: LoadElts);
5875 }
5876 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
5877 (IsD16 && !Ty.isVector())) {
5878 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: S32);
5879 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5880 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5881 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
5882 B.buildTrunc(Res: Dst, Op: LoadDstReg);
5883 } else if (Unpacked && IsD16 && Ty.isVector()) {
5884 LLT UnpackedTy = Ty.changeElementSize(NewEltSize: 32);
5885 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: UnpackedTy);
5886 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5887 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5888 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
5889 // FIXME: G_TRUNC should work, but legalization currently fails
5890 auto Unmerge = B.buildUnmerge(Res: S32, Op: LoadDstReg);
5891 SmallVector<Register, 4> Repack;
5892 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
5893 Repack.push_back(Elt: B.buildTrunc(Res: EltTy, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
5894 B.buildMergeLikeInstr(Res: Dst, Ops: Repack);
5895 } else {
5896 buildBufferLoad(Opc, LoadDstReg: Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
5897 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5898 }
5899
5900 MI.eraseFromParent();
5901 return true;
5902}
5903
5904static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
5905 switch (IntrID) {
5906 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5907 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
5908 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5909 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
5910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
5911 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
5913 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5914 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
5915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
5916 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
5918 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
5920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
5921 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
5923 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
5925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
5926 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
5928 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5929 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
5930 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
5931 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5932 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
5933 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
5935 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
5936 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5937 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
5938 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5939 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
5940 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
5941 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5942 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
5943 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
5945 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
5946 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5947 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
5948 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5949 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
5950 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
5951 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5952 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
5953 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5954 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
5955 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
5956 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5957 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
5958 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
5960 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
5961 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5962 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
5963 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5964 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
5965 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
5966 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5967 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
5968 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5969 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
5970 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
5971 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5972 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
5973 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5974 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
5975 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
5976 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
5977 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
5978 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
5979 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5980 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
5981 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5982 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
5983 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
5984 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5985 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
5986 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5987 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5988 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
5989 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
5990 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
5991 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
5992 default:
5993 llvm_unreachable("unhandled atomic opcode");
5994 }
5995}
5996
5997bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
5998 MachineIRBuilder &B,
5999 Intrinsic::ID IID) const {
6000 const bool IsCmpSwap =
6001 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6002 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6003 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6004 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6005
6006 Register Dst = MI.getOperand(i: 0).getReg();
6007  // Since we don't have 128-bit atomics, we don't need to handle the case of
6008  // p8 arguments to the atomic itself.
6009 Register VData = MI.getOperand(i: 2).getReg();
6010
6011 Register CmpVal;
6012 int OpOffset = 0;
6013
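  // The cmpswap variants carry an extra compare operand, which shifts the
  // indices of all following operands by one (tracked with OpOffset).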
6014 if (IsCmpSwap) {
6015 CmpVal = MI.getOperand(i: 3).getReg();
6016 ++OpOffset;
6017 }
6018
6019 castBufferRsrcArgToV4I32(MI, B, Idx: 3 + OpOffset);
6020 Register RSrc = MI.getOperand(i: 3 + OpOffset).getReg();
6021 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6022
6023 // The struct intrinsic variants add one additional operand over raw.
6024 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6025 Register VIndex;
6026 if (HasVIndex) {
6027 VIndex = MI.getOperand(i: 4 + OpOffset).getReg();
6028 ++OpOffset;
6029 } else {
6030 VIndex = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 0).getReg(Idx: 0);
6031 }
6032
6033 Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
6034 Register SOffset = MI.getOperand(i: 5 + OpOffset).getReg();
6035 unsigned AuxiliaryData = MI.getOperand(i: 6 + OpOffset).getImm();
6036
6037 MachineMemOperand *MMO = *MI.memoperands_begin();
6038
6039 unsigned ImmOffset;
6040 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
6041
6042 auto MIB = B.buildInstr(Opcode: getBufferAtomicPseudo(IntrID: IID))
6043 .addDef(RegNo: Dst)
6044 .addUse(RegNo: VData); // vdata
6045
6046 if (IsCmpSwap)
6047 MIB.addReg(RegNo: CmpVal);
6048
6049 MIB.addUse(RegNo: RSrc) // rsrc
6050 .addUse(RegNo: VIndex) // vindex
6051 .addUse(RegNo: VOffset) // voffset
6052 .addUse(RegNo: SOffset) // soffset
6053 .addImm(Val: ImmOffset) // offset(imm)
6054 .addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6055 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6056 .addMemOperand(MMO);
6057
6058 MI.eraseFromParent();
6059 return true;
6060}
6061
6062/// Pack the s16 typed address operands of \p MI into dword sized registers
6063/// with two s16 elements each, appending them to \p PackedAddrs.
6064static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6065 SmallVectorImpl<Register> &PackedAddrs,
6066 unsigned ArgOffset,
6067 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6068 bool IsA16, bool IsG16) {
6069 const LLT S16 = LLT::scalar(SizeInBits: 16);
6070 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6071 auto EndIdx = Intr->VAddrEnd;
6072
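  // Operands before the gradients (bias, offsets, compare) are normally full
  // dwords and are just bitcast to v2s16; gradients are pair-packed only under
  // G16 and coordinates only under A16.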
6073 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6074 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6075 if (!SrcOp.isReg())
6076 continue; // _L to _LZ may have eliminated this.
6077
6078 Register AddrReg = SrcOp.getReg();
6079
6080 if ((I < Intr->GradientStart) ||
6081 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6082 (I >= Intr->CoordStart && !IsA16)) {
6083 if ((I < Intr->GradientStart) && IsA16 &&
6084 (B.getMRI()->getType(Reg: AddrReg) == S16)) {
6085 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6086        // Special handling of bias when A16 is on. Bias is of type half but
6087        // occupies a full 32-bit slot.
6088 PackedAddrs.push_back(
6089 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
6090 .getReg(Idx: 0));
6091 } else {
6092 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6093 "Bias needs to be converted to 16 bit in A16 mode");
6094 // Handle any gradient or coordinate operands that should not be packed
6095 AddrReg = B.buildBitcast(Dst: V2S16, Src: AddrReg).getReg(Idx: 0);
6096 PackedAddrs.push_back(Elt: AddrReg);
6097 }
6098 } else {
6099 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6100 // derivatives dx/dh and dx/dv are packed with undef.
6101 if (((I + 1) >= EndIdx) ||
6102 ((Intr->NumGradients / 2) % 2 == 1 &&
6103 (I == static_cast<unsigned>(Intr->GradientStart +
6104 (Intr->NumGradients / 2) - 1) ||
6105 I == static_cast<unsigned>(Intr->GradientStart +
6106 Intr->NumGradients - 1))) ||
6107 // Check for _L to _LZ optimization
6108 !MI.getOperand(i: ArgOffset + I + 1).isReg()) {
6109 PackedAddrs.push_back(
6110 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
6111 .getReg(Idx: 0));
6112 } else {
6113 PackedAddrs.push_back(
6114 Elt: B.buildBuildVector(
6115 Res: V2S16, Ops: {AddrReg, MI.getOperand(i: ArgOffset + I + 1).getReg()})
6116 .getReg(Idx: 0));
6117 ++I;
6118 }
6119 }
6120 }
6121}
6122
6123/// Convert from separate vaddr components to a single vector address register,
6124/// and replace the remaining operands with $noreg.
6125static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6126 int DimIdx, int NumVAddrs) {
6127 const LLT S32 = LLT::scalar(SizeInBits: 32);
6128 (void)S32;
6129 SmallVector<Register, 8> AddrRegs;
6130 for (int I = 0; I != NumVAddrs; ++I) {
6131 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6132 if (SrcOp.isReg()) {
6133 AddrRegs.push_back(Elt: SrcOp.getReg());
6134 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6135 }
6136 }
6137
6138 int NumAddrRegs = AddrRegs.size();
6139 if (NumAddrRegs != 1) {
6140 auto VAddr =
6141 B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumAddrRegs, ScalarSizeInBits: 32), Ops: AddrRegs);
6142 MI.getOperand(i: DimIdx).setReg(VAddr.getReg(Idx: 0));
6143 }
6144
6145 for (int I = 1; I != NumVAddrs; ++I) {
6146 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6147 if (SrcOp.isReg())
6148 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6149 }
6150}
6151
6152/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6153///
6154/// Depending on the subtarget, a load/store with 16-bit element data needs to
6155/// be rewritten to use the low half of 32-bit registers, or to directly use a
6156/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
6157/// registers.
6158///
6159/// We don't want to directly select image instructions just yet, but we also
6160/// want to expose all register repacking to the legalizer/combiners. We also
6161/// don't want a selected instruction entering RegBankSelect. In order to avoid
6162/// defining a multitude of intermediate image instructions, directly hack on
6163/// the intrinsic's arguments. In cases like a16 addresses, this requires
6164/// padding now unnecessary arguments with $noreg.
6165bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6166 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6167 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6168
6169 const MachineFunction &MF = *MI.getMF();
6170 const unsigned NumDefs = MI.getNumExplicitDefs();
6171 const unsigned ArgOffset = NumDefs + 1;
6172 bool IsTFE = NumDefs == 2;
6173 // We are only processing the operands of d16 image operations on subtargets
6174 // that use the unpacked register layout, or need to repack the TFE result.
6175
6176 // TODO: Do we need to guard against already legalized intrinsics?
6177 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6178 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
6179
6180 MachineRegisterInfo *MRI = B.getMRI();
6181 const LLT S32 = LLT::scalar(SizeInBits: 32);
6182 const LLT S16 = LLT::scalar(SizeInBits: 16);
6183 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6184
6185 unsigned DMask = 0;
6186 Register VData = MI.getOperand(i: NumDefs == 0 ? 1 : 0).getReg();
6187 LLT Ty = MRI->getType(Reg: VData);
6188
6189 const bool IsAtomicPacked16Bit =
6190 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6191 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6192
6193 // Check for 16 bit addresses and pack if true.
6194 LLT GradTy =
6195 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->GradientStart).getReg());
6196 LLT AddrTy =
6197 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->CoordStart).getReg());
6198 const bool IsG16 =
6199 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6200 const bool IsA16 = AddrTy == S16;
6201 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6202
6203 int DMaskLanes = 0;
6204 if (!BaseOpcode->Atomic) {
6205 DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
6206 if (BaseOpcode->Gather4) {
6207 DMaskLanes = 4;
6208 } else if (DMask != 0) {
6209 DMaskLanes = llvm::popcount(Value: DMask);
6210 } else if (!IsTFE && !BaseOpcode->Store) {
6211 // If dmask is 0, this is a no-op load. This can be eliminated.
6212 B.buildUndef(Res: MI.getOperand(i: 0));
6213 MI.eraseFromParent();
6214 return true;
6215 }
6216 }
6217
6218 Observer.changingInstr(MI);
6219 auto ChangedInstr = make_scope_exit(F: [&] { Observer.changedInstr(MI); });
6220
6221 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6222 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6223 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6224 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6225 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
6226
6227 // Track that we legalized this
6228 MI.setDesc(B.getTII().get(Opcode: NewOpcode));
6229
6230  // Expecting to get an error flag since TFE is on and dmask is 0. Force
6231  // dmask to be at least 1, otherwise the instruction will fail.
6232 if (IsTFE && DMask == 0) {
6233 DMask = 0x1;
6234 DMaskLanes = 1;
6235 MI.getOperand(i: ArgOffset + Intr->DMaskIndex).setImm(DMask);
6236 }
6237
6238 if (BaseOpcode->Atomic) {
6239 Register VData0 = MI.getOperand(i: 2).getReg();
6240 LLT Ty = MRI->getType(Reg: VData0);
6241
6242 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6243 if (Ty.isVector() && !IsAtomicPacked16Bit)
6244 return false;
6245
6246 if (BaseOpcode->AtomicX2) {
6247 Register VData1 = MI.getOperand(i: 3).getReg();
6248 // The two values are packed in one register.
6249 LLT PackedTy = LLT::fixed_vector(NumElements: 2, ScalarTy: Ty);
6250 auto Concat = B.buildBuildVector(Res: PackedTy, Ops: {VData0, VData1});
6251 MI.getOperand(i: 2).setReg(Concat.getReg(Idx: 0));
6252 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6253 }
6254 }
6255
6256 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6257
6258 // Rewrite the addressing register layout before doing anything else.
6259 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6260    // 16-bit gradients are supported, but are tied to the A16 control, so
6261    // both gradients and addresses must be 16 bit.
6262 return false;
6263 }
6264
6265 if (IsA16 && !ST.hasA16()) {
6266 // A16 not supported
6267 return false;
6268 }
6269
6270 const unsigned NSAMaxSize = ST.getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
6271 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6272
6273 if (IsA16 || IsG16) {
6274 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6275 // instructions expect VGPR_32
6276 SmallVector<Register, 4> PackedRegs;
6277
6278 packImage16bitOpsToDwords(B, MI, PackedAddrs&: PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6279
6280 // See also below in the non-a16 branch
6281 const bool UseNSA = ST.hasNSAEncoding() &&
6282 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6283 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6284 const bool UsePartialNSA =
6285 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6286
6287 if (UsePartialNSA) {
6288 // Pack registers that would go over NSAMaxSize into last VAddr register
6289 LLT PackedAddrTy =
6290 LLT::fixed_vector(NumElements: 2 * (PackedRegs.size() - NSAMaxSize + 1), ScalarSizeInBits: 16);
6291 auto Concat = B.buildConcatVectors(
6292 Res: PackedAddrTy, Ops: ArrayRef(PackedRegs).slice(N: NSAMaxSize - 1));
6293 PackedRegs[NSAMaxSize - 1] = Concat.getReg(Idx: 0);
6294 PackedRegs.resize(N: NSAMaxSize);
6295 } else if (!UseNSA && PackedRegs.size() > 1) {
6296 LLT PackedAddrTy = LLT::fixed_vector(NumElements: 2 * PackedRegs.size(), ScalarSizeInBits: 16);
6297 auto Concat = B.buildConcatVectors(Res: PackedAddrTy, Ops: PackedRegs);
6298 PackedRegs[0] = Concat.getReg(Idx: 0);
6299 PackedRegs.resize(N: 1);
6300 }
6301
6302 const unsigned NumPacked = PackedRegs.size();
6303 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6304 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6305 if (!SrcOp.isReg()) {
6306 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6307 continue;
6308 }
6309
6310 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6311
6312 if (I - Intr->VAddrStart < NumPacked)
6313 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6314 else
6315 SrcOp.setReg(AMDGPU::NoRegister);
6316 }
6317 } else {
6318 // If the register allocator cannot place the address registers contiguously
6319 // without introducing moves, then using the non-sequential address encoding
6320 // is always preferable, since it saves VALU instructions and is usually a
6321 // wash in terms of code size or even better.
6322 //
6323 // However, we currently have no way of hinting to the register allocator
6324 // that MIMG addresses should be placed contiguously when it is possible to
6325 // do so, so force non-NSA for the common 2-address case as a heuristic.
6326 //
6327 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6328 // allocation when possible.
6329 //
6330 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6331 // set of the remaining addresses.
6332 const bool UseNSA = ST.hasNSAEncoding() &&
6333 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6334 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6335 const bool UsePartialNSA =
6336 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6337
6338 if (UsePartialNSA) {
6339 convertImageAddrToPacked(B, MI,
6340 DimIdx: ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6341 NumVAddrs: Intr->NumVAddrs - NSAMaxSize + 1);
6342 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6343 convertImageAddrToPacked(B, MI, DimIdx: ArgOffset + Intr->VAddrStart,
6344 NumVAddrs: Intr->NumVAddrs);
6345 }
6346 }
6347
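  // Record the A16/G16 state in an extra immediate operand (bit 0 = A16,
  // bit 1 = G16) so the selector can pick the right encoding.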
6348 int Flags = 0;
6349 if (IsA16)
6350 Flags |= 1;
6351 if (IsG16)
6352 Flags |= 2;
6353 MI.addOperand(Op: MachineOperand::CreateImm(Val: Flags));
6354
6355 if (BaseOpcode->Store) { // No TFE for stores?
6356 // TODO: Handle dmask trim
6357 if (!Ty.isVector() || !IsD16)
6358 return true;
6359
6360 Register RepackedReg = handleD16VData(B, MRI&: *MRI, Reg: VData, ImageStore: true);
6361 if (RepackedReg != VData) {
6362 MI.getOperand(i: 1).setReg(RepackedReg);
6363 }
6364
6365 return true;
6366 }
6367
6368 Register DstReg = MI.getOperand(i: 0).getReg();
6369 const LLT EltTy = Ty.getScalarType();
6370 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6371
6372 // Confirm that the return type is large enough for the dmask specified
6373 if (NumElts < DMaskLanes)
6374 return false;
6375
6376 if (NumElts > 4 || DMaskLanes > 4)
6377 return false;
6378
6379  // Image atomic instructions use DMask to specify how many bits the
6380  // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6381  // DMaskLanes has the default value '0' for image atomics.
6382  // We must make sure that atomic variants (especially packed ones) are not
6383  // truncated from v2s16 or v4s16 to s16.
6384  //
6385  // ChangeElementCount will be needed for image loads, where Ty is always scalar.
6386 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6387 const LLT AdjustedTy =
6388 DMaskLanes == 0
6389 ? Ty
6390 : Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: AdjustedNumElts));
6391
6392 // The raw dword aligned data component of the load. The only legal cases
6393 // where this matters should be when using the packed D16 format, for
6394  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
6395 LLT RoundedTy;
6396
6397 // S32 vector to cover all data, plus TFE result element.
6398 LLT TFETy;
6399
6400 // Register type to use for each loaded component. Will be S32 or V2S16.
6401 LLT RegTy;
6402
6403 if (IsD16 && ST.hasUnpackedD16VMem()) {
6404 RoundedTy =
6405 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: AdjustedNumElts), ScalarSize: 32);
6406 TFETy = LLT::fixed_vector(NumElements: AdjustedNumElts + 1, ScalarSizeInBits: 32);
6407 RegTy = S32;
6408 } else {
6409 unsigned EltSize = EltTy.getSizeInBits();
6410 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6411 unsigned RoundedSize = 32 * RoundedElts;
6412 RoundedTy = LLT::scalarOrVector(
6413 EC: ElementCount::getFixed(MinVal: RoundedSize / EltSize), ScalarSize: EltSize);
6414 TFETy = LLT::fixed_vector(NumElements: RoundedSize / 32 + 1, ScalarTy: S32);
6415 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6416 }
6417
6418 // The return type does not need adjustment.
6419 // TODO: Should we change s16 case to s32 or <2 x s16>?
6420 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6421 return true;
6422
6423 Register Dst1Reg;
6424
6425 // Insert after the instruction.
6426 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
6427
6428 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6429 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6430 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6431 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6432
6433 Register NewResultReg = MRI->createGenericVirtualRegister(Ty: LoadResultTy);
6434
6435 MI.getOperand(i: 0).setReg(NewResultReg);
6436
6437 // In the IR, TFE is supposed to be used with a 2 element struct return
6438 // type. The instruction really returns these two values in one contiguous
6439 // register, with one additional dword beyond the loaded data. Rewrite the
6440 // return type to use a single register result.
6441
6442 if (IsTFE) {
6443 Dst1Reg = MI.getOperand(i: 1).getReg();
6444 if (MRI->getType(Reg: Dst1Reg) != S32)
6445 return false;
6446
6447 // TODO: Make sure the TFE operand bit is set.
6448 MI.removeOperand(OpNo: 1);
6449
6450 // Handle the easy case that requires no repack instructions.
6451 if (Ty == S32) {
6452 B.buildUnmerge(Res: {DstReg, Dst1Reg}, Op: NewResultReg);
6453 return true;
6454 }
6455 }
6456
6457 // Now figure out how to copy the new result register back into the old
6458 // result.
6459 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6460
6461 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6462
6463 if (ResultNumRegs == 1) {
6464 assert(!IsTFE);
6465 ResultRegs[0] = NewResultReg;
6466 } else {
6467 // We have to repack into a new vector of some kind.
6468 for (int I = 0; I != NumDataRegs; ++I)
6469 ResultRegs[I] = MRI->createGenericVirtualRegister(Ty: RegTy);
6470 B.buildUnmerge(Res: ResultRegs, Op: NewResultReg);
6471
6472 // Drop the final TFE element to get the data part. The TFE result is
6473 // directly written to the right place already.
6474 if (IsTFE)
6475 ResultRegs.resize(N: NumDataRegs);
6476 }
6477
6478 // For an s16 scalar result, we form an s32 result with a truncate regardless
6479 // of packed vs. unpacked.
6480 if (IsD16 && !Ty.isVector()) {
6481 B.buildTrunc(Res: DstReg, Op: ResultRegs[0]);
6482 return true;
6483 }
6484
6485 // Avoid a build/concat_vector of 1 entry.
6486 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6487 B.buildBitcast(Dst: DstReg, Src: ResultRegs[0]);
6488 return true;
6489 }
6490
6491 assert(Ty.isVector());
6492
6493 if (IsD16) {
6494 // For packed D16 results with TFE enabled, all the data components are
6495 // S32. Cast back to the expected type.
6496 //
6497    // TODO: We don't really need to load s32 elements. We would only need one
6498 // cast for the TFE result if a multiple of v2s16 was used.
6499 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6500 for (Register &Reg : ResultRegs)
6501 Reg = B.buildBitcast(Dst: V2S16, Src: Reg).getReg(Idx: 0);
6502 } else if (ST.hasUnpackedD16VMem()) {
6503 for (Register &Reg : ResultRegs)
6504 Reg = B.buildTrunc(Res: S16, Op: Reg).getReg(Idx: 0);
6505 }
6506 }
6507
6508 auto padWithUndef = [&](LLT Ty, int NumElts) {
6509 if (NumElts == 0)
6510 return;
6511 Register Undef = B.buildUndef(Res: Ty).getReg(Idx: 0);
6512 for (int I = 0; I != NumElts; ++I)
6513 ResultRegs.push_back(Elt: Undef);
6514 };
6515
6516 // Pad out any elements eliminated due to the dmask.
6517 LLT ResTy = MRI->getType(Reg: ResultRegs[0]);
6518 if (!ResTy.isVector()) {
6519 padWithUndef(ResTy, NumElts - ResultRegs.size());
6520 B.buildBuildVector(Res: DstReg, Ops: ResultRegs);
6521 return true;
6522 }
6523
6524 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6525 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6526
6527 // Deal with the one annoying legal case.
6528 const LLT V3S16 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 16);
6529 if (Ty == V3S16) {
6530 if (IsTFE) {
6531 if (ResultRegs.size() == 1) {
6532 NewResultReg = ResultRegs[0];
6533 } else if (ResultRegs.size() == 2) {
6534 LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
6535 NewResultReg = B.buildConcatVectors(Res: V4S16, Ops: ResultRegs).getReg(Idx: 0);
6536 } else {
6537 return false;
6538 }
6539 }
6540
6541 if (MRI->getType(Reg: DstReg).getNumElements() <
6542 MRI->getType(Reg: NewResultReg).getNumElements()) {
6543 B.buildDeleteTrailingVectorElements(Res: DstReg, Op0: NewResultReg);
6544 } else {
6545 B.buildPadVectorWithUndefElements(Res: DstReg, Op0: NewResultReg);
6546 }
6547 return true;
6548 }
6549
6550 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6551 B.buildConcatVectors(Res: DstReg, Ops: ResultRegs);
6552 return true;
6553}
6554
6555bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6556 MachineInstr &MI) const {
6557 MachineIRBuilder &B = Helper.MIRBuilder;
6558 GISelChangeObserver &Observer = Helper.Observer;
6559
6560 Register OrigDst = MI.getOperand(i: 0).getReg();
6561 Register Dst;
6562 LLT Ty = B.getMRI()->getType(Reg: OrigDst);
6563 unsigned Size = Ty.getSizeInBits();
6564 MachineFunction &MF = B.getMF();
6565 unsigned Opc = 0;
6566 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6567 assert(Size == 8 || Size == 16);
6568 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6569 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6570    // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6571 // destination register.
6572 Dst = B.getMRI()->createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
6573 } else {
6574 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6575 Dst = OrigDst;
6576 }
6577
6578 Observer.changingInstr(MI);
6579
6580 // Handle needing to s.buffer.load() a p8 value.
6581 if (hasBufferRsrcWorkaround(Ty)) {
6582 Ty = castBufferRsrcFromV4I32(MI, B, MRI&: *B.getMRI(), Idx: 0);
6583 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6584 }
6585 if (shouldBitcastLoadStoreType(ST, Ty, MemTy: LLT::scalar(SizeInBits: Size))) {
6586 Ty = getBitcastRegisterType(Ty);
6587 Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
6588 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6589 }
6590
6591 // FIXME: We don't really need this intermediate instruction. The intrinsic
6592 // should be fixed to have a memory operand. Since it's readnone, we're not
6593 // allowed to add one.
6594 MI.setDesc(B.getTII().get(Opcode: Opc));
6595 MI.removeOperand(OpNo: 1); // Remove intrinsic ID
6596
6597 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6598 // TODO: Should this use datalayout alignment?
6599 const unsigned MemSize = (Size + 7) / 8;
6600 const Align MemAlign(std::min(a: MemSize, b: 4u));
6601 MachineMemOperand *MMO = MF.getMachineMemOperand(
6602 PtrInfo: MachinePointerInfo(),
6603 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6604 MachineMemOperand::MOInvariant,
6605 Size: MemSize, BaseAlignment: MemAlign);
6606 MI.addMemOperand(MF, MO: MMO);
6607 if (Dst != OrigDst) {
6608 MI.getOperand(i: 0).setReg(Dst);
6609 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6610 B.buildTrunc(Res: OrigDst, Op: Dst);
6611 }
6612
6613 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6614 // always be legal. We may need to restore this to a 96-bit result if it turns
6615 // out this needs to be converted to a vector load during RegBankSelect.
6616 if (!isPowerOf2_32(Value: Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6617 if (Ty.isVector())
6618 Helper.moreElementsVectorDst(MI, MoreTy: getPow2VectorType(Ty), OpIdx: 0);
6619 else
6620 Helper.widenScalarDst(MI, WideTy: getPow2ScalarType(Ty), OpIdx: 0);
6621 }
6622
6623 Observer.changedInstr(MI);
6624 return true;
6625}
6626
6627// TODO: Move to selection
6628bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6629 MachineRegisterInfo &MRI,
6630 MachineIRBuilder &B) const {
6631 if (!ST.isTrapHandlerEnabled() ||
6632 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6633 return legalizeTrapEndpgm(MI, MRI, B);
6634
6635 return ST.supportsGetDoorbellID() ?
6636 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6637}
6638
6639bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6640 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6641 const DebugLoc &DL = MI.getDebugLoc();
6642 MachineBasicBlock &BB = B.getMBB();
6643 MachineFunction *MF = BB.getParent();
6644
6645 if (BB.succ_empty() && std::next(x: MI.getIterator()) == BB.end()) {
6646 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6647 .addImm(0);
6648 MI.eraseFromParent();
6649 return true;
6650 }
6651
6652 // We need a block split to make the real endpgm a terminator. We also don't
6653 // want to break phis in successor blocks, so we can't just delete to the
6654 // end of the block.
6655 BB.splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
6656 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6657 MF->push_back(MBB: TrapBB);
6658 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6659 .addImm(0);
6660 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6661 .addMBB(TrapBB);
6662
6663 BB.addSuccessor(Succ: TrapBB);
6664 MI.eraseFromParent();
6665 return true;
6666}
6667
6668bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6669 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6670 MachineFunction &MF = B.getMF();
6671 const LLT S64 = LLT::scalar(SizeInBits: 64);
6672
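  // The HSA trap handler ABI expects the queue pointer in SGPR0/SGPR1.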
6673 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6674 // For code object version 5, queue_ptr is passed through implicit kernarg.
6675 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
6676 AMDGPU::AMDHSA_COV5) {
6677 AMDGPUTargetLowering::ImplicitParameter Param =
6678 AMDGPUTargetLowering::QUEUE_PTR;
6679 uint64_t Offset =
6680 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
6681
6682 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6683 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
6684
6685 if (!loadInputValue(DstReg: KernargPtrReg, B,
6686 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6687 return false;
6688
6689 // TODO: can we be smarter about machine pointer info?
6690 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6691 MachineMemOperand *MMO = MF.getMachineMemOperand(
6692 PtrInfo,
6693 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6694 MachineMemOperand::MOInvariant,
6695 MemTy: LLT::scalar(SizeInBits: 64), base_alignment: commonAlignment(A: Align(64), Offset));
6696
6697 // Pointer address
6698 Register LoadAddr = MRI.createGenericVirtualRegister(
6699 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
6700 B.buildPtrAdd(Res: LoadAddr, Op0: KernargPtrReg,
6701 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
6702 // Load address
6703 Register Temp = B.buildLoad(Res: S64, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
6704 B.buildCopy(Res: SGPR01, Op: Temp);
6705 B.buildInstr(AMDGPU::S_TRAP)
6706 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6707 .addReg(SGPR01, RegState::Implicit);
6708 MI.eraseFromParent();
6709 return true;
6710 }
6711
6712 // Pass queue pointer to trap handler as input, and insert trap instruction
6713 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6714 Register LiveIn =
6715 MRI.createGenericVirtualRegister(Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
6716 if (!loadInputValue(DstReg: LiveIn, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
6717 return false;
6718
6719 B.buildCopy(Res: SGPR01, Op: LiveIn);
6720 B.buildInstr(AMDGPU::S_TRAP)
6721 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6722 .addReg(SGPR01, RegState::Implicit);
6723
6724 MI.eraseFromParent();
6725 return true;
6726}
6727
6728bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6729 MachineRegisterInfo &MRI,
6730 MachineIRBuilder &B) const {
6731 // We need to simulate the 's_trap 2' instruction on targets that run in
6732 // PRIV=1 (where it is treated as a nop).
6733 if (ST.hasPrivEnabledTrap2NopBug()) {
6734 ST.getInstrInfo()->insertSimulatedTrap(MRI, MBB&: B.getMBB(), MI,
6735 DL: MI.getDebugLoc());
6736 MI.eraseFromParent();
6737 return true;
6738 }
6739
6740 B.buildInstr(AMDGPU::S_TRAP)
6741 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6742 MI.eraseFromParent();
6743 return true;
6744}
6745
6746bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
6747 MachineRegisterInfo &MRI,
6748 MachineIRBuilder &B) const {
6749  // If this is a non-HSA path or the trap handler is disabled, report a
6750  // warning accordingly.
6751 if (!ST.isTrapHandlerEnabled() ||
6752 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6753 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6754 "debugtrap handler not supported",
6755 MI.getDebugLoc(), DS_Warning);
6756 LLVMContext &Ctx = B.getMF().getFunction().getContext();
6757 Ctx.diagnose(DI: NoTrap);
6758 } else {
6759 // Insert debug-trap instruction
6760 B.buildInstr(AMDGPU::S_TRAP)
6761 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6762 }
6763
6764 MI.eraseFromParent();
6765 return true;
6766}
6767
6768bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6769 MachineIRBuilder &B) const {
6770 MachineRegisterInfo &MRI = *B.getMRI();
6771 const LLT S16 = LLT::scalar(SizeInBits: 16);
6772 const LLT S32 = LLT::scalar(SizeInBits: 32);
6773 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6774 const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
6775
6776 Register DstReg = MI.getOperand(i: 0).getReg();
6777 Register NodePtr = MI.getOperand(i: 2).getReg();
6778 Register RayExtent = MI.getOperand(i: 3).getReg();
6779 Register RayOrigin = MI.getOperand(i: 4).getReg();
6780 Register RayDir = MI.getOperand(i: 5).getReg();
6781 Register RayInvDir = MI.getOperand(i: 6).getReg();
6782 Register TDescr = MI.getOperand(i: 7).getReg();
6783
6784 if (!ST.hasGFX10_AEncoding()) {
6785 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6786 "intrinsic not supported on subtarget",
6787 MI.getDebugLoc());
6788 B.getMF().getFunction().getContext().diagnose(DI: BadIntrin);
6789 return false;
6790 }
6791
6792 const bool IsGFX11 = AMDGPU::isGFX11(ST);
6793 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6794 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
6795 const bool IsA16 = MRI.getType(Reg: RayDir).getElementType().getSizeInBits() == 16;
6796 const bool Is64 = MRI.getType(Reg: NodePtr).getSizeInBits() == 64;
6797 const unsigned NumVDataDwords = 4;
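  // vaddr layout: node_ptr (1 or 2 dwords), ray_extent (1 dword), ray_origin
  // (3 dwords), then ray_dir and ray_inv_dir (3 dwords each, or 3 dwords total
  // when both are packed as 16-bit values).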
6798 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6799 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6800 const bool UseNSA =
6801 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6802
6803 const unsigned BaseOpcodes[2][2] = {
6804 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6805 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6806 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6807 int Opcode;
6808 if (UseNSA) {
6809 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6810 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6811 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
6812 : AMDGPU::MIMGEncGfx10NSA,
6813 NumVDataDwords, NumVAddrDwords);
6814 } else {
6815 assert(!IsGFX12Plus);
6816 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6817 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6818 : AMDGPU::MIMGEncGfx10Default,
6819 NumVDataDwords, NumVAddrDwords);
6820 }
6821 assert(Opcode != -1);
6822
6823 SmallVector<Register, 12> Ops;
6824 if (UseNSA && IsGFX11Plus) {
6825 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6826 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
6827 auto Merged = B.buildMergeLikeInstr(
6828 Res: V3S32, Ops: {Unmerge.getReg(Idx: 0), Unmerge.getReg(Idx: 1), Unmerge.getReg(Idx: 2)});
6829 Ops.push_back(Elt: Merged.getReg(Idx: 0));
6830 };
6831
6832 Ops.push_back(Elt: NodePtr);
6833 Ops.push_back(Elt: RayExtent);
6834 packLanes(RayOrigin);
6835
6836 if (IsA16) {
6837 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
6838 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
6839 auto MergedDir = B.buildMergeLikeInstr(
6840 Res: V3S32,
6841 Ops: {B.buildBitcast(
6842 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 0),
6843 UnmergeRayDir.getReg(Idx: 0)}))
6844 .getReg(Idx: 0),
6845 B.buildBitcast(
6846 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 1),
6847 UnmergeRayDir.getReg(Idx: 1)}))
6848 .getReg(Idx: 0),
6849 B.buildBitcast(
6850 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 2),
6851 UnmergeRayDir.getReg(Idx: 2)}))
6852 .getReg(Idx: 0)});
6853 Ops.push_back(Elt: MergedDir.getReg(Idx: 0));
6854 } else {
6855 packLanes(RayDir);
6856 packLanes(RayInvDir);
6857 }
6858 } else {
6859 if (Is64) {
6860 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: NodePtr);
6861 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
6862 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
6863 } else {
6864 Ops.push_back(Elt: NodePtr);
6865 }
6866 Ops.push_back(Elt: RayExtent);
6867
6868 auto packLanes = [&Ops, &S32, &B](Register Src) {
6869 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
6870 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
6871 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
6872 Ops.push_back(Elt: Unmerge.getReg(Idx: 2));
6873 };
6874
6875 packLanes(RayOrigin);
6876 if (IsA16) {
6877 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
6878 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
6879 Register R1 = MRI.createGenericVirtualRegister(Ty: S32);
6880 Register R2 = MRI.createGenericVirtualRegister(Ty: S32);
6881 Register R3 = MRI.createGenericVirtualRegister(Ty: S32);
6882 B.buildMergeLikeInstr(Res: R1,
6883 Ops: {UnmergeRayDir.getReg(Idx: 0), UnmergeRayDir.getReg(Idx: 1)});
6884 B.buildMergeLikeInstr(
6885 Res: R2, Ops: {UnmergeRayDir.getReg(Idx: 2), UnmergeRayInvDir.getReg(Idx: 0)});
6886 B.buildMergeLikeInstr(
6887 Res: R3, Ops: {UnmergeRayInvDir.getReg(Idx: 1), UnmergeRayInvDir.getReg(Idx: 2)});
6888 Ops.push_back(Elt: R1);
6889 Ops.push_back(Elt: R2);
6890 Ops.push_back(Elt: R3);
6891 } else {
6892 packLanes(RayDir);
6893 packLanes(RayInvDir);
6894 }
6895 }
6896
6897 if (!UseNSA) {
6898    // Build a single vector containing all the operands prepared so far.
6899 LLT OpTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarSizeInBits: 32);
6900 Register MergedOps = B.buildMergeLikeInstr(Res: OpTy, Ops).getReg(Idx: 0);
6901 Ops.clear();
6902 Ops.push_back(Elt: MergedOps);
6903 }
6904
6905 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
6906 .addDef(DstReg)
6907 .addImm(Opcode);
6908
6909 for (Register R : Ops) {
6910 MIB.addUse(R);
6911 }
6912
6913 MIB.addUse(TDescr)
6914 .addImm(IsA16 ? 1 : 0)
6915 .cloneMemRefs(MI);
6916
6917 MI.eraseFromParent();
6918 return true;
6919}
6920
6921bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
6922 MachineIRBuilder &B) const {
6923 unsigned Opc;
6924 int RoundMode = MI.getOperand(i: 2).getImm();
6925
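  // Only the towards-positive and towards-negative rounding modes have
  // dedicated pseudos; any other mode fails legalization here.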
6926 if (RoundMode == (int)RoundingMode::TowardPositive)
6927 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
6928 else if (RoundMode == (int)RoundingMode::TowardNegative)
6929 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
6930 else
6931 return false;
6932
6933 B.buildInstr(Opcode: Opc)
6934 .addDef(RegNo: MI.getOperand(i: 0).getReg())
6935 .addUse(RegNo: MI.getOperand(i: 1).getReg());
6936
6937 MI.eraseFromParent();
6938
6939 return true;
6940}
6941
6942bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
6943 MachineIRBuilder &B) const {
6944 const SITargetLowering *TLI = ST.getTargetLowering();
6945 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
6946 Register DstReg = MI.getOperand(i: 0).getReg();
6947 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
6948 MI.eraseFromParent();
6949 return true;
6950}
6951
6952bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
6953 MachineIRBuilder &B) const {
6954 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
6955 if (!ST.hasArchitectedSGPRs())
6956 return false;
6957 LLT S32 = LLT::scalar(SizeInBits: 32);
6958 Register DstReg = MI.getOperand(i: 0).getReg();
6959 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
6960 auto LSB = B.buildConstant(Res: S32, Val: 25);
6961 auto Width = B.buildConstant(Res: S32, Val: 5);
6962 B.buildUbfx(Dst: DstReg, Src: TTMP8, LSB, Width);
6963 MI.eraseFromParent();
6964 return true;
6965}
6966
6967static constexpr unsigned FPEnvModeBitField =
6968 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
6969
6970static constexpr unsigned FPEnvTrapBitField =
6971 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
6972
6973bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
6974 MachineRegisterInfo &MRI,
6975 MachineIRBuilder &B) const {
6976 Register Src = MI.getOperand(i: 0).getReg();
6977 if (MRI.getType(Reg: Src) != S64)
6978 return false;
6979
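  // The FP environment is returned with the MODE register bits in the low half
  // and the TRAPSTS bits in the high half, each read with s_getreg.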
6980 auto ModeReg =
6981 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
6982 /*HasSideEffects=*/true, /*isConvergent=*/false)
6983 .addImm(FPEnvModeBitField);
6984 auto TrapReg =
6985 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
6986 /*HasSideEffects=*/true, /*isConvergent=*/false)
6987 .addImm(FPEnvTrapBitField);
6988 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
6989 MI.eraseFromParent();
6990 return true;
6991}
6992
6993bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
6994 MachineRegisterInfo &MRI,
6995 MachineIRBuilder &B) const {
6996 Register Src = MI.getOperand(i: 0).getReg();
6997 if (MRI.getType(Reg: Src) != S64)
6998 return false;
6999
7000 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: MI.getOperand(i: 0));
7001 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7002 /*HasSideEffects=*/true, /*isConvergent=*/false)
7003 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7004 .addReg(Unmerge.getReg(0));
7005 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7006 /*HasSideEffects=*/true, /*isConvergent=*/false)
7007 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7008 .addReg(Unmerge.getReg(1));
7009 MI.eraseFromParent();
7010 return true;
7011}
7012
7013bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7014 MachineInstr &MI) const {
7015 MachineIRBuilder &B = Helper.MIRBuilder;
7016 MachineRegisterInfo &MRI = *B.getMRI();
7017
7018  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
7019 auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
7020 switch (IntrID) {
7021 case Intrinsic::amdgcn_if:
7022 case Intrinsic::amdgcn_else: {
7023 MachineInstr *Br = nullptr;
7024 MachineBasicBlock *UncondBrTarget = nullptr;
7025 bool Negated = false;
7026 if (MachineInstr *BrCond =
7027 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7028 const SIRegisterInfo *TRI
7029 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7030
7031 Register Def = MI.getOperand(i: 1).getReg();
7032 Register Use = MI.getOperand(i: 3).getReg();
7033
7034 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
7035
7036 if (Negated)
7037 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
7038
7039 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
7040 if (IntrID == Intrinsic::amdgcn_if) {
7041 B.buildInstr(AMDGPU::SI_IF)
7042 .addDef(Def)
7043 .addUse(Use)
7044 .addMBB(UncondBrTarget);
7045 } else {
7046 B.buildInstr(AMDGPU::SI_ELSE)
7047 .addDef(Def)
7048 .addUse(Use)
7049 .addMBB(UncondBrTarget);
7050 }
7051
7052 if (Br) {
7053 Br->getOperand(i: 0).setMBB(CondBrTarget);
7054 } else {
7055 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7056 // since we're swapping branch targets it needs to be reinserted.
7057 // FIXME: IRTranslator should probably not do this
7058 B.buildBr(Dest&: *CondBrTarget);
7059 }
7060
7061 MRI.setRegClass(Reg: Def, RC: TRI->getWaveMaskRegClass());
7062 MRI.setRegClass(Reg: Use, RC: TRI->getWaveMaskRegClass());
7063 MI.eraseFromParent();
7064 BrCond->eraseFromParent();
7065 return true;
7066 }
7067
7068 return false;
7069 }
7070 case Intrinsic::amdgcn_loop: {
7071 MachineInstr *Br = nullptr;
7072 MachineBasicBlock *UncondBrTarget = nullptr;
7073 bool Negated = false;
7074 if (MachineInstr *BrCond =
7075 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7076 const SIRegisterInfo *TRI
7077 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7078
7079 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
7080 Register Reg = MI.getOperand(i: 2).getReg();
7081
7082 if (Negated)
7083 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
7084
7085 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
7086 B.buildInstr(AMDGPU::SI_LOOP)
7087 .addUse(Reg)
7088 .addMBB(UncondBrTarget);
7089
7090 if (Br)
7091 Br->getOperand(i: 0).setMBB(CondBrTarget);
7092 else
7093 B.buildBr(Dest&: *CondBrTarget);
7094
7095 MI.eraseFromParent();
7096 BrCond->eraseFromParent();
7097 MRI.setRegClass(Reg, RC: TRI->getWaveMaskRegClass());
7098 return true;
7099 }
7100
7101 return false;
7102 }
7103 case Intrinsic::amdgcn_addrspacecast_nonnull:
7104 return legalizeAddrSpaceCast(MI, MRI, B);
7105 case Intrinsic::amdgcn_make_buffer_rsrc:
7106 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7107 case Intrinsic::amdgcn_kernarg_segment_ptr:
7108 if (!AMDGPU::isKernel(CC: B.getMF().getFunction().getCallingConv())) {
7109 // This only makes sense to call in a kernel, so just lower to null.
7110 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: 0);
7111 MI.eraseFromParent();
7112 return true;
7113 }
7114
7115 return legalizePreloadedArgIntrin(
7116 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7117 case Intrinsic::amdgcn_implicitarg_ptr:
7118 return legalizeImplicitArgPtr(MI, MRI, B);
7119 case Intrinsic::amdgcn_workitem_id_x:
7120 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 0,
7121 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7122 case Intrinsic::amdgcn_workitem_id_y:
7123 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 1,
7124 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7125 case Intrinsic::amdgcn_workitem_id_z:
7126 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 2,
7127 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7128 case Intrinsic::amdgcn_workgroup_id_x:
7129 return legalizePreloadedArgIntrin(MI, MRI, B,
7130 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7131 case Intrinsic::amdgcn_workgroup_id_y:
7132 return legalizePreloadedArgIntrin(MI, MRI, B,
7133 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7134 case Intrinsic::amdgcn_workgroup_id_z:
7135 return legalizePreloadedArgIntrin(MI, MRI, B,
7136 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7137 case Intrinsic::amdgcn_wave_id:
7138 return legalizeWaveID(MI, B);
7139 case Intrinsic::amdgcn_lds_kernel_id:
7140 return legalizePreloadedArgIntrin(MI, MRI, B,
7141 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7142 case Intrinsic::amdgcn_dispatch_ptr:
7143 return legalizePreloadedArgIntrin(MI, MRI, B,
7144 ArgType: AMDGPUFunctionArgInfo::DISPATCH_PTR);
7145 case Intrinsic::amdgcn_queue_ptr:
7146 return legalizePreloadedArgIntrin(MI, MRI, B,
7147 ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR);
7148 case Intrinsic::amdgcn_implicit_buffer_ptr:
7149 return legalizePreloadedArgIntrin(
7150 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7151 case Intrinsic::amdgcn_dispatch_id:
7152 return legalizePreloadedArgIntrin(MI, MRI, B,
7153 ArgType: AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
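  // llvm.amdgcn.wavefrontsize folds to a subtarget constant here: 64 on
  // wave64 targets and 32 on wave32 targets.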
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/false,
                               /*IsFormat=*/false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/false,
                               /*IsFormat=*/true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/true,
                               /*IsFormat=*/true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/false,
                              /*IsTyped=*/false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/true,
                              /*IsTyped=*/false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/true,
                              /*IsTyped=*/true);
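  // All raw/struct buffer atomic flavors below (including the *_ptr variants,
  // which take a buffer resource pointer rather than a <4 x i32> descriptor)
  // share a single legalization path; legalizeBufferAtomic is expected to
  // rewrite each to the corresponding G_AMDGPU_BUFFER_ATOMIC_* pseudo.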
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
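  // The SWMMAC sparsity index operand (operand 5 for the FP variants below,
  // operand 7 for the integer variants) is expected as s32; a narrower index,
  // e.g. s16, is widened in place with G_ANYEXT.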
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    Register Index = MI.getOperand(5).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    Register Index = MI.getOperand(7).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;

    // FIXME: This is to workaround the inability of tablegen match combiners to
    // match intrinsics in patterns.
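    // Illustrative effect of the rewrite (register names are hypothetical):
    //   %d:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %a, %b, %c
    // becomes
    //   %d:_(s32) = G_AMDGPU_FMED3 %a, %b, %c
    // i.e. the opcode changes and the intrinsic ID operand (operand 1) is
    // dropped.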
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}