1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21#include "SIInstrInfo.h"
22#include "SIMachineFunctionInfo.h"
23#include "SIRegisterInfo.h"
24#include "Utils/AMDGPUBaseInfo.h"
25#include "llvm/ADT/ScopeExit.h"
26#include "llvm/BinaryFormat/ELF.h"
27#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
31#include "llvm/CodeGen/GlobalISel/Utils.h"
32#include "llvm/CodeGen/TargetOpcodes.h"
33#include "llvm/IR/DiagnosticInfo.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Hack until load/store selection patterns support any tuple of legal types.
46static cl::opt<bool> EnableNewLegality(
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
50 cl::init(Val: false),
51 cl::ReallyHidden);
52
53static constexpr unsigned MaxRegisterSize = 1024;
54
55// Round the number of vector elements up to the next power of two.
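// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.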
56static LLT getPow2VectorType(LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(Value: NElts);
59 return Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: Pow2NElts));
60}
61
62// Round the scalar size in bits up to the next power of two.
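// e.g. s48 -> s64, s96 -> s128.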
63static LLT getPow2ScalarType(LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Value: Bits);
66 return LLT::scalar(SizeInBits: Pow2Bits);
67}
68
69/// \returns true if this is an odd-sized vector which should be widened by adding an
70/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71/// excludes s1 vectors, which should always be scalarized.
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(NumElements: Ty.getNumElements() + 1, ScalarTy: EltTy));
107 };
108}
109
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
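// Split into roughly 64-bit pieces, e.g. <4 x s32> (128 bits) -> <2 x s32>.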
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 EC: ElementCount::getFixed(MinVal: NewNumElts), ScalarTy: EltTy));
119 };
120}
121
122// Increase the number of vector elements so the total size reaches the next
123// multiple of 32 bits.
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
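// Compute how many 32-bit slots are needed, then how many elements fill them,
// e.g. <3 x s16> (48 bits) needs two slots -> <4 x s16>.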
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: EltTy));
137 };
138}
139
140// Increase the number of vector elements to reach the next legal RegClass.
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
149 assert(Ty.getSizeInBits() < MaxRegisterSize);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NumElements: NewNumElts, ScalarSizeInBits: EltSize));
159 };
160}
161
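// Scalar form of a buffer resource: p8 -> s128, <N x p8> -> <N x s128>.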
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(SizeInBits: 128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(EC: NumElems, ScalarTy: LLT::scalar(SizeInBits: 128));
167}
168
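// Register form of a buffer resource: p8 -> <4 x s32>, <N x p8> -> <4N x s32>.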
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(NumElements: 4, ScalarTy: LLT::scalar(SizeInBits: 32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElements: NumElems * 4, ScalarTy: LLT::scalar(SizeInBits: 32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(SizeInBits: Size);
183 }
184
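 // e.g. <6 x s16> -> <3 x s32>, <8 x s16> -> <4 x s32>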
185 return LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32);
186}
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
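// Bitcast to a vector of 32-bit elements (or a single s32),
// e.g. s64 -> <2 x s32>, <4 x s16> -> <2 x s32>.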
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32));
202 };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228}
229
230static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
235static bool isRegisterVectorType(LLT Ty) {
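 // 16-bit element vectors must have an even number of elements (v2s16 units);
 // the total-size check happens in isRegisterType.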
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242// TODO: replace all uses of isRegisterType with isRegisterClassType
243static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Size: Ty.getSizeInBits()))
245 return false;
246
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
249
250 return true;
251}
252
253// Any combination of 32 or 64-bit elements up to the maximum register size, and
254// multiples of v2s16.
255static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Ty: Query.Types[TypeIdx]);
258 };
259}
260
261// RegisterType that doesn't have a corresponding RegClass.
262// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263// should be removed.
264static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267 return isRegisterType(Ty) &&
268 !SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: Ty.getSizeInBits());
269 };
270}
271
272static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(SizeInBits: 16) || EltTy.getSizeInBits() >= 32;
279 };
280}
281
282static const LLT S1 = LLT::scalar(SizeInBits: 1);
283static const LLT S8 = LLT::scalar(SizeInBits: 8);
284static const LLT S16 = LLT::scalar(SizeInBits: 16);
285static const LLT S32 = LLT::scalar(SizeInBits: 32);
286static const LLT S64 = LLT::scalar(SizeInBits: 64);
287static const LLT S96 = LLT::scalar(SizeInBits: 96);
288static const LLT S128 = LLT::scalar(SizeInBits: 128);
289static const LLT S160 = LLT::scalar(SizeInBits: 160);
290static const LLT S224 = LLT::scalar(SizeInBits: 224);
291static const LLT S256 = LLT::scalar(SizeInBits: 256);
292static const LLT S512 = LLT::scalar(SizeInBits: 512);
293static const LLT MaxScalar = LLT::scalar(SizeInBits: MaxRegisterSize);
294
295static const LLT V2S8 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 8);
296static const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
297static const LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
298static const LLT V6S16 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 16);
299static const LLT V8S16 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16);
300static const LLT V10S16 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 16);
301static const LLT V12S16 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 16);
302static const LLT V16S16 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 16);
303
304static const LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
305static const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
306static const LLT V4S32 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
307static const LLT V5S32 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 32);
308static const LLT V6S32 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 32);
309static const LLT V7S32 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 32);
310static const LLT V8S32 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32);
311static const LLT V9S32 = LLT::fixed_vector(NumElements: 9, ScalarSizeInBits: 32);
312static const LLT V10S32 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 32);
313static const LLT V11S32 = LLT::fixed_vector(NumElements: 11, ScalarSizeInBits: 32);
314static const LLT V12S32 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 32);
315static const LLT V16S32 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 32);
316static const LLT V32S32 = LLT::fixed_vector(NumElements: 32, ScalarSizeInBits: 32);
317
318static const LLT V2S64 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
319static const LLT V3S64 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 64);
320static const LLT V4S64 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 64);
321static const LLT V5S64 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 64);
322static const LLT V6S64 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 64);
323static const LLT V7S64 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 64);
324static const LLT V8S64 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 64);
325static const LLT V16S64 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 64);
326
327static const LLT V2S128 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 128);
328static const LLT V4S128 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 128);
329
330static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
331 S160, S224, S256, S512};
332
333static std::initializer_list<LLT> AllS16Vectors{
334 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
335
336static std::initializer_list<LLT> AllS32Vectors = {
337 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
338 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
339
340static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
341 V6S64, V7S64, V8S64, V16S64};
342
343// Checks whether a type is in the list of legal register types.
344static bool isRegisterClassType(LLT Ty) {
345 if (Ty.isPointerOrPointerVector())
346 Ty = Ty.changeElementType(NewEltTy: LLT::scalar(SizeInBits: Ty.getScalarSizeInBits()));
347
348 return is_contained(Set: AllS32Vectors, Element: Ty) || is_contained(Set: AllS64Vectors, Element: Ty) ||
349 is_contained(Set: AllScalarTypes, Element: Ty) || is_contained(Set: AllS16Vectors, Element: Ty);
350}
351
352static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
353 return [TypeIdx](const LegalityQuery &Query) {
354 return isRegisterClassType(Ty: Query.Types[TypeIdx]);
355 };
356}
357
358// If we have a truncating store or an extending load with a data size larger
359// than 32 bits, we need to reduce to a 32-bit type.
360static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
361 return [=](const LegalityQuery &Query) {
362 const LLT Ty = Query.Types[TypeIdx];
363 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
364 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
365 };
366}
367
368// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
369// handle some operations by just promoting the register during
370// selection. There are also d16 loads on GFX9+ which preserve the high bits.
371static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
372 bool IsLoad, bool IsAtomic) {
373 switch (AS) {
374 case AMDGPUAS::PRIVATE_ADDRESS:
375 // FIXME: Private element size.
376 return ST.enableFlatScratch() ? 128 : 32;
377 case AMDGPUAS::LOCAL_ADDRESS:
378 return ST.useDS128() ? 128 : 64;
379 case AMDGPUAS::GLOBAL_ADDRESS:
380 case AMDGPUAS::CONSTANT_ADDRESS:
381 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
382 case AMDGPUAS::BUFFER_RESOURCE:
383 // Treat constant and global as identical. SMRD loads are sometimes usable for
384 // global loads (ideally constant address space should be eliminated)
385 // depending on the context. Legality cannot be context dependent, but
386 // RegBankSelect can split the load as necessary depending on the pointer
387 // register bank/uniformity and if the memory is invariant or not written in a
388 // kernel.
389 return IsLoad ? 512 : 128;
390 default:
391 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
392 // if they may alias scratch depending on the subtarget. This needs to be
393 // moved to custom handling to use addressMayBeAccessedAsPrivate
394 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
395 }
396}
397
398static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
399 const LegalityQuery &Query) {
400 const LLT Ty = Query.Types[0];
401
402 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
403 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
404
405 unsigned RegSize = Ty.getSizeInBits();
406 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
407 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
408 unsigned AS = Query.Types[1].getAddressSpace();
409
410 // All of these need to be custom lowered to cast the pointer operand.
411 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
412 return false;
413
414 // Do not handle extending vector loads.
415 if (Ty.isVector() && MemSize != RegSize)
416 return false;
417
418 // TODO: We should be able to widen loads if the alignment is high enough, but
419 // we also need to modify the memory access size.
420#if 0
421 // Accept widening loads based on alignment.
422 if (IsLoad && MemSize < Size)
423 MemSize = std::max(MemSize, Align);
424#endif
425
426 // Only 1-byte and 2-byte to 32-bit extloads are valid.
427 if (MemSize != RegSize && RegSize != 32)
428 return false;
429
430 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
431 IsAtomic: Query.MMODescrs[0].Ordering !=
432 AtomicOrdering::NotAtomic))
433 return false;
434
435 switch (MemSize) {
436 case 8:
437 case 16:
438 case 32:
439 case 64:
440 case 128:
441 break;
442 case 96:
443 if (!ST.hasDwordx3LoadStores())
444 return false;
445 break;
446 case 256:
447 case 512:
448 // These may contextually need to be broken down.
449 break;
450 default:
451 return false;
452 }
453
454 assert(RegSize >= MemSize);
455
456 if (AlignBits < MemSize) {
457 const SITargetLowering *TLI = ST.getTargetLowering();
458 if (!TLI->allowsMisalignedMemoryAccessesImpl(Size: MemSize, AddrSpace: AS,
459 Alignment: Align(AlignBits / 8)))
460 return false;
461 }
462
463 return true;
464}
465
466// The newer buffer intrinsic forms take their resource arguments as
467// pointers in address space 8, aka s128 values. However, in order to not break
468// SelectionDAG, the underlying operations have to continue to take v4i32
469// arguments. Therefore, we convert resource pointers (or vectors of them)
470// to integer values here.
471static bool hasBufferRsrcWorkaround(const LLT Ty) {
472 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
473 return true;
474 if (Ty.isVector()) {
475 const LLT ElemTy = Ty.getElementType();
476 return hasBufferRsrcWorkaround(Ty: ElemTy);
477 }
478 return false;
479}
480
481// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
482// work around this. Eventually it should ignore the type for loads and only care
483// about the size. Return true in cases where we will work around this for now by
484// bitcasting.
485static bool loadStoreBitcastWorkaround(const LLT Ty) {
486 if (EnableNewLegality)
487 return false;
488
489 const unsigned Size = Ty.getSizeInBits();
490 if (Size <= 64)
491 return false;
492 // Address space 8 pointers get their own workaround.
493 if (hasBufferRsrcWorkaround(Ty))
494 return false;
495 if (!Ty.isVector())
496 return true;
497
498 if (Ty.isPointerVector())
499 return true;
500
501 unsigned EltSize = Ty.getScalarSizeInBits();
502 return EltSize != 32 && EltSize != 64;
503}
504
505static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
506 const LLT Ty = Query.Types[0];
507 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
508 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
509}
510
511/// Return true if a load or store of the type should be lowered with a bitcast
512/// to a different type.
513static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
514 const LLT MemTy) {
515 const unsigned MemSizeInBits = MemTy.getSizeInBits();
516 const unsigned Size = Ty.getSizeInBits();
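 // Extending load / truncating store: only bitcast small (<= 32-bit) vectors.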
517 if (Size != MemSizeInBits)
518 return Size <= 32 && Ty.isVector();
519
520 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
521 return true;
522
523 // Don't try to handle bitcasting vector ext loads for now.
524 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
525 (Size <= 32 || isRegisterSize(Size)) &&
526 !isRegisterVectorElementType(EltTy: Ty.getElementType());
527}
528
529/// Return true if we should legalize a load by widening an odd-sized memory
530/// access up to the alignment. Note that in this case the memory access itself
531/// changes, not the size of the result register.
532static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
533 uint64_t AlignInBits, unsigned AddrSpace,
534 unsigned Opcode) {
535 unsigned SizeInBits = MemoryTy.getSizeInBits();
536 // We don't want to widen cases that are naturally legal.
537 if (isPowerOf2_32(Value: SizeInBits))
538 return false;
539
540 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
541 // end up widening these for a scalar load during RegBankSelect, if we don't
542 // have 96-bit scalar loads.
543 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
544 return false;
545
546 if (SizeInBits >= maxSizeForAddrSpace(ST, AS: AddrSpace, IsLoad: Opcode, IsAtomic: false))
547 return false;
548
549 // A load is known dereferenceable up to the alignment, so it's legal to widen
550 // to it.
551 //
552 // TODO: Could check dereferenceable for less aligned cases.
553 unsigned RoundedSize = NextPowerOf2(A: SizeInBits);
554 if (AlignInBits < RoundedSize)
555 return false;
556
557 // Do not widen if it would introduce a slow unaligned load.
558 const SITargetLowering *TLI = ST.getTargetLowering();
559 unsigned Fast = 0;
560 return TLI->allowsMisalignedMemoryAccessesImpl(
561 Size: RoundedSize, AddrSpace, Alignment: Align(AlignInBits / 8),
562 Flags: MachineMemOperand::MOLoad, IsFast: &Fast) &&
563 Fast;
564}
565
566static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
567 unsigned Opcode) {
568 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
569 return false;
570
571 return shouldWidenLoad(ST, MemoryTy: Query.MMODescrs[0].MemoryTy,
572 AlignInBits: Query.MMODescrs[0].AlignInBits,
573 AddrSpace: Query.Types[1].getAddressSpace(), Opcode);
574}
575
576/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
577/// type of the operand `Idx` and then to transform it to a `p8` via bitcasts
578/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
579static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
580 MachineRegisterInfo &MRI, unsigned Idx) {
581 MachineOperand &MO = MI.getOperand(i: Idx);
582
583 const LLT PointerTy = MRI.getType(Reg: MO.getReg());
584
585 // Paranoidly prevent us from doing this multiple times.
586 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
587 return PointerTy;
588
589 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
590 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
591 if (!PointerTy.isVector()) {
592 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
593 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
594 const LLT S32 = LLT::scalar(SizeInBits: 32);
595
596 Register VectorReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
597 std::array<Register, 4> VectorElems;
598 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
599 for (unsigned I = 0; I < NumParts; ++I)
600 VectorElems[I] =
601 B.buildExtractVectorElementConstant(Res: S32, Val: VectorReg, Idx: I).getReg(Idx: 0);
602 B.buildMergeValues(Res: MO, Ops: VectorElems);
603 MO.setReg(VectorReg);
604 return VectorTy;
605 }
606 Register BitcastReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
607 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
608 auto Scalar = B.buildBitcast(Dst: ScalarTy, Src: BitcastReg);
609 B.buildIntToPtr(Dst: MO, Src: Scalar);
610 MO.setReg(BitcastReg);
611
612 return VectorTy;
613}
614
615/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
616/// the form in which the value must be in order to be passed to the low-level
617/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
618/// needed in order to account for the fact that we can't define a register
619/// class for s128 without breaking SelectionDAG.
620static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
621 MachineRegisterInfo &MRI = *B.getMRI();
622 const LLT PointerTy = MRI.getType(Reg: Pointer);
623 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
624 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
625
626 if (!PointerTy.isVector()) {
627 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
628 SmallVector<Register, 4> PointerParts;
629 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
630 auto Unmerged = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: Pointer);
631 for (unsigned I = 0; I < NumParts; ++I)
632 PointerParts.push_back(Elt: Unmerged.getReg(Idx: I));
633 return B.buildBuildVector(Res: VectorTy, Ops: PointerParts).getReg(Idx: 0);
634 }
635 Register Scalar = B.buildPtrToInt(Dst: ScalarTy, Src: Pointer).getReg(Idx: 0);
636 return B.buildBitcast(Dst: VectorTy, Src: Scalar).getReg(Idx: 0);
637}
638
639static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
640 unsigned Idx) {
641 MachineOperand &MO = MI.getOperand(i: Idx);
642
643 const LLT PointerTy = B.getMRI()->getType(Reg: MO.getReg());
644 // Paranoidly prevent us from doing this multiple times.
645 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
646 return;
647 MO.setReg(castBufferRsrcToV4I32(Pointer: MO.getReg(), B));
648}
649
650AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
651 const GCNTargetMachine &TM)
652 : ST(ST_) {
653 using namespace TargetOpcode;
654
655 auto GetAddrSpacePtr = [&TM](unsigned AS) {
656 return LLT::pointer(AddressSpace: AS, SizeInBits: TM.getPointerSizeInBits(AS));
657 };
658
659 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
660 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
661 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
662 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
663 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
664 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
665 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
666 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
667 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
668 const LLT BufferStridedPtr =
669 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
670
671 const LLT CodePtr = FlatPtr;
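 // Code pointers (e.g. for G_BLOCK_ADDR) are treated as flat pointers.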
672
673 const std::initializer_list<LLT> AddrSpaces64 = {
674 GlobalPtr, ConstantPtr, FlatPtr
675 };
676
677 const std::initializer_list<LLT> AddrSpaces32 = {
678 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
679 };
680
681 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
682
683 const std::initializer_list<LLT> FPTypesBase = {
684 S32, S64
685 };
686
687 const std::initializer_list<LLT> FPTypes16 = {
688 S32, S64, S16
689 };
690
691 const std::initializer_list<LLT> FPTypesPK16 = {
692 S32, S64, S16, V2S16
693 };
694
695 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
696
697 // s1 for VCC branches, s32 for SCC branches.
698 getActionDefinitionsBuilder(Opcode: G_BRCOND).legalFor(Types: {S1, S32});
699
700 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
701 // elements for v3s16
702 getActionDefinitionsBuilder(Opcode: G_PHI)
703 .legalFor(Types: {S32, S64, V2S16, S16, V4S16, S1, S128, S256})
704 .legalFor(Types: AllS32Vectors)
705 .legalFor(Types: AllS64Vectors)
706 .legalFor(Types: AddrSpaces64)
707 .legalFor(Types: AddrSpaces32)
708 .legalFor(Types: AddrSpaces128)
709 .legalIf(Predicate: isPointer(TypeIdx: 0))
710 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S256)
711 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
712 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16)
713 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
714 .scalarize(TypeIdx: 0);
715
716 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
717 // Full set of gfx9 features.
718 if (ST.hasScalarAddSub64()) {
719 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
720 .legalFor(Types: {S64, S32, S16, V2S16})
721 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
722 .scalarize(TypeIdx: 0)
723 .minScalar(TypeIdx: 0, Ty: S16)
724 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
725 .maxScalar(TypeIdx: 0, Ty: S32);
726 } else {
727 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
728 .legalFor(Types: {S32, S16, V2S16})
729 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
730 .scalarize(TypeIdx: 0)
731 .minScalar(TypeIdx: 0, Ty: S16)
732 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
733 .maxScalar(TypeIdx: 0, Ty: S32);
734 }
735
736 if (ST.hasScalarSMulU64()) {
737 getActionDefinitionsBuilder(Opcode: G_MUL)
738 .legalFor(Types: {S64, S32, S16, V2S16})
739 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
740 .scalarize(TypeIdx: 0)
741 .minScalar(TypeIdx: 0, Ty: S16)
742 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
743 .custom();
744 } else {
745 getActionDefinitionsBuilder(Opcode: G_MUL)
746 .legalFor(Types: {S32, S16, V2S16})
747 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
748 .scalarize(TypeIdx: 0)
749 .minScalar(TypeIdx: 0, Ty: S16)
750 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
751 .custom();
752 }
753 assert(ST.hasMad64_32());
754
755 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
756 .legalFor(Types: {S32, S16, V2S16}) // Clamp modifier
757 .minScalarOrElt(TypeIdx: 0, Ty: S16)
758 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
759 .scalarize(TypeIdx: 0)
760 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
761 .lower();
762 } else if (ST.has16BitInsts()) {
763 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
764 .legalFor(Types: {S32, S16})
765 .minScalar(TypeIdx: 0, Ty: S16)
766 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
767 .maxScalar(TypeIdx: 0, Ty: S32)
768 .scalarize(TypeIdx: 0);
769
770 getActionDefinitionsBuilder(Opcode: G_MUL)
771 .legalFor(Types: {S32, S16})
772 .scalarize(TypeIdx: 0)
773 .minScalar(TypeIdx: 0, Ty: S16)
774 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
775 .custom();
776 assert(ST.hasMad64_32());
777
778 // Technically the saturating operations require clamp bit support, but this
779 // was introduced at the same time as 16-bit operations.
780 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
781 .legalFor(Types: {S32, S16}) // Clamp modifier
782 .minScalar(TypeIdx: 0, Ty: S16)
783 .scalarize(TypeIdx: 0)
784 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 16)
785 .lower();
786
787 // We're just lowering this, but it helps get a better result to try to
788 // coerce to the desired type first.
789 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
790 .minScalar(TypeIdx: 0, Ty: S16)
791 .scalarize(TypeIdx: 0)
792 .lower();
793 } else {
794 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
795 .legalFor(Types: {S32})
796 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
797 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
798 .scalarize(TypeIdx: 0);
799
800 auto &Mul = getActionDefinitionsBuilder(Opcode: G_MUL)
801 .legalFor(Types: {S32})
802 .scalarize(TypeIdx: 0)
803 .minScalar(TypeIdx: 0, Ty: S32)
804 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32);
805
806 if (ST.hasMad64_32())
807 Mul.custom();
808 else
809 Mul.maxScalar(TypeIdx: 0, Ty: S32);
810
811 if (ST.hasIntClamp()) {
812 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
813 .legalFor(Types: {S32}) // Clamp modifier.
814 .scalarize(TypeIdx: 0)
815 .minScalarOrElt(TypeIdx: 0, Ty: S32)
816 .lower();
817 } else {
818 // Clamp bit support was added in VI, along with 16-bit operations.
819 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
820 .minScalar(TypeIdx: 0, Ty: S32)
821 .scalarize(TypeIdx: 0)
822 .lower();
823 }
824
825 // FIXME: DAG expansion gets better results. The widening uses the smaller
826 // range values and goes for the min/max lowering directly.
827 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
828 .minScalar(TypeIdx: 0, Ty: S32)
829 .scalarize(TypeIdx: 0)
830 .lower();
831 }
832
833 getActionDefinitionsBuilder(
834 Opcodes: {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
835 .customFor(Types: {S32, S64})
836 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
837 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
838 .scalarize(TypeIdx: 0);
839
840 auto &Mulh = getActionDefinitionsBuilder(Opcodes: {G_UMULH, G_SMULH})
841 .legalFor(Types: {S32})
842 .maxScalar(TypeIdx: 0, Ty: S32);
843
844 if (ST.hasVOP3PInsts()) {
845 Mulh
846 .clampMaxNumElements(TypeIdx: 0, EltTy: S8, MaxElements: 2)
847 .lowerFor(Types: {V2S8});
848 }
849
850 Mulh
851 .scalarize(TypeIdx: 0)
852 .lower();
853
854 // Report legal for any types we can handle anywhere. For the cases only legal
855 // on the SALU, RegBankSelect will be able to re-legalize.
856 getActionDefinitionsBuilder(Opcodes: {G_AND, G_OR, G_XOR})
857 .legalFor(Types: {S32, S1, S64, V2S32, S16, V2S16, V4S16})
858 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
859 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
860 .fewerElementsIf(Predicate: vectorWiderThan(TypeIdx: 0, Size: 64), Mutation: fewerEltsToSize64Vector(TypeIdx: 0))
861 .widenScalarToNextPow2(TypeIdx: 0)
862 .scalarize(TypeIdx: 0);
863
864 getActionDefinitionsBuilder(
865 Opcodes: {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
866 .legalFor(Types: {{S32, S1}, {S32, S32}})
867 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
868 .scalarize(TypeIdx: 0);
869
870 getActionDefinitionsBuilder(Opcode: G_BITCAST)
871 // Don't worry about the size constraint.
872 .legalIf(Predicate: all(P0: isRegisterClassType(TypeIdx: 0), P1: isRegisterClassType(TypeIdx: 1)))
873 .lower();
874
875 getActionDefinitionsBuilder(Opcode: G_CONSTANT)
876 .legalFor(Types: {S1, S32, S64, S16, GlobalPtr,
877 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
878 .legalIf(Predicate: isPointer(TypeIdx: 0))
879 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
880 .widenScalarToNextPow2(TypeIdx: 0);
881
882 getActionDefinitionsBuilder(Opcode: G_FCONSTANT)
883 .legalFor(Types: {S32, S64, S16})
884 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
885
886 getActionDefinitionsBuilder(Opcodes: {G_IMPLICIT_DEF, G_FREEZE})
887 .legalIf(Predicate: isRegisterType(TypeIdx: 0))
888 // s1 and s16 are special cases because they have legal operations on
889 // them, but don't really occupy registers in the normal way.
890 .legalFor(Types: {S1, S16})
891 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
892 .clampScalarOrElt(TypeIdx: 0, MinTy: S32, MaxTy: MaxScalar)
893 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
894 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16);
895
896 getActionDefinitionsBuilder(Opcode: G_FRAME_INDEX).legalFor(Types: {PrivatePtr});
897
898 // If the amount is divergent, we have to do a wave reduction to get the
899 // maximum value, so this is expanded during RegBankSelect.
900 getActionDefinitionsBuilder(Opcode: G_DYN_STACKALLOC)
901 .legalFor(Types: {{PrivatePtr, S32}});
902
903 getActionDefinitionsBuilder(Opcode: G_STACKSAVE)
904 .customFor(Types: {PrivatePtr});
905 getActionDefinitionsBuilder(Opcode: G_STACKRESTORE)
906 .legalFor(Types: {PrivatePtr});
907
908 getActionDefinitionsBuilder(Opcodes: {G_GET_FPENV, G_SET_FPENV}).customFor(Types: {S64});
909
910 getActionDefinitionsBuilder(Opcode: G_GLOBAL_VALUE)
911 .customIf(Predicate: typeIsNot(TypeIdx: 0, Type: PrivatePtr));
912
913 getActionDefinitionsBuilder(Opcode: G_BLOCK_ADDR).legalFor(Types: {CodePtr});
914
915 auto &FPOpActions = getActionDefinitionsBuilder(
916 Opcodes: { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
917 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
918 .legalFor(Types: {S32, S64});
919 auto &TrigActions = getActionDefinitionsBuilder(Opcodes: {G_FSIN, G_FCOS})
920 .customFor(Types: {S32, S64});
921 auto &FDIVActions = getActionDefinitionsBuilder(Opcode: G_FDIV)
922 .customFor(Types: {S32, S64});
923
924 if (ST.has16BitInsts()) {
925 if (ST.hasVOP3PInsts())
926 FPOpActions.legalFor(Types: {S16, V2S16});
927 else
928 FPOpActions.legalFor(Types: {S16});
929
930 TrigActions.customFor(Types: {S16});
931 FDIVActions.customFor(Types: {S16});
932 }
933
934 if (ST.hasPackedFP32Ops()) {
935 FPOpActions.legalFor(Types: {V2S32});
936 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S32, NumElts: 2);
937 }
938
939 auto &MinNumMaxNum = getActionDefinitionsBuilder(Opcodes: {
940 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
941
942 if (ST.hasVOP3PInsts()) {
943 MinNumMaxNum.customFor(Types: FPTypesPK16)
944 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
945 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
946 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
947 .scalarize(TypeIdx: 0);
948 } else if (ST.has16BitInsts()) {
949 MinNumMaxNum.customFor(Types: FPTypes16)
950 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
951 .scalarize(TypeIdx: 0);
952 } else {
953 MinNumMaxNum.customFor(Types: FPTypesBase)
954 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
955 .scalarize(TypeIdx: 0);
956 }
957
958 if (ST.hasVOP3PInsts())
959 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
960
961 FPOpActions
962 .scalarize(TypeIdx: 0)
963 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
964
965 TrigActions
966 .scalarize(TypeIdx: 0)
967 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
968
969 FDIVActions
970 .scalarize(TypeIdx: 0)
971 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
972
973 getActionDefinitionsBuilder(Opcodes: {G_FNEG, G_FABS})
974 .legalFor(Types: FPTypesPK16)
975 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
976 .scalarize(TypeIdx: 0)
977 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
978
979 if (ST.has16BitInsts()) {
980 getActionDefinitionsBuilder(Opcode: G_FSQRT)
981 .legalFor(Types: {S16})
982 .customFor(Types: {S32, S64})
983 .scalarize(TypeIdx: 0)
984 .unsupported();
985 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
986 .legalFor(Types: {S32, S64, S16})
987 .scalarize(TypeIdx: 0)
988 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
989
990 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
991 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}})
992 .scalarize(TypeIdx: 0)
993 .maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16)
994 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
995 .lower();
996
997 getActionDefinitionsBuilder(Opcode: G_FFREXP)
998 .customFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
999 .scalarize(TypeIdx: 0)
1000 .lower();
1001 } else {
1002 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1003 .customFor(Types: {S32, S64, S16})
1004 .scalarize(TypeIdx: 0)
1005 .unsupported();
1006
1007
1008 if (ST.hasFractBug()) {
1009 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1010 .customFor(Types: {S64})
1011 .legalFor(Types: {S32, S64})
1012 .scalarize(TypeIdx: 0)
1013 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1014 } else {
1015 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1016 .legalFor(Types: {S32, S64})
1017 .scalarize(TypeIdx: 0)
1018 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1019 }
1020
1021 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1022 .legalFor(Types: {{S32, S32}, {S64, S32}})
1023 .scalarize(TypeIdx: 0)
1024 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1025 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1026 .lower();
1027
1028 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1029 .customFor(Types: {{S32, S32}, {S64, S32}})
1030 .scalarize(TypeIdx: 0)
1031 .minScalar(TypeIdx: 0, Ty: S32)
1032 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1033 .lower();
1034 }
1035
1036 getActionDefinitionsBuilder(Opcode: G_FPTRUNC)
1037 .legalFor(Types: {{S32, S64}, {S16, S32}})
1038 .scalarize(TypeIdx: 0)
1039 .lower();
1040
1041 getActionDefinitionsBuilder(Opcode: G_FPEXT)
1042 .legalFor(Types: {{S64, S32}, {S32, S16}})
1043 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32))
1044 .scalarize(TypeIdx: 0);
1045
1046 auto &FSubActions = getActionDefinitionsBuilder(Opcodes: {G_FSUB, G_STRICT_FSUB});
1047 if (ST.has16BitInsts()) {
1048 FSubActions
1049 // Use actual fsub instruction
1050 .legalFor(Types: {S32, S16})
1051 // Must use fadd + fneg
1052 .lowerFor(Types: {S64, V2S16});
1053 } else {
1054 FSubActions
1055 // Use actual fsub instruction
1056 .legalFor(Types: {S32})
1057 // Must use fadd + fneg
1058 .lowerFor(Types: {S64, S16, V2S16});
1059 }
1060
1061 FSubActions
1062 .scalarize(TypeIdx: 0)
1063 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1064
1065 // Whether this is legal depends on the floating point mode for the function.
1066 auto &FMad = getActionDefinitionsBuilder(Opcode: G_FMAD);
1067 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1068 FMad.customFor(Types: {S32, S16});
1069 else if (ST.hasMadMacF32Insts())
1070 FMad.customFor(Types: {S32});
1071 else if (ST.hasMadF16())
1072 FMad.customFor(Types: {S16});
1073 FMad.scalarize(TypeIdx: 0)
1074 .lower();
1075
1076 auto &FRem = getActionDefinitionsBuilder(Opcode: G_FREM);
1077 if (ST.has16BitInsts()) {
1078 FRem.customFor(Types: {S16, S32, S64});
1079 } else {
1080 FRem.minScalar(TypeIdx: 0, Ty: S32)
1081 .customFor(Types: {S32, S64});
1082 }
1083 FRem.scalarize(TypeIdx: 0);
1084
1085 // TODO: Do we need to clamp maximum bitwidth?
1086 getActionDefinitionsBuilder(Opcode: G_TRUNC)
1087 .legalIf(Predicate: isScalar(TypeIdx: 0))
1088 .legalFor(Types: {{V2S16, V2S32}})
1089 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1090 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1091 // situations (like an invalid implicit use), we don't want to loop forever
1092 // in the legalizer.
1093 .fewerElementsIf(Predicate: elementTypeIsLegal(TypeIdx: 0), Mutation: LegalizeMutations::scalarize(TypeIdx: 0))
1094 .alwaysLegal();
1095
1096 getActionDefinitionsBuilder(Opcodes: {G_SEXT, G_ZEXT, G_ANYEXT})
1097 .legalFor(Types: {{S64, S32}, {S32, S16}, {S64, S16},
1098 {S32, S1}, {S64, S1}, {S16, S1}})
1099 .scalarize(TypeIdx: 0)
1100 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1101 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1102
1103 // TODO: Split s1->s64 during regbankselect for VALU.
1104 auto &IToFP = getActionDefinitionsBuilder(Opcodes: {G_SITOFP, G_UITOFP})
1105 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S32}})
1106 .lowerIf(Predicate: typeIs(TypeIdx: 1, TypesInit: S1))
1107 .customFor(Types: {{S32, S64}, {S64, S64}});
1108 if (ST.has16BitInsts())
1109 IToFP.legalFor(Types: {{S16, S16}});
1110 IToFP.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1111 .minScalar(TypeIdx: 0, Ty: S32)
1112 .scalarize(TypeIdx: 0)
1113 .widenScalarToNextPow2(TypeIdx: 1);
1114
1115 auto &FPToI = getActionDefinitionsBuilder(Opcodes: {G_FPTOSI, G_FPTOUI})
1116 .legalFor(Types: {{S32, S32}, {S32, S64}, {S32, S16}})
1117 .customFor(Types: {{S64, S32}, {S64, S64}})
1118 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32));
1119 if (ST.has16BitInsts())
1120 FPToI.legalFor(Types: {{S16, S16}});
1121 else
1122 FPToI.minScalar(TypeIdx: 1, Ty: S32);
1123
1124 FPToI.minScalar(TypeIdx: 0, Ty: S32)
1125 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1126 .scalarize(TypeIdx: 0)
1127 .lower();
1128
1129 getActionDefinitionsBuilder(Opcode: G_INTRINSIC_FPTRUNC_ROUND)
1130 .customFor(Types: {S16, S32})
1131 .scalarize(TypeIdx: 0)
1132 .lower();
1133
1134 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1135 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1136 .scalarize(TypeIdx: 0)
1137 .lower();
1138
1139 if (ST.has16BitInsts()) {
1140 getActionDefinitionsBuilder(
1141 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1142 .legalFor(Types: {S16, S32, S64})
1143 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1144 .scalarize(TypeIdx: 0);
1145 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1146 getActionDefinitionsBuilder(
1147 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1148 .legalFor(Types: {S32, S64})
1149 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1150 .scalarize(TypeIdx: 0);
1151 } else {
1152 getActionDefinitionsBuilder(
1153 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1154 .legalFor(Types: {S32})
1155 .customFor(Types: {S64})
1156 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1157 .scalarize(TypeIdx: 0);
1158 }
1159
1160 getActionDefinitionsBuilder(Opcode: G_PTR_ADD)
1161 .unsupportedFor(Types: {BufferFatPtr, BufferStridedPtr, RsrcPtr})
1162 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: sameSize(TypeIdx0: 0, TypeIdx1: 1)))
1163 .scalarize(TypeIdx: 0)
1164 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0);
1165
1166 getActionDefinitionsBuilder(Opcode: G_PTRMASK)
1167 .legalIf(Predicate: all(P0: sameSize(TypeIdx0: 0, TypeIdx1: 1), P1: typeInSet(TypeIdx: 1, TypesInit: {S64, S32})))
1168 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0)
1169 .scalarize(TypeIdx: 0);
1170
1171 auto &CmpBuilder =
1172 getActionDefinitionsBuilder(Opcode: G_ICMP)
1173 // The compare output type differs based on the register bank of the output,
1174 // so make both s1 and s32 legal.
1175 //
1176 // Scalar compares producing output in scc will be promoted to s32, as that
1177 // is the allocatable register type that will be needed for the copy from
1178 // scc. This will be promoted during RegBankSelect, and we assume something
1179 // before that won't try to use s32 result types.
1180 //
1181 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1182 // bank.
1183 .legalForCartesianProduct(
1184 Types0: {S1}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1185 .legalForCartesianProduct(
1186 Types0: {S32}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1187 if (ST.has16BitInsts()) {
1188 CmpBuilder.legalFor(Types: {{S1, S16}});
1189 }
1190
1191 CmpBuilder
1192 .widenScalarToNextPow2(TypeIdx: 1)
1193 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1194 .scalarize(TypeIdx: 0)
1195 .legalIf(Predicate: all(P0: typeInSet(TypeIdx: 0, TypesInit: {S1, S32}), P1: isPointer(TypeIdx: 1)));
1196
1197 auto &FCmpBuilder =
1198 getActionDefinitionsBuilder(Opcode: G_FCMP).legalForCartesianProduct(
1199 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1200
1201 if (ST.hasSALUFloatInsts())
1202 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1203
1204 FCmpBuilder
1205 .widenScalarToNextPow2(1)
1206 .clampScalar(1, S32, S64)
1207 .scalarize(0);
1208
1209 // FIXME: fpow has a selection pattern that should move to custom lowering.
1210 auto &ExpOps = getActionDefinitionsBuilder(Opcode: G_FPOW);
1211 if (ST.has16BitInsts())
1212 ExpOps.customFor(Types: {{S32}, {S16}});
1213 else
1214 ExpOps.customFor(Types: {S32});
1215 ExpOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1216 .scalarize(TypeIdx: 0);
1217
1218 getActionDefinitionsBuilder(Opcode: G_FPOWI)
1219 .clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1220 .lower();
1221
1222 auto &Log2Ops = getActionDefinitionsBuilder(Opcodes: {G_FLOG2, G_FEXP2});
1223 Log2Ops.customFor(Types: {S32});
1224 if (ST.has16BitInsts())
1225 Log2Ops.legalFor(Types: {S16});
1226 else
1227 Log2Ops.customFor(Types: {S16});
1228 Log2Ops.scalarize(TypeIdx: 0)
1229 .lower();
1230
1231 auto &LogOps =
1232 getActionDefinitionsBuilder(Opcodes: {G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1233 LogOps.customFor(Types: {S32, S16});
1234 LogOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1235 .scalarize(TypeIdx: 0);
1236
1237 // The 64-bit versions produce 32-bit results, but only on the SALU.
1238 getActionDefinitionsBuilder(Opcode: G_CTPOP)
1239 .legalFor(Types: {{S32, S32}, {S32, S64}})
1240 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1241 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1242 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1243 .scalarize(TypeIdx: 0)
1244 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1245
1246 // If no 16-bit instruction is available, lower into different instructions.
1247 if (ST.has16BitInsts())
1248 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1249 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypes16)
1250 .widenScalarToNextPow2(TypeIdx: 1)
1251 .scalarize(TypeIdx: 0)
1252 .lower();
1253 else
1254 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1255 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypesBase)
1256 .lowerFor(Types: {S1, S16})
1257 .widenScalarToNextPow2(TypeIdx: 1)
1258 .scalarize(TypeIdx: 0)
1259 .lower();
1260
1261 // The hardware instructions return a different result on 0 than the generic
1262 // instructions expect. The hardware produces -1, but these produce the
1263 // bitwidth.
1264 getActionDefinitionsBuilder(Opcodes: {G_CTLZ, G_CTTZ})
1265 .scalarize(TypeIdx: 0)
1266 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1267 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1268 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1269 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1270 .custom();
1271
1272 // The 64-bit versions produce 32-bit results, but only on the SALU.
1273 getActionDefinitionsBuilder(Opcodes: {G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1274 .legalFor(Types: {{S32, S32}, {S32, S64}})
1275 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1276 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1277 .scalarize(TypeIdx: 0)
1278 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1279 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1280
1281 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1282 // RegBankSelect.
1283 getActionDefinitionsBuilder(Opcode: G_BITREVERSE)
1284 .legalFor(Types: {S32, S64})
1285 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1286 .scalarize(TypeIdx: 0)
1287 .widenScalarToNextPow2(TypeIdx: 0);
1288
1289 if (ST.has16BitInsts()) {
1290 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1291 .legalFor(Types: {S16, S32, V2S16})
1292 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1293 // FIXME: Fixing non-power-of-2 before clamp is a workaround for
1294 // narrowScalar limitation.
1295 .widenScalarToNextPow2(TypeIdx: 0)
1296 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S32)
1297 .scalarize(TypeIdx: 0);
1298
1299 if (ST.hasVOP3PInsts()) {
1300 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1301 .legalFor(Types: {S32, S16, V2S16})
1302 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1303 .minScalar(TypeIdx: 0, Ty: S16)
1304 .widenScalarToNextPow2(TypeIdx: 0)
1305 .scalarize(TypeIdx: 0)
1306 .lower();
1307 } else {
1308 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1309 .legalFor(Types: {S32, S16})
1310 .widenScalarToNextPow2(TypeIdx: 0)
1311 .minScalar(TypeIdx: 0, Ty: S16)
1312 .scalarize(TypeIdx: 0)
1313 .lower();
1314 }
1315 } else {
1316 // TODO: Should have same legality without v_perm_b32
1317 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1318 .legalFor(Types: {S32})
1319 .lowerIf(Predicate: scalarNarrowerThan(TypeIdx: 0, Size: 32))
1320 // FIXME: Fixing non-power-of-2 before clamp is a workaround for
1321 // narrowScalar limitation.
1322 .widenScalarToNextPow2(TypeIdx: 0)
1323 .maxScalar(TypeIdx: 0, Ty: S32)
1324 .scalarize(TypeIdx: 0)
1325 .lower();
1326
1327 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1328 .legalFor(Types: {S32})
1329 .minScalar(TypeIdx: 0, Ty: S32)
1330 .widenScalarToNextPow2(TypeIdx: 0)
1331 .scalarize(TypeIdx: 0)
1332 .lower();
1333 }
1334
1335 getActionDefinitionsBuilder(Opcode: G_INTTOPTR)
1336 // List the common cases
1337 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1338 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1339 .scalarize(TypeIdx: 0)
1340 // Accept any address space as long as the size matches
1341 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1342 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 1, TypeIdx1: 0),
1343 Mutation: [](const LegalityQuery &Query) {
1344 return std::pair(
1345 1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1346 })
1347 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 1, TypeIdx1: 0), Mutation: [](const LegalityQuery &Query) {
1348 return std::pair(1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1349 });
1350
1351 getActionDefinitionsBuilder(Opcode: G_PTRTOINT)
1352 // List the common cases
1353 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1354 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1355 .scalarize(TypeIdx: 0)
1356 // Accept any address space as long as the size matches
1357 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1358 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 0, TypeIdx1: 1),
1359 Mutation: [](const LegalityQuery &Query) {
1360 return std::pair(
1361 0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1362 })
1363 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 0, TypeIdx1: 1), Mutation: [](const LegalityQuery &Query) {
1364 return std::pair(0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1365 });
1366
1367 getActionDefinitionsBuilder(Opcode: G_ADDRSPACE_CAST)
1368 .scalarize(TypeIdx: 0)
1369 .custom();
1370
1371 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1372 bool IsLoad) -> bool {
1373 const LLT DstTy = Query.Types[0];
1374
1375 // Split vector extloads.
1376 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1377
1378 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1379 return true;
1380
1381 const LLT PtrTy = Query.Types[1];
1382 unsigned AS = PtrTy.getAddressSpace();
1383 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1384 IsAtomic: Query.MMODescrs[0].Ordering !=
1385 AtomicOrdering::NotAtomic))
1386 return true;
1387
1388 // Catch weird-sized loads that don't evenly divide into the access sizes.
1389 // TODO: May be able to widen depending on alignment etc.
1390 unsigned NumRegs = (MemSize + 31) / 32;
1391 if (NumRegs == 3) {
1392 if (!ST.hasDwordx3LoadStores())
1393 return true;
1394 } else {
1395 // If the alignment allows, these should have been widened.
1396 if (!isPowerOf2_32(Value: NumRegs))
1397 return true;
1398 }
1399
1400 return false;
1401 };
1402
1403 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1404 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1405 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1406
1407 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1408 // LDS
1409 // TODO: Unsupported flat for SI.
1410
1411 for (unsigned Op : {G_LOAD, G_STORE}) {
1412 const bool IsStore = Op == G_STORE;
1413
1414 auto &Actions = getActionDefinitionsBuilder(Opcode: Op);
1415 // Explicitly list some common cases.
1416 // TODO: Does this help compile time at all?
1417 Actions.legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S32, .Align: GlobalAlign32},
1418 {.Type0: V2S32, .Type1: GlobalPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1419 {.Type0: V4S32, .Type1: GlobalPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1420 {.Type0: S64, .Type1: GlobalPtr, .MemTy: S64, .Align: GlobalAlign32},
1421 {.Type0: V2S64, .Type1: GlobalPtr, .MemTy: V2S64, .Align: GlobalAlign32},
1422 {.Type0: V2S16, .Type1: GlobalPtr, .MemTy: V2S16, .Align: GlobalAlign32},
1423 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: GlobalAlign8},
1424 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: GlobalAlign16},
1425
1426 {.Type0: S32, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1427 {.Type0: S64, .Type1: LocalPtr, .MemTy: S64, .Align: 32},
1428 {.Type0: V2S32, .Type1: LocalPtr, .MemTy: V2S32, .Align: 32},
1429 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1430 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1431 {.Type0: V2S16, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1432
1433 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1434 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1435 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1436 {.Type0: V2S16, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1437
1438 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S32, .Align: GlobalAlign32},
1439 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1440 {.Type0: V4S32, .Type1: ConstantPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1441 {.Type0: S64, .Type1: ConstantPtr, .MemTy: S64, .Align: GlobalAlign32},
1442 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32}});
1443 Actions.legalIf(
1444 Predicate: [=](const LegalityQuery &Query) -> bool {
1445 return isLoadStoreLegal(ST, Query);
1446 });
1447
1448 // The custom pointers (fat pointers, buffer resources) don't work with load
1449 // and store at this level. Fat pointers should have been lowered to
1450 // intrinsics before the translation to MIR.
1451 Actions.unsupportedIf(
1452 Predicate: typeInSet(TypeIdx: 1, TypesInit: {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1453
1454 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1455 // ptrtoint. This is needed to account for the fact that we can't have i128
1456 // as a register class for SelectionDAG reasons.
1457 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1458 return hasBufferRsrcWorkaround(Ty: Query.Types[0]);
1459 });
1460
1461 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1462 // 64-bits.
1463 //
1464 // TODO: Should generalize bitcast action into coerce, which will also cover
1465 // inserting addrspacecasts.
1466 Actions.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1467
1468 // Turn any illegal element vectors into something easier to deal
1469 // with. These will ultimately produce 32-bit scalar shifts to extract the
1470 // parts anyway.
1471 //
1472 // For odd 16-bit element vectors, prefer to split those into pieces with
1473 // 16-bit vector parts.
1474 Actions.bitcastIf(
1475 Predicate: [=](const LegalityQuery &Query) -> bool {
1476 return shouldBitcastLoadStoreType(ST, Ty: Query.Types[0],
1477 MemTy: Query.MMODescrs[0].MemoryTy);
1478 }, Mutation: bitcastToRegisterType(TypeIdx: 0));
1479
1480 if (!IsStore) {
1481 // Widen suitably aligned loads by loading extra bytes. The standard
1482 // legalization actions can't properly express widening memory operands.
1483 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1484 return shouldWidenLoad(ST, Query, Opcode: G_LOAD);
1485 });
1486 }
1487
1488 // FIXME: load/store narrowing should be moved to lower action
1489 Actions
1490 .narrowScalarIf(
1491 Predicate: [=](const LegalityQuery &Query) -> bool {
1492 return !Query.Types[0].isVector() &&
1493 needToSplitMemOp(Query, Op == G_LOAD);
1494 },
1495 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1496 const LLT DstTy = Query.Types[0];
1497 const LLT PtrTy = Query.Types[1];
1498
1499 const unsigned DstSize = DstTy.getSizeInBits();
1500 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1501
1502 // Split extloads.
1503 if (DstSize > MemSize)
1504 return std::pair(0, LLT::scalar(SizeInBits: MemSize));
1505
1506 unsigned MaxSize = maxSizeForAddrSpace(
1507 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1508 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1509 if (MemSize > MaxSize)
1510 return std::pair(0, LLT::scalar(SizeInBits: MaxSize));
1511
1512 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1513 return std::pair(0, LLT::scalar(SizeInBits: Align));
1514 })
1515 .fewerElementsIf(
1516 Predicate: [=](const LegalityQuery &Query) -> bool {
1517 return Query.Types[0].isVector() &&
1518 needToSplitMemOp(Query, Op == G_LOAD);
1519 },
1520 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1521 const LLT DstTy = Query.Types[0];
1522 const LLT PtrTy = Query.Types[1];
1523
1524 LLT EltTy = DstTy.getElementType();
1525 unsigned MaxSize = maxSizeForAddrSpace(
1526 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1527 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1528
1529 // FIXME: Handle widened to power of 2 results better. This ends
1530 // up scalarizing.
1531 // FIXME: 3 element stores scalarized on SI
1532
1533 // Split if it's too large for the address space.
1534 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1535 if (MemSize > MaxSize) {
1536 unsigned NumElts = DstTy.getNumElements();
1537 unsigned EltSize = EltTy.getSizeInBits();
1538
1539 if (MaxSize % EltSize == 0) {
1540 return std::pair(
1541 0, LLT::scalarOrVector(
1542 EC: ElementCount::getFixed(MinVal: MaxSize / EltSize), ScalarTy: EltTy));
1543 }
1544
1545 unsigned NumPieces = MemSize / MaxSize;
1546
1547 // FIXME: Refine when odd breakdowns handled
1548 // The scalars will need to be re-legalized.
1549 if (NumPieces == 1 || NumPieces >= NumElts ||
1550 NumElts % NumPieces != 0)
1551 return std::pair(0, EltTy);
1552
1553 return std::pair(0,
1554 LLT::fixed_vector(NumElements: NumElts / NumPieces, ScalarTy: EltTy));
1555 }
1556
1557 // FIXME: We could probably handle weird extending loads better.
1558 if (DstTy.getSizeInBits() > MemSize)
1559 return std::pair(0, EltTy);
1560
1561 unsigned EltSize = EltTy.getSizeInBits();
1562 unsigned DstSize = DstTy.getSizeInBits();
1563 if (!isPowerOf2_32(Value: DstSize)) {
1564 // We're probably decomposing an odd sized store. Try to split
1565 // to the widest type. TODO: Account for alignment. As-is it
1566 // should be OK, since the new parts will be further legalized.
1567 unsigned FloorSize = llvm::bit_floor(Value: DstSize);
1568 return std::pair(
1569 0, LLT::scalarOrVector(
1570 EC: ElementCount::getFixed(MinVal: FloorSize / EltSize), ScalarTy: EltTy));
1571 }
1572
1573 // May need relegalization for the scalars.
1574 return std::pair(0, EltTy);
1575 })
1576 .minScalar(TypeIdx: 0, Ty: S32)
1577 .narrowScalarIf(Predicate: isWideScalarExtLoadTruncStore(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: S32))
1578 .widenScalarToNextPow2(TypeIdx: 0)
1579 .moreElementsIf(Predicate: vectorSmallerThan(TypeIdx: 0, Size: 32), Mutation: moreEltsToNext32Bit(TypeIdx: 0))
1580 .lower();
1581 }
1582
1583 // FIXME: Unaligned accesses not lowered.
1584 auto &ExtLoads = getActionDefinitionsBuilder(Opcodes: {G_SEXTLOAD, G_ZEXTLOAD})
1585 .legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: 8},
1586 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: 2 * 8},
1587 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1588 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1589 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1590 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1591 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S8, .Align: 8},
1592 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S16, .Align: 2 * 8}})
1593 .legalIf(
1594 Predicate: [=](const LegalityQuery &Query) -> bool {
1595 return isLoadStoreLegal(ST, Query);
1596 });
1597
1598 if (ST.hasFlatAddressSpace()) {
1599 ExtLoads.legalForTypesWithMemDesc(
1600 TypesAndMemDesc: {{.Type0: S32, .Type1: FlatPtr, .MemTy: S8, .Align: 8}, {.Type0: S32, .Type1: FlatPtr, .MemTy: S16, .Align: 16}});
1601 }
1602
1603 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1604 // 64-bits.
1605 //
1606 // TODO: Should generalize bitcast action into coerce, which will also cover
1607 // inserting addrspacecasts.
1608 ExtLoads.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1609
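// Anything else is clamped to a 32-bit result and lowered; for an extending
// load the lowering is expected (roughly) to be a plain load of the memory
// type followed by the matching sign/zero extension, which is then
// re-legalized.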
1610 ExtLoads.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1611 .widenScalarToNextPow2(TypeIdx: 0)
1612 .lower();
1613
1614 auto &Atomics = getActionDefinitionsBuilder(
1615 Opcodes: {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1616 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1617 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1618 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1619 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr},
1620 {S64, GlobalPtr}, {S64, LocalPtr},
1621 {S32, RegionPtr}, {S64, RegionPtr}});
1622 if (ST.hasFlatAddressSpace()) {
1623 Atomics.legalFor(Types: {{S32, FlatPtr}, {S64, FlatPtr}});
1624 }
1625
1626 auto &Atomic = getActionDefinitionsBuilder(Opcode: G_ATOMICRMW_FADD);
1627 if (ST.hasLDSFPAtomicAddF32()) {
1628 Atomic.legalFor(Types: {{S32, LocalPtr}, {S32, RegionPtr}});
1629 if (ST.hasLdsAtomicAddF64())
1630 Atomic.legalFor(Types: {{S64, LocalPtr}});
1631 if (ST.hasAtomicDsPkAdd16Insts())
1632 Atomic.legalFor(Types: {{V2S16, LocalPtr}});
1633 }
1634 if (ST.hasAtomicFaddInsts())
1635 Atomic.legalFor(Types: {{S32, GlobalPtr}});
1636 if (ST.hasFlatAtomicFaddF32Inst())
1637 Atomic.legalFor(Types: {{S32, FlatPtr}});
1638
1639 if (ST.hasGFX90AInsts()) {
1640 // These are legal with some caveats, and should have undergone expansion in
1641 // the IR in most situations
1642 // TODO: Move atomic expansion into legalizer
1643 Atomic.legalFor(Types: {
1644 {S32, GlobalPtr},
1645 {S64, GlobalPtr},
1646 {S64, FlatPtr}
1647 });
1648 }
1649
1650 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
1651 // output demarshalling.
1652 getActionDefinitionsBuilder(Opcode: G_ATOMIC_CMPXCHG)
1653 .customFor(Types: {{S32, GlobalPtr}, {S64, GlobalPtr},
1654 {S32, FlatPtr}, {S64, FlatPtr}})
1655 .legalFor(Types: {{S32, LocalPtr}, {S64, LocalPtr},
1656 {S32, RegionPtr}, {S64, RegionPtr}});
1657 // TODO: Pointer types, any 32-bit or 64-bit vector
1658
1659 // Condition should be s32 for scalar, s1 for vector.
1660 getActionDefinitionsBuilder(Opcode: G_SELECT)
1661 .legalForCartesianProduct(Types0: {S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1662 LocalPtr, FlatPtr, PrivatePtr,
1663 LLT::fixed_vector(NumElements: 2, ScalarTy: LocalPtr),
1664 LLT::fixed_vector(NumElements: 2, ScalarTy: PrivatePtr)},
1665 Types1: {S1, S32})
1666 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1667 .scalarize(TypeIdx: 1)
1668 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1669 .fewerElementsIf(Predicate: numElementsNotEven(TypeIdx: 0), Mutation: scalarize(TypeIdx: 0))
1670 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 2)
1671 .clampMaxNumElements(TypeIdx: 0, EltTy: LocalPtr, MaxElements: 2)
1672 .clampMaxNumElements(TypeIdx: 0, EltTy: PrivatePtr, MaxElements: 2)
1673 .scalarize(TypeIdx: 0)
1674 .widenScalarToNextPow2(TypeIdx: 0)
1675 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: typeInSet(TypeIdx: 1, TypesInit: {S1, S32})));
1676
1677 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1678 // be more flexible with the shift amount type.
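// (For example, a 32-bit shift only observes the low 5 bits of the amount, so
// shifting by 33 behaves like shifting by 1; a 16-bit amount type would
// therefore be wide enough for every value type.)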
1679 auto &Shifts = getActionDefinitionsBuilder(Opcodes: {G_SHL, G_LSHR, G_ASHR})
1680 .legalFor(Types: {{S32, S32}, {S64, S32}});
1681 if (ST.has16BitInsts()) {
1682 if (ST.hasVOP3PInsts()) {
1683 Shifts.legalFor(Types: {{S16, S16}, {V2S16, V2S16}})
1684 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1685 } else
1686 Shifts.legalFor(Types: {{S16, S16}});
1687
1688 // TODO: Support 16-bit shift amounts for all types
1689 Shifts.widenScalarIf(
1690 Predicate: [=](const LegalityQuery &Query) {
1691 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1692 // 32-bit amount.
1693 const LLT ValTy = Query.Types[0];
1694 const LLT AmountTy = Query.Types[1];
1695 return ValTy.getSizeInBits() <= 16 &&
1696 AmountTy.getSizeInBits() < 16;
1697 }, Mutation: changeTo(TypeIdx: 1, Ty: S16));
1698 Shifts.maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16);
1699 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1700 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 16);
1701 Shifts.clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1702
1703 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1704 .minScalar(TypeIdx: 0, Ty: S16)
1705 .scalarize(TypeIdx: 0)
1706 .lower();
1707 } else {
1708 // Make sure we legalize the shift amount type first, as the general
1709 // expansion for the shifted type will produce much worse code if it hasn't
1710 // been truncated already.
1711 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1712 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1713 Shifts.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1714
1715 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1716 .minScalar(TypeIdx: 0, Ty: S32)
1717 .scalarize(TypeIdx: 0)
1718 .lower();
1719 }
1720 Shifts.scalarize(TypeIdx: 0);
1721
1722 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1723 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1724 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1725 unsigned IdxTypeIdx = 2;
1726
1727 getActionDefinitionsBuilder(Opcode: Op)
1728 .customIf(Predicate: [=](const LegalityQuery &Query) {
1729 const LLT EltTy = Query.Types[EltTypeIdx];
1730 const LLT VecTy = Query.Types[VecTypeIdx];
1731 const LLT IdxTy = Query.Types[IdxTypeIdx];
1732 const unsigned EltSize = EltTy.getSizeInBits();
1733 const bool isLegalVecType =
1734 !!SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: VecTy.getSizeInBits());
1735 // Address space 8 pointers are 128-bit wide values, but the logic
1736 // below will try to bitcast them to 2N x s64, which will fail.
1737 // Therefore, as an intermediate step, wrap the extract/insert by
1738 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1739 // extraction result) in order to produce a vector operation that can
1740 // be handled by the logic below.
1741 if (EltTy.isPointer() && EltSize > 64)
1742 return true;
1743 return (EltSize == 32 || EltSize == 64) &&
1744 VecTy.getSizeInBits() % 32 == 0 &&
1745 VecTy.getSizeInBits() <= MaxRegisterSize &&
1746 IdxTy.getSizeInBits() == 32 &&
1747 isLegalVecType;
1748 })
1749 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx), P1: scalarOrEltNarrowerThan(TypeIdx: VecTypeIdx, Size: 32)),
1750 Mutation: bitcastToVectorElement32(TypeIdx: VecTypeIdx))
1751 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1752 .bitcastIf(
1753 Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx), P1: scalarOrEltWiderThan(TypeIdx: VecTypeIdx, Size: 64)),
1754 Mutation: [=](const LegalityQuery &Query) {
1755 // For > 64-bit element types, try to turn this into a 64-bit
1756 // element vector since we may be able to do better indexing
1757 // if this is scalar. If not, fall back to 32.
1758 const LLT EltTy = Query.Types[EltTypeIdx];
1759 const LLT VecTy = Query.Types[VecTypeIdx];
1760 const unsigned DstEltSize = EltTy.getSizeInBits();
1761 const unsigned VecSize = VecTy.getSizeInBits();
1762
1763 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1764 return std::pair(
1765 VecTypeIdx,
1766 LLT::fixed_vector(NumElements: VecSize / TargetEltSize, ScalarSizeInBits: TargetEltSize));
1767 })
1768 .clampScalar(TypeIdx: EltTypeIdx, MinTy: S32, MaxTy: S64)
1769 .clampScalar(TypeIdx: VecTypeIdx, MinTy: S32, MaxTy: S64)
1770 .clampScalar(TypeIdx: IdxTypeIdx, MinTy: S32, MaxTy: S32)
1771 .clampMaxNumElements(TypeIdx: VecTypeIdx, EltTy: S32, MaxElements: 32)
1772 // TODO: Clamp elements for 64-bit vectors?
1773 .moreElementsIf(
1774 Predicate: isIllegalRegisterType(TypeIdx: VecTypeIdx),
1775 Mutation: moreElementsToNextExistingRegClass(TypeIdx: VecTypeIdx))
1776 // It should only be necessary with variable indexes.
1777 // As a last resort, lower to the stack
1778 .lower();
1779 }
1780
1781 getActionDefinitionsBuilder(Opcode: G_EXTRACT_VECTOR_ELT)
1782 .unsupportedIf(Predicate: [=](const LegalityQuery &Query) {
1783 const LLT &EltTy = Query.Types[1].getElementType();
1784 return Query.Types[0] != EltTy;
1785 });
1786
1787 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1788 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1789 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1790
1791 // FIXME: Doesn't handle extract of illegal sizes.
1792 getActionDefinitionsBuilder(Opcode: Op)
1793 .lowerIf(Predicate: all(P0: typeIs(TypeIdx: LitTyIdx, TypesInit: S16), P1: sizeIs(TypeIdx: BigTyIdx, Size: 32)))
1794 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
1795 // Sub-vector (or single element) insert and extract.
1796 // TODO: verify immediate offset here since lower only works with
1797 // whole elements.
1798 const LLT BigTy = Query.Types[BigTyIdx];
1799 return BigTy.isVector();
1800 })
1801 // FIXME: Multiples of 16 should not be legal.
1802 .legalIf(Predicate: [=](const LegalityQuery &Query) {
1803 const LLT BigTy = Query.Types[BigTyIdx];
1804 const LLT LitTy = Query.Types[LitTyIdx];
1805 return (BigTy.getSizeInBits() % 32 == 0) &&
1806 (LitTy.getSizeInBits() % 16 == 0);
1807 })
1808 .widenScalarIf(
1809 Predicate: [=](const LegalityQuery &Query) {
1810 const LLT BigTy = Query.Types[BigTyIdx];
1811 return (BigTy.getScalarSizeInBits() < 16);
1812 },
1813 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: BigTyIdx, Min: 16))
1814 .widenScalarIf(
1815 Predicate: [=](const LegalityQuery &Query) {
1816 const LLT LitTy = Query.Types[LitTyIdx];
1817 return (LitTy.getScalarSizeInBits() < 16);
1818 },
1819 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: LitTyIdx, Min: 16))
1820 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1821 .widenScalarToNextPow2(TypeIdx: BigTyIdx, MinSize: 32);
1822
1823 }
1824
1825 auto &BuildVector = getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR)
1826 .legalForCartesianProduct(Types0: AllS32Vectors, Types1: {S32})
1827 .legalForCartesianProduct(Types0: AllS64Vectors, Types1: {S64})
1828 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
1829 .clampNumElements(TypeIdx: 0, MinTy: V2S64, MaxTy: V16S64)
1830 .fewerElementsIf(Predicate: isWideVec16(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: V2S16))
1831 .moreElementsIf(
1832 Predicate: isIllegalRegisterType(TypeIdx: 0),
1833 Mutation: moreElementsToNextExistingRegClass(TypeIdx: 0));
1834
1835 if (ST.hasScalarPackInsts()) {
1836 BuildVector
1837 // FIXME: Should probably widen s1 vectors straight to s32
1838 .minScalarOrElt(TypeIdx: 0, Ty: S16)
1839 .minScalar(TypeIdx: 1, Ty: S16);
1840
1841 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1842 .legalFor(Types: {V2S16, S32})
1843 .lower();
1844 } else {
1845 BuildVector.customFor(Types: {V2S16, S16});
1846 BuildVector.minScalarOrElt(TypeIdx: 0, Ty: S32);
1847
1848 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1849 .customFor(Types: {V2S16, S32})
1850 .lower();
1851 }
1852
1853 BuildVector.legalIf(Predicate: isRegisterType(TypeIdx: 0));
1854
1855 // FIXME: Clamp maximum size
1856 getActionDefinitionsBuilder(Opcode: G_CONCAT_VECTORS)
1857 .legalIf(Predicate: all(P0: isRegisterType(TypeIdx: 0), P1: isRegisterType(TypeIdx: 1)))
1858 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 32)
1859 .clampMaxNumElements(TypeIdx: 1, EltTy: S16, MaxElements: 2) // TODO: Make 4?
1860 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 64);
1861
1862 getActionDefinitionsBuilder(Opcode: G_SHUFFLE_VECTOR).lower();
1863
1864 // Merge/Unmerge
1865 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1866 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1867 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1868
1869 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1870 const LLT Ty = Query.Types[TypeIdx];
1871 if (Ty.isVector()) {
1872 const LLT &EltTy = Ty.getElementType();
1873 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1874 return true;
1875 if (!llvm::has_single_bit<uint32_t>(Value: EltTy.getSizeInBits()))
1876 return true;
1877 }
1878 return false;
1879 };
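// (That is, vector element types must have power-of-2 sizes between 8 and 512
// bits; anything else is scalarized by the rules below.)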
1880
1881 auto &Builder = getActionDefinitionsBuilder(Opcode: Op)
1882 .legalIf(Predicate: all(P0: isRegisterType(TypeIdx: 0), P1: isRegisterType(TypeIdx: 1)))
1883 .lowerFor(Types: {{S16, V2S16}})
1884 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
1885 const LLT BigTy = Query.Types[BigTyIdx];
1886 return BigTy.getSizeInBits() == 32;
1887 })
1888 // Try to widen to s16 first for small types.
1889 // TODO: Only do this on targets with legal s16 shifts
1890 .minScalarOrEltIf(Predicate: scalarNarrowerThan(TypeIdx: LitTyIdx, Size: 16), TypeIdx: LitTyIdx, Ty: S16)
1891 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 16)
1892 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1893 .fewerElementsIf(Predicate: all(P0: typeIs(TypeIdx: 0, TypesInit: S16), P1: vectorWiderThan(TypeIdx: 1, Size: 32),
1894 args: elementTypeIs(TypeIdx: 1, EltTy: S16)),
1895 Mutation: changeTo(TypeIdx: 1, Ty: V2S16))
1896 // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1897 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1898 // valid.
1899 .clampScalar(TypeIdx: LitTyIdx, MinTy: S32, MaxTy: S512)
1900 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 32)
1901 // Break up vectors with weird elements into scalars
1902 .fewerElementsIf(
1903 Predicate: [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1904 Mutation: scalarize(TypeIdx: 0))
1905 .fewerElementsIf(
1906 Predicate: [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1907 Mutation: scalarize(TypeIdx: 1))
1908 .clampScalar(TypeIdx: BigTyIdx, MinTy: S32, MaxTy: MaxScalar);
1909
1910 if (Op == G_MERGE_VALUES) {
1911 Builder.widenScalarIf(
1912 // TODO: Use 16-bit shifts if legal for 8-bit values?
1913 Predicate: [=](const LegalityQuery &Query) {
1914 const LLT Ty = Query.Types[LitTyIdx];
1915 return Ty.getSizeInBits() < 32;
1916 },
1917 Mutation: changeTo(TypeIdx: LitTyIdx, Ty: S32));
1918 }
1919
1920 Builder.widenScalarIf(
1921 Predicate: [=](const LegalityQuery &Query) {
1922 const LLT Ty = Query.Types[BigTyIdx];
1923 return Ty.getSizeInBits() % 16 != 0;
1924 },
1925 Mutation: [=](const LegalityQuery &Query) {
1926 // Pick the next power of 2, or a multiple of 64 once past 128,
1927 // whichever is smaller.
1928 const LLT &Ty = Query.Types[BigTyIdx];
1929 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Value: Ty.getSizeInBits() + 1);
1930 if (NewSizeInBits >= 256) {
1931 unsigned RoundedTo = alignTo<64>(Value: Ty.getSizeInBits() + 1);
1932 if (RoundedTo < NewSizeInBits)
1933 NewSizeInBits = RoundedTo;
1934 }
1935 return std::pair(BigTyIdx, LLT::scalar(SizeInBits: NewSizeInBits));
1936 })
1937 // Any vectors left are the wrong size. Scalarize them.
1938 .scalarize(TypeIdx: 0)
1939 .scalarize(TypeIdx: 1);
1940 }
1941
1942 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1943 // RegBankSelect.
1944 auto &SextInReg = getActionDefinitionsBuilder(Opcode: G_SEXT_INREG)
1945 .legalFor(Types: {{S32}, {S64}});
1946
1947 if (ST.hasVOP3PInsts()) {
1948 SextInReg.lowerFor(Types: {{V2S16}})
1949 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1950 // get more vector shift opportunities, since we'll get those when
1951 // expanded.
1952 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
1953 } else if (ST.has16BitInsts()) {
1954 SextInReg.lowerFor(Types: {{S32}, {S64}, {S16}});
1955 } else {
1956 // Prefer to promote to s32 before lowering if we don't have 16-bit
1957 // shifts. This avoids a lot of intermediate truncate and extend operations.
1958 SextInReg.lowerFor(Types: {{S32}, {S64}});
1959 }
1960
1961 SextInReg
1962 .scalarize(TypeIdx: 0)
1963 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1964 .lower();
1965
1966 getActionDefinitionsBuilder(Opcodes: {G_ROTR, G_ROTL})
1967 .scalarize(TypeIdx: 0)
1968 .lower();
1969
1970 // TODO: Only try to form v2s16 with legal packed instructions.
1971 getActionDefinitionsBuilder(Opcode: G_FSHR)
1972 .legalFor(Types: {{S32, S32}})
1973 .lowerFor(Types: {{V2S16, V2S16}})
1974 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1975 .scalarize(TypeIdx: 0)
1976 .lower();
1977
1978 if (ST.hasVOP3PInsts()) {
1979 getActionDefinitionsBuilder(Opcode: G_FSHL)
1980 .lowerFor(Types: {{V2S16, V2S16}})
1981 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1982 .scalarize(TypeIdx: 0)
1983 .lower();
1984 } else {
1985 getActionDefinitionsBuilder(Opcode: G_FSHL)
1986 .scalarize(TypeIdx: 0)
1987 .lower();
1988 }
1989
1990 getActionDefinitionsBuilder(Opcode: G_READCYCLECOUNTER)
1991 .legalFor(Types: {S64});
1992
1993 getActionDefinitionsBuilder(Opcode: G_READSTEADYCOUNTER).legalFor(Types: {S64});
1994
1995 getActionDefinitionsBuilder(Opcode: G_FENCE)
1996 .alwaysLegal();
1997
1998 getActionDefinitionsBuilder(Opcodes: {G_SMULO, G_UMULO})
1999 .scalarize(TypeIdx: 0)
2000 .minScalar(TypeIdx: 0, Ty: S32)
2001 .lower();
2002
2003 getActionDefinitionsBuilder(Opcodes: {G_SBFX, G_UBFX})
2004 .legalFor(Types: {{S32, S32}, {S64, S32}})
2005 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
2006 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2007 .widenScalarToNextPow2(TypeIdx: 0)
2008 .scalarize(TypeIdx: 0);
2009
2010 getActionDefinitionsBuilder(
2011 Opcodes: {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2012 G_FCOPYSIGN,
2013
2014 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2015 G_READ_REGISTER, G_WRITE_REGISTER,
2016
2017 G_SADDO, G_SSUBO})
2018 .lower();
2019
2020 if (ST.hasIEEEMinMax()) {
2021 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2022 .legalFor(Types: FPTypesPK16)
2023 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
2024 .scalarize(TypeIdx: 0);
2025 } else {
2026 // TODO: Implement
2027 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM}).lower();
2028 }
2029
2030 getActionDefinitionsBuilder(Opcodes: {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2031 .lower();
2032
2033 getActionDefinitionsBuilder(Opcodes: {G_TRAP, G_DEBUGTRAP}).custom();
2034
2035 getActionDefinitionsBuilder(Opcodes: {G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2036 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2037 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2038 .unsupported();
2039
2040 getActionDefinitionsBuilder(Opcode: G_PREFETCH).alwaysLegal();
2041
2042 getLegacyLegalizerInfo().computeTables();
2043 verify(*ST.getInstrInfo());
2044}
2045
2046bool AMDGPULegalizerInfo::legalizeCustom(
2047 LegalizerHelper &Helper, MachineInstr &MI,
2048 LostDebugLocObserver &LocObserver) const {
2049 MachineIRBuilder &B = Helper.MIRBuilder;
2050 MachineRegisterInfo &MRI = *B.getMRI();
2051
2052 switch (MI.getOpcode()) {
2053 case TargetOpcode::G_ADDRSPACE_CAST:
2054 return legalizeAddrSpaceCast(MI, MRI, B);
2055 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2056 return legalizeFroundeven(MI, MRI, B);
2057 case TargetOpcode::G_FCEIL:
2058 return legalizeFceil(MI, MRI, B);
2059 case TargetOpcode::G_FREM:
2060 return legalizeFrem(MI, MRI, B);
2061 case TargetOpcode::G_INTRINSIC_TRUNC:
2062 return legalizeIntrinsicTrunc(MI, MRI, B);
2063 case TargetOpcode::G_SITOFP:
2064 return legalizeITOFP(MI, MRI, B, Signed: true);
2065 case TargetOpcode::G_UITOFP:
2066 return legalizeITOFP(MI, MRI, B, Signed: false);
2067 case TargetOpcode::G_FPTOSI:
2068 return legalizeFPTOI(MI, MRI, B, Signed: true);
2069 case TargetOpcode::G_FPTOUI:
2070 return legalizeFPTOI(MI, MRI, B, Signed: false);
2071 case TargetOpcode::G_FMINNUM:
2072 case TargetOpcode::G_FMAXNUM:
2073 case TargetOpcode::G_FMINNUM_IEEE:
2074 case TargetOpcode::G_FMAXNUM_IEEE:
2075 return legalizeMinNumMaxNum(Helper, MI);
2076 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2077 return legalizeExtractVectorElt(MI, MRI, B);
2078 case TargetOpcode::G_INSERT_VECTOR_ELT:
2079 return legalizeInsertVectorElt(MI, MRI, B);
2080 case TargetOpcode::G_FSIN:
2081 case TargetOpcode::G_FCOS:
2082 return legalizeSinCos(MI, MRI, B);
2083 case TargetOpcode::G_GLOBAL_VALUE:
2084 return legalizeGlobalValue(MI, MRI, B);
2085 case TargetOpcode::G_LOAD:
2086 case TargetOpcode::G_SEXTLOAD:
2087 case TargetOpcode::G_ZEXTLOAD:
2088 return legalizeLoad(Helper, MI);
2089 case TargetOpcode::G_STORE:
2090 return legalizeStore(Helper, MI);
2091 case TargetOpcode::G_FMAD:
2092 return legalizeFMad(MI, MRI, B);
2093 case TargetOpcode::G_FDIV:
2094 return legalizeFDIV(MI, MRI, B);
2095 case TargetOpcode::G_FFREXP:
2096 return legalizeFFREXP(MI, MRI, B);
2097 case TargetOpcode::G_FSQRT:
2098 return legalizeFSQRT(MI, MRI, B);
2099 case TargetOpcode::G_UDIV:
2100 case TargetOpcode::G_UREM:
2101 case TargetOpcode::G_UDIVREM:
2102 return legalizeUnsignedDIV_REM(MI, MRI, B);
2103 case TargetOpcode::G_SDIV:
2104 case TargetOpcode::G_SREM:
2105 case TargetOpcode::G_SDIVREM:
2106 return legalizeSignedDIV_REM(MI, MRI, B);
2107 case TargetOpcode::G_ATOMIC_CMPXCHG:
2108 return legalizeAtomicCmpXChg(MI, MRI, B);
2109 case TargetOpcode::G_FLOG2:
2110 return legalizeFlog2(MI, B);
2111 case TargetOpcode::G_FLOG:
2112 case TargetOpcode::G_FLOG10:
2113 return legalizeFlogCommon(MI, B);
2114 case TargetOpcode::G_FEXP2:
2115 return legalizeFExp2(MI, B);
2116 case TargetOpcode::G_FEXP:
2117 case TargetOpcode::G_FEXP10:
2118 return legalizeFExp(MI, B);
2119 case TargetOpcode::G_FPOW:
2120 return legalizeFPow(MI, B);
2121 case TargetOpcode::G_FFLOOR:
2122 return legalizeFFloor(MI, MRI, B);
2123 case TargetOpcode::G_BUILD_VECTOR:
2124 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2125 return legalizeBuildVector(MI, MRI, B);
2126 case TargetOpcode::G_MUL:
2127 return legalizeMul(Helper, MI);
2128 case TargetOpcode::G_CTLZ:
2129 case TargetOpcode::G_CTTZ:
2130 return legalizeCTLZ_CTTZ(MI, MRI, B);
2131 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2132 return legalizeFPTruncRound(MI, B);
2133 case TargetOpcode::G_STACKSAVE:
2134 return legalizeStackSave(MI, B);
2135 case TargetOpcode::G_GET_FPENV:
2136 return legalizeGetFPEnv(MI, MRI, B);
2137 case TargetOpcode::G_SET_FPENV:
2138 return legalizeSetFPEnv(MI, MRI, B);
2139 case TargetOpcode::G_TRAP:
2140 return legalizeTrap(MI, MRI, B);
2141 case TargetOpcode::G_DEBUGTRAP:
2142 return legalizeDebugTrap(MI, MRI, B);
2143 default:
2144 return false;
2145 }
2146
2147 llvm_unreachable("expected switch to return");
2148}
2149
2150Register AMDGPULegalizerInfo::getSegmentAperture(
2151 unsigned AS,
2152 MachineRegisterInfo &MRI,
2153 MachineIRBuilder &B) const {
2154 MachineFunction &MF = B.getMF();
2155 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2156 const LLT S32 = LLT::scalar(SizeInBits: 32);
2157 const LLT S64 = LLT::scalar(SizeInBits: 64);
2158
2159 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2160
2161 if (ST.hasApertureRegs()) {
2162 // Note: this register is somewhat broken. When used as a 32-bit operand,
2163 // it only returns zeroes. The real value is in the upper 32 bits.
2164 // Thus, we must emit an extract of the high 32 bits.
2165 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2166 ? AMDGPU::SRC_SHARED_BASE
2167 : AMDGPU::SRC_PRIVATE_BASE;
2168 // FIXME: It would be more natural to emit a COPY here, but then copy
2169 // coalescing would kick in and it would think it's okay to use the "HI"
2170 // subregister (instead of extracting the HI 32 bits) which is an artificial
2171 // (unusable) register.
2172 // Register TableGen definitions would need an overhaul to get rid of the
2173 // artificial "HI" aperture registers and prevent this kind of issue from
2174 // happening.
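// Hence the sequence below: materialize the full 64-bit aperture register with
// s_mov_b64 into a fresh SReg_64, then unmerge it and keep only the high
// 32-bit half.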
2175 Register Dst = MRI.createGenericVirtualRegister(Ty: S64);
2176 MRI.setRegClass(Reg: Dst, RC: &AMDGPU::SReg_64RegClass);
2177 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2178 return B.buildUnmerge(Res: S32, Op: Dst).getReg(Idx: 1);
2179 }
2180
2181 // TODO: can we be smarter about machine pointer info?
2182 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2183 Register LoadAddr = MRI.createGenericVirtualRegister(
2184 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2185 // For code object version 5, private_base and shared_base are passed through
2186 // implicit kernargs.
2187 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
2188 AMDGPU::AMDHSA_COV5) {
2189 AMDGPUTargetLowering::ImplicitParameter Param =
2190 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2191 : AMDGPUTargetLowering::PRIVATE_BASE;
2192 uint64_t Offset =
2193 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
2194
2195 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2196 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2197
2198 if (!loadInputValue(DstReg: KernargPtrReg, B,
2199 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2200 return Register();
2201
2202 MachineMemOperand *MMO = MF.getMachineMemOperand(
2203 PtrInfo,
2204 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2205 MachineMemOperand::MOInvariant,
2206 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset));
2207
2208 // Pointer address
2209 B.buildPtrAdd(Res: LoadAddr, Op0: KernargPtrReg,
2210 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
2211 // Load address
2212 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2213 }
2214
2215 Register QueuePtr = MRI.createGenericVirtualRegister(
2216 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2217
2218 if (!loadInputValue(DstReg: QueuePtr, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
2219 return Register();
2220
2221 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2222 // private_segment_aperture_base_hi.
2223 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2224
2225 MachineMemOperand *MMO = MF.getMachineMemOperand(
2226 PtrInfo,
2227 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2228 MachineMemOperand::MOInvariant,
2229 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset: StructOffset));
2230
2231 B.buildPtrAdd(Res: LoadAddr, Op0: QueuePtr,
2232 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: StructOffset).getReg(Idx: 0));
2233 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2234}
2235
2236/// Return true if the value is a known valid address, such that a null check is
2237/// not necessary.
2238static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2239 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2240 MachineInstr *Def = MRI.getVRegDef(Reg: Val);
2241 switch (Def->getOpcode()) {
2242 case AMDGPU::G_FRAME_INDEX:
2243 case AMDGPU::G_GLOBAL_VALUE:
2244 case AMDGPU::G_BLOCK_ADDR:
2245 return true;
2246 case AMDGPU::G_CONSTANT: {
2247 const ConstantInt *CI = Def->getOperand(i: 1).getCImm();
2248 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2249 }
2250 default:
2251 return false;
2252 }
2253
2254 return false;
2255}
2256
2257bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2258 MachineInstr &MI, MachineRegisterInfo &MRI,
2259 MachineIRBuilder &B) const {
2260 MachineFunction &MF = B.getMF();
2261
2262 // MI can either be a G_ADDRSPACE_CAST or a
2263 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2264 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2265 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2266 Intrinsic::amdgcn_addrspacecast_nonnull));
2267
2268 const LLT S32 = LLT::scalar(SizeInBits: 32);
2269 Register Dst = MI.getOperand(i: 0).getReg();
2270 Register Src = isa<GIntrinsic>(Val: MI) ? MI.getOperand(i: 2).getReg()
2271 : MI.getOperand(i: 1).getReg();
2272 LLT DstTy = MRI.getType(Reg: Dst);
2273 LLT SrcTy = MRI.getType(Reg: Src);
2274 unsigned DestAS = DstTy.getAddressSpace();
2275 unsigned SrcAS = SrcTy.getAddressSpace();
2276
2277 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2278 // vector element.
2279 assert(!DstTy.isVector());
2280
2281 const AMDGPUTargetMachine &TM
2282 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2283
2284 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2285 MI.setDesc(B.getTII().get(Opcode: TargetOpcode::G_BITCAST));
2286 return true;
2287 }
2288
2289 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2290 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2291 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2292 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2293 // G_ADDRSPACE_CAST we need to guess.
2294 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2295 // Extract low 32-bits of the pointer.
2296 B.buildExtract(Res: Dst, Src, Index: 0);
2297 MI.eraseFromParent();
2298 return true;
2299 }
2300
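// Otherwise, guard against the flat null pointer: take the low 32 bits of the
// source, but select the destination segment's null value whenever the source
// compares equal to the flat null pointer.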
2301 unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);
2302
2303 auto SegmentNull = B.buildConstant(Res: DstTy, Val: NullVal);
2304 auto FlatNull = B.buildConstant(Res: SrcTy, Val: 0);
2305
2306 // Extract low 32-bits of the pointer.
2307 auto PtrLo32 = B.buildExtract(Res: DstTy, Src, Index: 0);
2308
2309 auto CmpRes =
2310 B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: FlatNull.getReg(Idx: 0));
2311 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: PtrLo32, Op1: SegmentNull.getReg(Idx: 0));
2312
2313 MI.eraseFromParent();
2314 return true;
2315 }
2316
2317 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2318 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2319 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2320 Register ApertureReg = getSegmentAperture(AS: SrcAS, MRI, B);
2321 if (!ApertureReg.isValid())
2322 return false;
2323
2324 // Coerce the type of the low half of the result so we can use merge_values.
2325 Register SrcAsInt = B.buildPtrToInt(Dst: S32, Src).getReg(Idx: 0);
2326
2327 // TODO: Should we allow mismatched types but matching sizes in merges to
2328 // avoid the ptrtoint?
2329 auto BuildPtr = B.buildMergeLikeInstr(Res: DstTy, Ops: {SrcAsInt, ApertureReg});
2330
2331 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2332 // G_ADDRSPACE_CAST we need to guess.
2333 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2334 B.buildCopy(Res: Dst, Op: BuildPtr);
2335 MI.eraseFromParent();
2336 return true;
2337 }
2338
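// Otherwise, compare the source against the segment's null value and select
// between the constructed flat pointer and the flat null pointer.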
2339 auto SegmentNull = B.buildConstant(Res: SrcTy, Val: TM.getNullPointerValue(AddrSpace: SrcAS));
2340 auto FlatNull = B.buildConstant(Res: DstTy, Val: TM.getNullPointerValue(AddrSpace: DestAS));
2341
2342 auto CmpRes = B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
2343 Op1: SegmentNull.getReg(Idx: 0));
2344
2345 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: BuildPtr, Op1: FlatNull);
2346
2347 MI.eraseFromParent();
2348 return true;
2349 }
2350
2351 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2352 SrcTy.getSizeInBits() == 64) {
2353 // Truncate.
2354 B.buildExtract(Res: Dst, Src, Index: 0);
2355 MI.eraseFromParent();
2356 return true;
2357 }
2358
2359 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2360 DstTy.getSizeInBits() == 64) {
2361 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2362 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2363 auto PtrLo = B.buildPtrToInt(Dst: S32, Src);
2364 auto HighAddr = B.buildConstant(Res: S32, Val: AddrHiVal);
2365 B.buildMergeLikeInstr(Res: Dst, Ops: {PtrLo, HighAddr});
2366 MI.eraseFromParent();
2367 return true;
2368 }
2369
2370 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2371 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2372
2373 LLVMContext &Ctx = MF.getFunction().getContext();
2374 Ctx.diagnose(DI: InvalidAddrSpaceCast);
2375 B.buildUndef(Res: Dst);
2376 MI.eraseFromParent();
2377 return true;
2378}
2379
2380bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2381 MachineRegisterInfo &MRI,
2382 MachineIRBuilder &B) const {
2383 Register Src = MI.getOperand(i: 1).getReg();
2384 LLT Ty = MRI.getType(Reg: Src);
2385 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2386
2387 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2388 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
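// C1 is 2^52 and C2 is the largest double below 2^52 (2^52 - 0.5). Adding and
// then subtracting 2^52, carrying the sign of the input, rounds to the nearest
// even integer, since doubles of that magnitude have no fractional bits; the
// final select passes inputs with |x| > C2 through unchanged because they are
// already integral. (The usual round-via-2^52 trick.)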
2389
2390 auto C1 = B.buildFConstant(Res: Ty, Val: C1Val);
2391 auto CopySign = B.buildFCopysign(Dst: Ty, Src0: C1, Src1: Src);
2392
2393 // TODO: Should this propagate fast-math-flags?
2394 auto Tmp1 = B.buildFAdd(Dst: Ty, Src0: Src, Src1: CopySign);
2395 auto Tmp2 = B.buildFSub(Dst: Ty, Src0: Tmp1, Src1: CopySign);
2396
2397 auto C2 = B.buildFConstant(Res: Ty, Val: C2Val);
2398 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Src);
2399
2400 auto Cond = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: C2);
2401 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: Cond, Op0: Src, Op1: Tmp2);
2402 MI.eraseFromParent();
2403 return true;
2404}
2405
2406bool AMDGPULegalizerInfo::legalizeFceil(
2407 MachineInstr &MI, MachineRegisterInfo &MRI,
2408 MachineIRBuilder &B) const {
2409
2410 const LLT S1 = LLT::scalar(SizeInBits: 1);
2411 const LLT S64 = LLT::scalar(SizeInBits: 64);
2412
2413 Register Src = MI.getOperand(i: 1).getReg();
2414 assert(MRI.getType(Src) == S64);
2415
2416 // result = trunc(src)
2417 // if (src > 0.0 && src != result)
2418 // result += 1.0
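//
// For example: ceil(1.5): trunc = 1.0, and since 1.5 > 0 and 1.5 != 1.0 the
// result becomes 2.0; ceil(-1.5): trunc = -1.0, the condition fails, and -1.0
// is already the ceiling.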
2419
2420 auto Trunc = B.buildIntrinsicTrunc(Dst: S64, Src0: Src);
2421
2422 const auto Zero = B.buildFConstant(Res: S64, Val: 0.0);
2423 const auto One = B.buildFConstant(Res: S64, Val: 1.0);
2424 auto Lt0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Src, Op1: Zero);
2425 auto NeTrunc = B.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: S1, Op0: Src, Op1: Trunc);
2426 auto And = B.buildAnd(Dst: S1, Src0: Lt0, Src1: NeTrunc);
2427 auto Add = B.buildSelect(Res: S64, Tst: And, Op0: One, Op1: Zero);
2428
2429 // TODO: Should this propagate fast-math-flags?
2430 B.buildFAdd(Dst: MI.getOperand(i: 0).getReg(), Src0: Trunc, Src1: Add);
2431 MI.eraseFromParent();
2432 return true;
2433}
2434
2435bool AMDGPULegalizerInfo::legalizeFrem(
2436 MachineInstr &MI, MachineRegisterInfo &MRI,
2437 MachineIRBuilder &B) const {
2438 Register DstReg = MI.getOperand(i: 0).getReg();
2439 Register Src0Reg = MI.getOperand(i: 1).getReg();
2440 Register Src1Reg = MI.getOperand(i: 2).getReg();
2441 auto Flags = MI.getFlags();
2442 LLT Ty = MRI.getType(Reg: DstReg);
2443
2444 auto Div = B.buildFDiv(Dst: Ty, Src0: Src0Reg, Src1: Src1Reg, Flags);
2445 auto Trunc = B.buildIntrinsicTrunc(Dst: Ty, Src0: Div, Flags);
2446 auto Neg = B.buildFNeg(Dst: Ty, Src0: Trunc, Flags);
2447 B.buildFMA(Dst: DstReg, Src0: Neg, Src1: Src1Reg, Src2: Src0Reg, Flags);
2448 MI.eraseFromParent();
2449 return true;
2450}
2451
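// Compute the unbiased exponent of an f64 given its high 32 bits: the 11
// exponent bits sit just above the 52 fraction bits, i.e. at bit 20 of the
// high word, so extract them with ubfe and subtract the exponent bias (1023).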
2452static MachineInstrBuilder extractF64Exponent(Register Hi,
2453 MachineIRBuilder &B) {
2454 const unsigned FractBits = 52;
2455 const unsigned ExpBits = 11;
2456 LLT S32 = LLT::scalar(SizeInBits: 32);
2457
2458 auto Const0 = B.buildConstant(Res: S32, Val: FractBits - 32);
2459 auto Const1 = B.buildConstant(Res: S32, Val: ExpBits);
2460
2461 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2462 .addUse(Hi)
2463 .addUse(Const0.getReg(0))
2464 .addUse(Const1.getReg(0));
2465
2466 return B.buildSub(Dst: S32, Src0: ExpPart, Src1: B.buildConstant(Res: S32, Val: 1023));
2467}
2468
2469bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2470 MachineInstr &MI, MachineRegisterInfo &MRI,
2471 MachineIRBuilder &B) const {
2472 const LLT S1 = LLT::scalar(SizeInBits: 1);
2473 const LLT S32 = LLT::scalar(SizeInBits: 32);
2474 const LLT S64 = LLT::scalar(SizeInBits: 64);
2475
2476 Register Src = MI.getOperand(i: 1).getReg();
2477 assert(MRI.getType(Src) == S64);
2478
2479 // TODO: Should this use extract since the low half is unused?
2480 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2481 Register Hi = Unmerge.getReg(Idx: 1);
2482
2483 // Extract the upper half, since this is where we will find the sign and
2484 // exponent.
2485 auto Exp = extractF64Exponent(Hi, B);
2486
2487 const unsigned FractBits = 52;
2488
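// Roughly: build a mask of the fraction bits that lie below the binary point
// (the low 52 - Exp bits) and clear them from the input; if Exp < 0 the
// magnitude is below 1 and only the sign bit survives, and if Exp > 51 the
// value is already integral, so the input is returned unchanged.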
2489 // Extract the sign bit.
2490 const auto SignBitMask = B.buildConstant(Res: S32, UINT32_C(1) << 31);
2491 auto SignBit = B.buildAnd(Dst: S32, Src0: Hi, Src1: SignBitMask);
2492
2493 const auto FractMask = B.buildConstant(Res: S64, Val: (UINT64_C(1) << FractBits) - 1);
2494
2495 const auto Zero32 = B.buildConstant(Res: S32, Val: 0);
2496
2497 // Extend back to 64-bits.
2498 auto SignBit64 = B.buildMergeLikeInstr(Res: S64, Ops: {Zero32, SignBit});
2499
2500 auto Shr = B.buildAShr(Dst: S64, Src0: FractMask, Src1: Exp);
2501 auto Not = B.buildNot(Dst: S64, Src0: Shr);
2502 auto Tmp0 = B.buildAnd(Dst: S64, Src0: Src, Src1: Not);
2503 auto FiftyOne = B.buildConstant(Res: S32, Val: FractBits - 1);
2504
2505 auto ExpLt0 = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: Exp, Op1: Zero32);
2506 auto ExpGt51 = B.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: Exp, Op1: FiftyOne);
2507
2508 auto Tmp1 = B.buildSelect(Res: S64, Tst: ExpLt0, Op0: SignBit64, Op1: Tmp0);
2509 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: ExpGt51, Op0: Src, Op1: Tmp1);
2510 MI.eraseFromParent();
2511 return true;
2512}
2513
2514bool AMDGPULegalizerInfo::legalizeITOFP(
2515 MachineInstr &MI, MachineRegisterInfo &MRI,
2516 MachineIRBuilder &B, bool Signed) const {
2517
2518 Register Dst = MI.getOperand(i: 0).getReg();
2519 Register Src = MI.getOperand(i: 1).getReg();
2520
2521 const LLT S64 = LLT::scalar(SizeInBits: 64);
2522 const LLT S32 = LLT::scalar(SizeInBits: 32);
2523
2524 assert(MRI.getType(Src) == S64);
2525
2526 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2527 auto ThirtyTwo = B.buildConstant(Res: S32, Val: 32);
2528
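// The 64-bit result is built as fp(hi) * 2^32 + fp(lo), with the low half
// treated as unsigned: convert each 32-bit half separately, scale the high
// conversion by 2^32 via ldexp, and add the two.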
2529 if (MRI.getType(Reg: Dst) == S64) {
2530 auto CvtHi = Signed ? B.buildSITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1))
2531 : B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1));
2532
2533 auto CvtLo = B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 0));
2534 auto LdExp = B.buildFLdexp(Dst: S64, Src0: CvtHi, Src1: ThirtyTwo);
2535
2536 // TODO: Should this propagate fast-math-flags?
2537 B.buildFAdd(Dst, Src0: LdExp, Src1: CvtLo);
2538 MI.eraseFromParent();
2539 return true;
2540 }
2541
2542 assert(MRI.getType(Dst) == S32);
2543
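// For a 32-bit result the input is normalized first (a rough outline): shift
// the 64-bit value left so its significant bits land in the high word (the
// signed path reserves a bit for the sign), OR a sticky bit from the low word
// into the high word, convert that word, and rescale via ldexp by
// (32 - shift amount).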
2544 auto One = B.buildConstant(Res: S32, Val: 1);
2545
2546 MachineInstrBuilder ShAmt;
2547 if (Signed) {
2548 auto ThirtyOne = B.buildConstant(Res: S32, Val: 31);
2549 auto X = B.buildXor(Dst: S32, Src0: Unmerge.getReg(Idx: 0), Src1: Unmerge.getReg(Idx: 1));
2550 auto OppositeSign = B.buildAShr(Dst: S32, Src0: X, Src1: ThirtyOne);
2551 auto MaxShAmt = B.buildAdd(Dst: S32, Src0: ThirtyTwo, Src1: OppositeSign);
2552 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2553 .addUse(Unmerge.getReg(1));
2554 auto LS2 = B.buildSub(Dst: S32, Src0: LS, Src1: One);
2555 ShAmt = B.buildUMin(Dst: S32, Src0: LS2, Src1: MaxShAmt);
2556 } else
2557 ShAmt = B.buildCTLZ(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
2558 auto Norm = B.buildShl(Dst: S64, Src0: Src, Src1: ShAmt);
2559 auto Unmerge2 = B.buildUnmerge(Res: {S32, S32}, Op: Norm);
2560 auto Adjust = B.buildUMin(Dst: S32, Src0: One, Src1: Unmerge2.getReg(Idx: 0));
2561 auto Norm2 = B.buildOr(Dst: S32, Src0: Unmerge2.getReg(Idx: 1), Src1: Adjust);
2562 auto FVal = Signed ? B.buildSITOFP(Dst: S32, Src0: Norm2) : B.buildUITOFP(Dst: S32, Src0: Norm2);
2563 auto Scale = B.buildSub(Dst: S32, Src0: ThirtyTwo, Src1: ShAmt);
2564 B.buildFLdexp(Dst, Src0: FVal, Src1: Scale);
2565 MI.eraseFromParent();
2566 return true;
2567}
2568
2569// TODO: Copied from DAG implementation. Verify logic and document how this
2570// actually works.
2571bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2572 MachineRegisterInfo &MRI,
2573 MachineIRBuilder &B,
2574 bool Signed) const {
2575
2576 Register Dst = MI.getOperand(i: 0).getReg();
2577 Register Src = MI.getOperand(i: 1).getReg();
2578
2579 const LLT S64 = LLT::scalar(SizeInBits: 64);
2580 const LLT S32 = LLT::scalar(SizeInBits: 32);
2581
2582 const LLT SrcLT = MRI.getType(Reg: Src);
2583 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2584
2585 unsigned Flags = MI.getFlags();
2586
2587 // The basic idea of converting a floating point number into a pair of 32-bit
2588 // integers is illustrated as follows:
2589 //
2590 // tf := trunc(val);
2591 // hif := floor(tf * 2^-32);
2592 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2593 // hi := fptoi(hif);
2594 // lo := fptoi(lof);
2595 //
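// As a concrete illustration (double input): for val = 2^40 + 7, tf = 2^40 + 7,
// hif = floor(tf * 2^-32) = 256 and lof = 7, so hi = 256, lo = 7, and
// (hi << 32) + lo reassembles 2^40 + 7.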
2596 auto Trunc = B.buildIntrinsicTrunc(Dst: SrcLT, Src0: Src, Flags);
2597 MachineInstrBuilder Sign;
2598 if (Signed && SrcLT == S32) {
2599 // However, a 32-bit floating point number has only a 23-bit mantissa,
2600 // which is not enough to hold all the significant bits of `lof` if val is
2601 // negative. To avoid the loss of precision, we need to take the absolute
2602 // value after truncating and flip the result back based on the original
2603 // signedness.
2604 Sign = B.buildAShr(Dst: S32, Src0: Src, Src1: B.buildConstant(Res: S32, Val: 31));
2605 Trunc = B.buildFAbs(Dst: S32, Src0: Trunc, Flags);
2606 }
2607 MachineInstrBuilder K0, K1;
2608 if (SrcLT == S64) {
2609 K0 = B.buildFConstant(
2610 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2611 K1 = B.buildFConstant(
2612 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2613 } else {
2614 K0 = B.buildFConstant(
2615 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2616 K1 = B.buildFConstant(
2617 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2618 }
2619
2620 auto Mul = B.buildFMul(Dst: SrcLT, Src0: Trunc, Src1: K0, Flags);
2621 auto FloorMul = B.buildFFloor(Dst: SrcLT, Src0: Mul, Flags);
2622 auto Fma = B.buildFMA(Dst: SrcLT, Src0: FloorMul, Src1: K1, Src2: Trunc, Flags);
2623
2624 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(Dst: S32, Src0: FloorMul)
2625 : B.buildFPTOUI(Dst: S32, Src0: FloorMul);
2626 auto Lo = B.buildFPTOUI(Dst: S32, Src0: Fma);
2627
2628 if (Signed && SrcLT == S32) {
2629 // Flip the result based on the signedness, which is either all 0s or 1s.
2630 Sign = B.buildMergeLikeInstr(Res: S64, Ops: {Sign, Sign});
2631 // r := xor({lo, hi}, sign) - sign;
2632 B.buildSub(Dst, Src0: B.buildXor(Dst: S64, Src0: B.buildMergeLikeInstr(Res: S64, Ops: {Lo, Hi}), Src1: Sign),
2633 Src1: Sign);
2634 } else
2635 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
2636 MI.eraseFromParent();
2637
2638 return true;
2639}
2640
2641bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2642 MachineInstr &MI) const {
2643 MachineFunction &MF = Helper.MIRBuilder.getMF();
2644 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2645
2646 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2647 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2648
2649 // With ieee_mode disabled, the instructions have the correct behavior
2650 // already for G_FMINNUM/G_FMAXNUM
2651 if (!MFI->getMode().IEEE)
2652 return !IsIEEEOp;
2653
2654 if (IsIEEEOp)
2655 return true;
2656
2657 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2658}
2659
2660bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2661 MachineInstr &MI, MachineRegisterInfo &MRI,
2662 MachineIRBuilder &B) const {
2663 // TODO: Should move some of this into LegalizerHelper.
2664
2665 // TODO: Promote dynamic indexing of s16 to s32
2666
2667 Register Dst = MI.getOperand(i: 0).getReg();
2668 Register Vec = MI.getOperand(i: 1).getReg();
2669
2670 LLT VecTy = MRI.getType(Reg: Vec);
2671 LLT EltTy = VecTy.getElementType();
2672 assert(EltTy == MRI.getType(Dst));
2673
2674 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2675 // but we can't go directly to that logic because you can't bitcast a vector
2676 // of pointers to a vector of integers. Therefore, introduce an intermediate
2677 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2678 // drive the legalization forward.
2679 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2680 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
2681 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
2682
2683 auto IntVec = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
2684 auto IntElt = B.buildExtractVectorElement(Res: IntTy, Val: IntVec, Idx: MI.getOperand(i: 2));
2685 B.buildIntToPtr(Dst, Src: IntElt);
2686
2687 MI.eraseFromParent();
2688 return true;
2689 }
2690
2691 // FIXME: Artifact combiner probably should have replaced the truncated
2692 // constant before this, so we shouldn't need
2693 // getIConstantVRegValWithLookThrough.
2694 std::optional<ValueAndVReg> MaybeIdxVal =
2695 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 2).getReg(), MRI);
2696 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2697 return true;
2698 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2699
2700 if (IdxVal < VecTy.getNumElements()) {
2701 auto Unmerge = B.buildUnmerge(Res: EltTy, Op: Vec);
2702 B.buildCopy(Res: Dst, Op: Unmerge.getReg(Idx: IdxVal));
2703 } else {
2704 B.buildUndef(Res: Dst);
2705 }
2706
2707 MI.eraseFromParent();
2708 return true;
2709}
2710
2711bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2712 MachineInstr &MI, MachineRegisterInfo &MRI,
2713 MachineIRBuilder &B) const {
2714 // TODO: Should move some of this into LegalizerHelper.
2715
2716 // TODO: Promote dynamic indexing of s16 to s32
2717
2718 Register Dst = MI.getOperand(i: 0).getReg();
2719 Register Vec = MI.getOperand(i: 1).getReg();
2720 Register Ins = MI.getOperand(i: 2).getReg();
2721
2722 LLT VecTy = MRI.getType(Reg: Vec);
2723 LLT EltTy = VecTy.getElementType();
2724 assert(EltTy == MRI.getType(Ins));
2725
2726 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2727 // but we can't go directly to that logic because you can't bitcast a vector
2728 // of pointers to a vector of integers. Therefore, make the pointer vector
2729 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2730 // new value, and then inttoptr the result vector back. This will then allow
2731 // the rest of legalization to take over.
2732 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2733 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
2734 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
2735
2736 auto IntVecSource = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
2737 auto IntIns = B.buildPtrToInt(Dst: IntTy, Src: Ins);
2738 auto IntVecDest = B.buildInsertVectorElement(Res: IntVecTy, Val: IntVecSource, Elt: IntIns,
2739 Idx: MI.getOperand(i: 3));
2740 B.buildIntToPtr(Dst, Src: IntVecDest);
2741 MI.eraseFromParent();
2742 return true;
2743 }
2744
2745 // FIXME: Artifact combiner probably should have replaced the truncated
2746 // constant before this, so we shouldn't need
2747 // getIConstantVRegValWithLookThrough.
2748 std::optional<ValueAndVReg> MaybeIdxVal =
2749 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 3).getReg(), MRI);
2750 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2751 return true;
2752
2753 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2754
2755 unsigned NumElts = VecTy.getNumElements();
2756 if (IdxVal < NumElts) {
2757 SmallVector<Register, 8> SrcRegs;
2758 for (unsigned i = 0; i < NumElts; ++i)
2759 SrcRegs.push_back(Elt: MRI.createGenericVirtualRegister(Ty: EltTy));
2760 B.buildUnmerge(Res: SrcRegs, Op: Vec);
2761
2762 SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
2763 B.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
2764 } else {
2765 B.buildUndef(Res: Dst);
2766 }
2767
2768 MI.eraseFromParent();
2769 return true;
2770}
2771
2772bool AMDGPULegalizerInfo::legalizeSinCos(
2773 MachineInstr &MI, MachineRegisterInfo &MRI,
2774 MachineIRBuilder &B) const {
2775
2776 Register DstReg = MI.getOperand(i: 0).getReg();
2777 Register SrcReg = MI.getOperand(i: 1).getReg();
2778 LLT Ty = MRI.getType(Reg: DstReg);
2779 unsigned Flags = MI.getFlags();
2780
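// The hardware sin/cos expect their input pre-scaled by 1/(2*pi), i.e. in
// revolutions, so multiply first; on subtargets with a reduced trig input
// range, the fract intrinsic additionally brings the scaled operand into
// [0, 1).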
2781 Register TrigVal;
2782 auto OneOver2Pi = B.buildFConstant(Res: Ty, Val: 0.5 * numbers::inv_pi);
2783 if (ST.hasTrigReducedRange()) {
2784 auto MulVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags);
2785 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2786 .addUse(MulVal.getReg(Idx: 0))
2787 .setMIFlags(Flags)
2788 .getReg(0);
2789 } else
2790 TrigVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags).getReg(Idx: 0);
2791
2792 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2793 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2794 B.buildIntrinsic(ID: TrigIntrin, Res: ArrayRef<Register>(DstReg))
2795 .addUse(RegNo: TrigVal)
2796 .setMIFlags(Flags);
2797 MI.eraseFromParent();
2798 return true;
2799}
2800
2801bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2802 MachineIRBuilder &B,
2803 const GlobalValue *GV,
2804 int64_t Offset,
2805 unsigned GAFlags) const {
2806 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2807 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2808 // to the following code sequence:
2809 //
2810 // For constant address space:
2811 // s_getpc_b64 s[0:1]
2812 // s_add_u32 s0, s0, $symbol
2813 // s_addc_u32 s1, s1, 0
2814 //
2815 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2816 // a fixup or relocation is emitted to replace $symbol with a literal
2817 // constant, which is a pc-relative offset from the encoding of the $symbol
2818 // operand to the global variable.
2819 //
2820 // For global address space:
2821 // s_getpc_b64 s[0:1]
2822 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2823 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2824 //
2825 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2826 // fixups or relocations are emitted to replace $symbol@*@lo and
2827 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2828 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2829 // operand to the global variable.
2830
2831 LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
2832
2833 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2834 B.getMRI()->createGenericVirtualRegister(Ty: ConstPtrTy);
2835
2836 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2837 .addDef(PCReg);
2838
2839 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags);
2840 if (GAFlags == SIInstrInfo::MO_NONE)
2841 MIB.addImm(Val: 0);
2842 else
2843 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 1);
2844
2845 if (!B.getMRI()->getRegClassOrNull(PCReg))
2846 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2847
2848 if (PtrTy.getSizeInBits() == 32)
2849 B.buildExtract(Res: DstReg, Src: PCReg, Index: 0);
2850 return true;
2851}
2852
2853 // Emit an ABS32_LO / ABS32_HI relocation stub.
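// The emitted sequence is roughly (for a 64-bit pointer, register numbers
// illustrative):
//   s_mov_b32 s0, $symbol@abs32@lo
//   s_mov_b32 s1, $symbol@abs32@hi
// with fixups/relocations filling in the two 32-bit halves of the symbol's
// absolute address.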
2854void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2855 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2856 MachineRegisterInfo &MRI) const {
2857 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2858
2859 LLT S32 = LLT::scalar(SizeInBits: 32);
2860
2861 // Use the destination register directly, if and only if we only need the
2862 // lower half of the address and no register class has been set on it.
2863 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(Reg: DstReg)
2864 ? DstReg
2865 : MRI.createGenericVirtualRegister(Ty: S32);
2866
2867 if (!MRI.getRegClassOrNull(AddrLo))
2868 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2869
2870 // Write the lower half.
2871 B.buildInstr(AMDGPU::S_MOV_B32)
2872 .addDef(AddrLo)
2873 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2874
2875 // If required, write the upper half as well.
2876 if (RequiresHighHalf) {
2877 assert(PtrTy.getSizeInBits() == 64 &&
2878 "Must provide a 64-bit pointer type!");
2879
2880 Register AddrHi = MRI.createGenericVirtualRegister(Ty: S32);
2881 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2882
2883 B.buildInstr(AMDGPU::S_MOV_B32)
2884 .addDef(AddrHi)
2885 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2886
2887 // Use the destination register directly, if and only if no register class
2888 // has been set on it.
2889 Register AddrDst = !MRI.getRegClassOrNull(Reg: DstReg)
2890 ? DstReg
2891 : MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2892
2893 if (!MRI.getRegClassOrNull(AddrDst))
2894 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2895
2896 B.buildMergeValues(Res: AddrDst, Ops: {AddrLo, AddrHi});
2897
2898 // If we created a new register for the destination, cast the result into
2899 // the final output.
2900 if (AddrDst != DstReg)
2901 B.buildCast(Dst: DstReg, Src: AddrDst);
2902 } else if (AddrLo != DstReg) {
2903 // If we created a new register for the destination, cast the result into
2904 // the final output.
2905 B.buildCast(Dst: DstReg, Src: AddrLo);
2906 }
2907}
2908
2909bool AMDGPULegalizerInfo::legalizeGlobalValue(
2910 MachineInstr &MI, MachineRegisterInfo &MRI,
2911 MachineIRBuilder &B) const {
2912 Register DstReg = MI.getOperand(i: 0).getReg();
2913 LLT Ty = MRI.getType(Reg: DstReg);
2914 unsigned AS = Ty.getAddressSpace();
2915
2916 const GlobalValue *GV = MI.getOperand(i: 1).getGlobal();
2917 MachineFunction &MF = B.getMF();
2918 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2919
2920 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2921 if (!MFI->isModuleEntryFunction() &&
2922 !GV->getName().equals(RHS: "llvm.amdgcn.module.lds")) {
2923 const Function &Fn = MF.getFunction();
2924 DiagnosticInfoUnsupported BadLDSDecl(
2925 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2926 DS_Warning);
2927 Fn.getContext().diagnose(DI: BadLDSDecl);
2928
2929 // We currently don't have a way to correctly allocate LDS objects that
2930 // aren't directly associated with a kernel. We do force inlining of
2931 // functions that use local objects. However, if these dead functions are
2932 // not eliminated, we don't want a compile time error. Just emit a warning
2933 // and a trap, since there should be no callable path here.
2934 B.buildTrap();
2935 B.buildUndef(Res: DstReg);
2936 MI.eraseFromParent();
2937 return true;
2938 }
2939
2940 // TODO: We could emit code to handle the initialization somewhere.
2941 // We ignore the initializer for now and legalize it to allow selection.
2942 // The initializer will be diagnosed as an error during assembly emission anyway.
2943 const SITargetLowering *TLI = ST.getTargetLowering();
2944 if (!TLI->shouldUseLDSConstAddress(GV)) {
2945 MI.getOperand(i: 1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2946 return true; // Leave in place;
2947 }
2948
2949 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2950 Type *Ty = GV->getValueType();
2951 // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
2952 // zero-sized type in other languages, to declare dynamic shared memory
2953 // whose size is not known at compile time. These variables are allocated
2954 // by the runtime and placed directly after the statically allocated ones.
2955 // They all share the same offset.
2956 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2957 // Adjust alignment for that dynamic shared memory array.
2958 MFI->setDynLDSAlign(F: MF.getFunction(), GV: *cast<GlobalVariable>(Val: GV));
2959 LLT S32 = LLT::scalar(SizeInBits: 32);
2960 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
2961 B.buildIntToPtr(Dst: DstReg, Src: Sz);
2962 MI.eraseFromParent();
2963 return true;
2964 }
2965 }
2966
2967 B.buildConstant(Res: DstReg, Val: MFI->allocateLDSGlobal(DL: B.getDataLayout(),
2968 GV: *cast<GlobalVariable>(Val: GV)));
2969 MI.eraseFromParent();
2970 return true;
2971 }
2972
2973 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
2974 buildAbsGlobalAddress(DstReg, PtrTy: Ty, B, GV, MRI);
2975 MI.eraseFromParent();
2976 return true;
2977 }
2978
2979 const SITargetLowering *TLI = ST.getTargetLowering();
2980
2981 if (TLI->shouldEmitFixup(GV)) {
2982 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0);
2983 MI.eraseFromParent();
2984 return true;
2985 }
2986
2987 if (TLI->shouldEmitPCReloc(GV)) {
2988 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_REL32);
2989 MI.eraseFromParent();
2990 return true;
2991 }
2992
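 // Otherwise the address is loaded from the GOT: the pc-relative sequence
 // below computes the address of the GOT entry, and an invariant 64-bit load
 // fetches the actual address of the global.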
2993 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
2994 Register GOTAddr = MRI.createGenericVirtualRegister(Ty: PtrTy);
2995
2996 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2997 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2998 PtrInfo: MachinePointerInfo::getGOT(MF),
2999 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3000 MachineMemOperand::MOInvariant,
3001 MemTy: LoadTy, base_alignment: Align(8));
3002
3003 buildPCRelGlobalAddress(DstReg: GOTAddr, PtrTy, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_GOTPCREL32);
3004
3005 if (Ty.getSizeInBits() == 32) {
3006 // Truncate if this is a 32-bit constant address.
3007 auto Load = B.buildLoad(Res: PtrTy, Addr: GOTAddr, MMO&: *GOTMMO);
3008 B.buildExtract(Res: DstReg, Src: Load, Index: 0);
3009 } else
3010 B.buildLoad(Res: DstReg, Addr: GOTAddr, MMO&: *GOTMMO);
3011
3012 MI.eraseFromParent();
3013 return true;
3014}
3015
3016static LLT widenToNextPowerOf2(LLT Ty) {
3017 if (Ty.isVector())
3018 return Ty.changeElementCount(
3019 EC: ElementCount::getFixed(MinVal: PowerOf2Ceil(A: Ty.getNumElements())));
3020 return LLT::scalar(SizeInBits: PowerOf2Ceil(A: Ty.getSizeInBits()));
3021}
3022
3023bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3024 MachineInstr &MI) const {
3025 MachineIRBuilder &B = Helper.MIRBuilder;
3026 MachineRegisterInfo &MRI = *B.getMRI();
3027 GISelChangeObserver &Observer = Helper.Observer;
3028
3029 Register PtrReg = MI.getOperand(i: 1).getReg();
3030 LLT PtrTy = MRI.getType(Reg: PtrReg);
3031 unsigned AddrSpace = PtrTy.getAddressSpace();
3032
3033 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3034 LLT ConstPtr = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3035 auto Cast = B.buildAddrSpaceCast(Dst: ConstPtr, Src: PtrReg);
3036 Observer.changingInstr(MI);
3037 MI.getOperand(i: 1).setReg(Cast.getReg(Idx: 0));
3038 Observer.changedInstr(MI);
3039 return true;
3040 }
3041
3042 if (MI.getOpcode() != AMDGPU::G_LOAD)
3043 return false;
3044
3045 Register ValReg = MI.getOperand(i: 0).getReg();
3046 LLT ValTy = MRI.getType(Reg: ValReg);
3047
3048 if (hasBufferRsrcWorkaround(Ty: ValTy)) {
3049 Observer.changingInstr(MI);
3050 castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
3051 Observer.changedInstr(MI);
3052 return true;
3053 }
3054
3055 MachineMemOperand *MMO = *MI.memoperands_begin();
3056 const unsigned ValSize = ValTy.getSizeInBits();
3057 const LLT MemTy = MMO->getMemoryType();
3058 const Align MemAlign = MMO->getAlign();
3059 const unsigned MemSize = MemTy.getSizeInBits();
3060 const uint64_t AlignInBits = 8 * MemAlign.value();
3061
3062 // Widen non-power-of-2 loads to the alignment if needed
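 // For example (assuming the alignment allows it), a 96-bit <3 x s32> load is
 // widened to a 128-bit load and the extra bits are dropped again below.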
3063 if (shouldWidenLoad(ST, MemoryTy: MemTy, AlignInBits, AddrSpace, Opcode: MI.getOpcode())) {
3064 const unsigned WideMemSize = PowerOf2Ceil(A: MemSize);
3065
3066 // This was already the correct extending load result type, so just adjust
3067 // the memory type.
3068 if (WideMemSize == ValSize) {
3069 MachineFunction &MF = B.getMF();
3070
3071 MachineMemOperand *WideMMO =
3072 MF.getMachineMemOperand(MMO, Offset: 0, Size: WideMemSize / 8);
3073 Observer.changingInstr(MI);
3074 MI.setMemRefs(MF, MemRefs: {WideMMO});
3075 Observer.changedInstr(MI);
3076 return true;
3077 }
3078
3079 // Don't bother handling this edge case, which should probably never be produced.
3080 if (ValSize > WideMemSize)
3081 return false;
3082
3083 LLT WideTy = widenToNextPowerOf2(Ty: ValTy);
3084
3085 Register WideLoad;
3086 if (!WideTy.isVector()) {
3087 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3088 B.buildTrunc(Res: ValReg, Op: WideLoad).getReg(Idx: 0);
3089 } else {
3090 // Extract the subvector.
3091
3092 if (isRegisterType(Ty: ValTy)) {
3093 // If this is a case where G_EXTRACT is legal, use it.
3094 // (e.g. <3 x s32> -> <4 x s32>)
3095 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3096 B.buildExtract(Res: ValReg, Src: WideLoad, Index: 0);
3097 } else {
3098 // For cases where the widened type isn't a nice register value, unmerge
3099 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3100 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3101 B.buildDeleteTrailingVectorElements(Res: ValReg, Op0: WideLoad);
3102 }
3103 }
3104
3105 MI.eraseFromParent();
3106 return true;
3107 }
3108
3109 return false;
3110}
3111
3112bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3113 MachineInstr &MI) const {
3114 MachineIRBuilder &B = Helper.MIRBuilder;
3115 MachineRegisterInfo &MRI = *B.getMRI();
3116 GISelChangeObserver &Observer = Helper.Observer;
3117
3118 Register DataReg = MI.getOperand(i: 0).getReg();
3119 LLT DataTy = MRI.getType(Reg: DataReg);
3120
3121 if (hasBufferRsrcWorkaround(Ty: DataTy)) {
3122 Observer.changingInstr(MI);
3123 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
3124 Observer.changedInstr(MI);
3125 return true;
3126 }
3127 return false;
3128}
3129
3130bool AMDGPULegalizerInfo::legalizeFMad(
3131 MachineInstr &MI, MachineRegisterInfo &MRI,
3132 MachineIRBuilder &B) const {
3133 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3134 assert(Ty.isScalar());
3135
3136 MachineFunction &MF = B.getMF();
3137 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3138
3139 // TODO: Always legal with future ftz flag.
3140 // FIXME: Do we need just output?
3141 if (Ty == LLT::float32() &&
3142 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3143 return true;
3144 if (Ty == LLT::float16() &&
3145 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3146 return true;
3147
3148 MachineIRBuilder HelperBuilder(MI);
3149 GISelObserverWrapper DummyObserver;
3150 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3151 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3152}
3153
3154bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3155 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3156 Register DstReg = MI.getOperand(i: 0).getReg();
3157 Register PtrReg = MI.getOperand(i: 1).getReg();
3158 Register CmpVal = MI.getOperand(i: 2).getReg();
3159 Register NewVal = MI.getOperand(i: 3).getReg();
3160
3161 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3162 "this should not have been custom lowered");
3163
3164 LLT ValTy = MRI.getType(Reg: CmpVal);
3165 LLT VecTy = LLT::fixed_vector(NumElements: 2, ScalarTy: ValTy);
3166
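 // The target cmpxchg pseudo takes the new value and the compare value packed
 // into a single vector operand, with the new value in element 0.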
3167 Register PackedVal = B.buildBuildVector(Res: VecTy, Ops: { NewVal, CmpVal }).getReg(Idx: 0);
3168
3169 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3170 .addDef(DstReg)
3171 .addUse(PtrReg)
3172 .addUse(PackedVal)
3173 .setMemRefs(MI.memoperands());
3174
3175 MI.eraseFromParent();
3176 return true;
3177}
3178
3179/// Return true if it's known that \p Src can never be an f32 denormal value.
3180static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3181 Register Src) {
3182 const MachineInstr *DefMI = MRI.getVRegDef(Reg: Src);
3183 switch (DefMI->getOpcode()) {
3184 case TargetOpcode::G_INTRINSIC: {
3185 switch (cast<GIntrinsic>(Val: DefMI)->getIntrinsicID()) {
3186 case Intrinsic::amdgcn_frexp_mant:
3187 return true;
3188 default:
3189 break;
3190 }
3191
3192 break;
3193 }
3194 case TargetOpcode::G_FFREXP: {
3195 if (DefMI->getOperand(i: 0).getReg() == Src)
3196 return true;
3197 break;
3198 }
3199 case TargetOpcode::G_FPEXT: {
3200 return MRI.getType(Reg: DefMI->getOperand(i: 1).getReg()) == LLT::scalar(SizeInBits: 16);
3201 }
3202 default:
3203 return false;
3204 }
3205
3206 return false;
3207}
3208
3209static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3210 if (Flags & MachineInstr::FmAfn)
3211 return true;
3212 const auto &Options = MF.getTarget().Options;
3213 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3214}
3215
3216static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3217 unsigned Flags) {
3218 return !valueIsKnownNeverF32Denorm(MRI: MF.getRegInfo(), Src) &&
3219 MF.getDenormalMode(FPType: APFloat::IEEEsingle()).Input !=
3220 DenormalMode::PreserveSign;
3221}
3222
3223std::pair<Register, Register>
3224AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3225 unsigned Flags) const {
3226 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags))
3227 return {};
3228
3229 const LLT F32 = LLT::scalar(SizeInBits: 32);
3230 auto SmallestNormal = B.buildFConstant(
3231 Res: F32, Val: APFloat::getSmallestNormalized(Sem: APFloat::IEEEsingle()));
3232 auto IsLtSmallestNormal =
3233 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: SmallestNormal);
3234
3235 auto Scale32 = B.buildFConstant(Res: F32, Val: 0x1.0p+32);
3236 auto One = B.buildFConstant(Res: F32, Val: 1.0);
3237 auto ScaleFactor =
3238 B.buildSelect(Res: F32, Tst: IsLtSmallestNormal, Op0: Scale32, Op1: One, Flags);
3239 auto ScaledInput = B.buildFMul(Dst: F32, Src0: Src, Src1: ScaleFactor, Flags);
3240
3241 return {ScaledInput.getReg(Idx: 0), IsLtSmallestNormal.getReg(Idx: 0)};
3242}
3243
3244bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3245 MachineIRBuilder &B) const {
3246 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3247 // If we have to handle denormals, scale up the input and adjust the result.
3248
3249 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3250 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
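 // (Since log2(x * 2^32) == log2(x) + 32, scaling a denormal input by 2^32
 // makes it a normal value without losing bits, and the extra 32 is
 // subtracted back out of the result below.)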
3251
3252 Register Dst = MI.getOperand(i: 0).getReg();
3253 Register Src = MI.getOperand(i: 1).getReg();
3254 LLT Ty = B.getMRI()->getType(Reg: Dst);
3255 unsigned Flags = MI.getFlags();
3256
3257 if (Ty == LLT::scalar(SizeInBits: 16)) {
3258 const LLT F32 = LLT::scalar(SizeInBits: 32);
3259 // Nothing in half is a denormal when promoted to f32.
3260 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3261 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3262 .addUse(Ext.getReg(0))
3263 .setMIFlags(Flags);
3264 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3265 MI.eraseFromParent();
3266 return true;
3267 }
3268
3269 assert(Ty == LLT::scalar(32));
3270
3271 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3272 if (!ScaledInput) {
3273 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3274 .addUse(Src)
3275 .setMIFlags(Flags);
3276 MI.eraseFromParent();
3277 return true;
3278 }
3279
3280 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3281 .addUse(ScaledInput)
3282 .setMIFlags(Flags);
3283
3284 auto ThirtyTwo = B.buildFConstant(Res: Ty, Val: 32.0);
3285 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3286 auto ResultOffset =
3287 B.buildSelect(Res: Ty, Tst: IsLtSmallestNormal, Op0: ThirtyTwo, Op1: Zero, Flags);
3288 B.buildFSub(Dst, Src0: Log2, Src1: ResultOffset, Flags);
3289
3290 MI.eraseFromParent();
3291 return true;
3292}
3293
3294static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3295 Register Z, unsigned Flags) {
3296 auto FMul = B.buildFMul(Dst: Ty, Src0: X, Src1: Y, Flags);
3297 return B.buildFAdd(Dst: Ty, Src0: FMul, Src1: Z, Flags).getReg(Idx: 0);
3298}
3299
3300bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3301 MachineIRBuilder &B) const {
3302 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3303 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3304
3305 MachineRegisterInfo &MRI = *B.getMRI();
3306 Register Dst = MI.getOperand(i: 0).getReg();
3307 Register X = MI.getOperand(i: 1).getReg();
3308 unsigned Flags = MI.getFlags();
3309 const LLT Ty = MRI.getType(Reg: X);
3310 MachineFunction &MF = B.getMF();
3311
3312 const LLT F32 = LLT::scalar(SizeInBits: 32);
3313 const LLT F16 = LLT::scalar(SizeInBits: 16);
3314
3315 const AMDGPUTargetMachine &TM =
3316 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3317
3318 if (Ty == F16 || MI.getFlag(Flag: MachineInstr::FmAfn) ||
3319 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3320 if (Ty == F16 && !ST.has16BitInsts()) {
3321 Register LogVal = MRI.createGenericVirtualRegister(Ty: F32);
3322 auto PromoteSrc = B.buildFPExt(Res: F32, Op: X);
3323 legalizeFlogUnsafe(B, Dst: LogVal, Src: PromoteSrc.getReg(Idx: 0), IsLog10, Flags);
3324 B.buildFPTrunc(Res: Dst, Op: LogVal);
3325 } else {
3326 legalizeFlogUnsafe(B, Dst, Src: X, IsLog10, Flags);
3327 }
3328
3329 MI.eraseFromParent();
3330 return true;
3331 }
3332
3333 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src: X, Flags);
3334 if (ScaledInput)
3335 X = ScaledInput;
3336
3337 auto Y =
3338 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3339
3340 Register R;
3341 if (ST.hasFastFMAF32()) {
3342 // c + cc is ln(2)/ln(10) to more than 49 bits
3343 const float c_log10 = 0x1.344134p-2f;
3344 const float cc_log10 = 0x1.09f79ep-26f;
3345
3346 // c + cc is ln(2) to more than 49 bits
3347 const float c_log = 0x1.62e42ep-1f;
3348 const float cc_log = 0x1.efa39ep-25f;
3349
3350 auto C = B.buildFConstant(Res: Ty, Val: IsLog10 ? c_log10 : c_log);
3351 auto CC = B.buildFConstant(Res: Ty, Val: IsLog10 ? cc_log10 : cc_log);
3352
3353 R = B.buildFMul(Dst: Ty, Src0: Y, Src1: C, Flags).getReg(0);
3354 auto NegR = B.buildFNeg(Dst: Ty, Src0: R, Flags);
3355 auto FMA0 = B.buildFMA(Dst: Ty, Src0: Y, Src1: C, Src2: NegR, Flags);
3356 auto FMA1 = B.buildFMA(Dst: Ty, Src0: Y, Src1: CC, Src2: FMA0, Flags);
3357 R = B.buildFAdd(Dst: Ty, Src0: R, Src1: FMA1, Flags).getReg(0);
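 // The sequence above evaluates R = Y * (c + cc) in extended precision:
 // FMA0 recovers the rounding error of Y * c, and FMA1 folds in the
 // low-order term Y * cc before the final add.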
3358 } else {
3359 // ch+ct is ln(2)/ln(10) to more than 36 bits
3360 const float ch_log10 = 0x1.344000p-2f;
3361 const float ct_log10 = 0x1.3509f6p-18f;
3362
3363 // ch + ct is ln(2) to more than 36 bits
3364 const float ch_log = 0x1.62e000p-1f;
3365 const float ct_log = 0x1.0bfbe8p-15f;
3366
3367 auto CH = B.buildFConstant(Res: Ty, Val: IsLog10 ? ch_log10 : ch_log);
3368 auto CT = B.buildFConstant(Res: Ty, Val: IsLog10 ? ct_log10 : ct_log);
3369
3370 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3371 auto YH = B.buildAnd(Dst: Ty, Src0: Y, Src1: MaskConst);
3372 auto YT = B.buildFSub(Dst: Ty, Src0: Y, Src1: YH, Flags);
3373 auto YTCT = B.buildFMul(Dst: Ty, Src0: YT, Src1: CT, Flags);
3374
3375 Register Mad0 =
3376 getMad(B, Ty, YH.getReg(0), CT.getReg(Idx: 0), YTCT.getReg(0), Flags);
3377 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(Idx: 0), Mad0, Flags);
3378 R = getMad(B, Ty, YH.getReg(0), CH.getReg(Idx: 0), Mad1, Flags);
3379 }
3380
3381 const bool IsFiniteOnly =
3382 (MI.getFlag(Flag: MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3383 (MI.getFlag(Flag: MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3384
3385 if (!IsFiniteOnly) {
3386 // Expand isfinite(x) => fabs(x) < inf
3387 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3388 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Y);
3389 auto IsFinite =
3390 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
3391 R = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: R, Op1: Y, Flags).getReg(0);
3392 }
3393
3394 if (ScaledInput) {
3395 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3396 auto ShiftK =
3397 B.buildFConstant(Res: Ty, Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3398 auto Shift = B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ShiftK, Op1: Zero, Flags);
3399 B.buildFSub(Dst, Src0: R, Src1: Shift, Flags);
3400 } else {
3401 B.buildCopy(Res: Dst, Op: R);
3402 }
3403
3404 MI.eraseFromParent();
3405 return true;
3406}
3407
3408bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3409 Register Src, bool IsLog10,
3410 unsigned Flags) const {
3411 const double Log2BaseInverted =
3412 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3413
3414 LLT Ty = B.getMRI()->getType(Reg: Dst);
3415
3416 if (Ty == LLT::scalar(SizeInBits: 32)) {
3417 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3418 if (ScaledInput) {
3419 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3420 .addUse(Src)
3421 .setMIFlags(Flags);
3422 auto ScaledResultOffset = B.buildFConstant(Res: Ty, Val: -32.0 * Log2BaseInverted);
3423 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3424 auto ResultOffset =
3425 B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ScaledResultOffset, Op1: Zero, Flags);
3426 auto Log2Inv = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3427
3428 if (ST.hasFastFMAF32())
3429 B.buildFMA(Dst, Src0: LogSrc, Src1: Log2Inv, Src2: ResultOffset, Flags);
3430 else {
3431 auto Mul = B.buildFMul(Dst: Ty, Src0: LogSrc, Src1: Log2Inv, Flags);
3432 B.buildFAdd(Dst, Src0: Mul, Src1: ResultOffset, Flags);
3433 }
3434
3435 return true;
3436 }
3437 }
3438
3439 auto Log2Operand = Ty == LLT::scalar(16)
3440 ? B.buildFLog2(Ty, Src, Flags)
3441 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3442 .addUse(Src)
3443 .setMIFlags(Flags);
3444 auto Log2BaseInvertedOperand = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3445 B.buildFMul(Dst, Src0: Log2Operand, Src1: Log2BaseInvertedOperand, Flags);
3446 return true;
3447}
3448
3449bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3450 MachineIRBuilder &B) const {
3451 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3452 // If we have to handle denormals, scale up the input and adjust the result.
3453
3454 Register Dst = MI.getOperand(i: 0).getReg();
3455 Register Src = MI.getOperand(i: 1).getReg();
3456 unsigned Flags = MI.getFlags();
3457 LLT Ty = B.getMRI()->getType(Reg: Dst);
3458 const LLT F16 = LLT::scalar(SizeInBits: 16);
3459 const LLT F32 = LLT::scalar(SizeInBits: 32);
3460
3461 if (Ty == F16) {
3462 // Nothing in half is a denormal when promoted to f32.
3463 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3464 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3465 .addUse(Ext.getReg(0))
3466 .setMIFlags(Flags);
3467 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3468 MI.eraseFromParent();
3469 return true;
3470 }
3471
3472 assert(Ty == F32);
3473
3474 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags)) {
3475 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3476 .addUse(Src)
3477 .setMIFlags(Flags);
3478 MI.eraseFromParent();
3479 return true;
3480 }
3481
3482 // bool needs_scaling = x < -0x1.f80000p+6f;
3483 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
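 // (This relies on exp2(x + 64) * 2^-64 == exp2(x); the offset keeps the
 // intrinsic's input out of the range where its result would be denormal.)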
3484
3485 // -nextafter(128.0, -1)
3486 auto RangeCheckConst = B.buildFConstant(Res: Ty, Val: -0x1.f80000p+6f);
3487 auto NeedsScaling = B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
3488 Op1: RangeCheckConst, Flags);
3489
3490 auto SixtyFour = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3491 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3492 auto AddOffset = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: SixtyFour, Op1: Zero, Flags);
3493 auto AddInput = B.buildFAdd(Dst: F32, Src0: Src, Src1: AddOffset, Flags);
3494
3495 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3496 .addUse(AddInput.getReg(0))
3497 .setMIFlags(Flags);
3498
3499 auto TwoExpNeg64 = B.buildFConstant(Res: Ty, Val: 0x1.0p-64f);
3500 auto One = B.buildFConstant(Res: Ty, Val: 1.0);
3501 auto ResultScale = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: TwoExpNeg64, Op1: One, Flags);
3502 B.buildFMul(Dst, Src0: Exp2, Src1: ResultScale, Flags);
3503 MI.eraseFromParent();
3504 return true;
3505}
3506
3507bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3508 Register X, unsigned Flags) const {
3509 LLT Ty = B.getMRI()->getType(Reg: Dst);
3510 LLT F32 = LLT::scalar(SizeInBits: 32);
3511
3512 if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
3513 auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
3514 auto Mul = B.buildFMul(Dst: Ty, Src0: X, Src1: Log2E, Flags);
3515
3516 if (Ty == F32) {
3517 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3518 .addUse(Mul.getReg(0))
3519 .setMIFlags(Flags);
3520 } else {
3521 B.buildFExp2(Dst, Src: Mul.getReg(Idx: 0), Flags);
3522 }
3523
3524 return true;
3525 }
3526
3527 auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.5d58a0p+6f);
3528 auto NeedsScaling =
3529 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold, Flags);
3530 auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3531 auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
3532 auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X, Flags);
3533
3534 auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
3535 auto ExpInput = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: Log2E, Flags);
3536
3537 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3538 .addUse(ExpInput.getReg(0))
3539 .setMIFlags(Flags);
3540
3541 auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.969d48p-93f);
3542 auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: Exp2, Src1: ResultScaleFactor, Flags);
3543 B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: Exp2, Flags);
3544 return true;
3545}
3546
3547bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3548 MachineIRBuilder &B) const {
3549 Register Dst = MI.getOperand(i: 0).getReg();
3550 Register X = MI.getOperand(i: 1).getReg();
3551 const unsigned Flags = MI.getFlags();
3552 MachineFunction &MF = B.getMF();
3553 MachineRegisterInfo &MRI = *B.getMRI();
3554 LLT Ty = MRI.getType(Reg: Dst);
3555 const LLT F16 = LLT::scalar(SizeInBits: 16);
3556 const LLT F32 = LLT::scalar(SizeInBits: 32);
3557 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3558
3559 if (Ty == F16) {
3560 // v_exp_f16 (fmul x, log2e)
3561 if (allowApproxFunc(MF, Flags)) {
3562 // TODO: Does this really require fast?
3563 legalizeFExpUnsafe(B, Dst, X, Flags);
3564 MI.eraseFromParent();
3565 return true;
3566 }
3567
3568 // exp(f16 x) ->
3569 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3570
3571 // Nothing in half is a denormal when promoted to f32.
3572 auto Ext = B.buildFPExt(Res: F32, Op: X, Flags);
3573 Register Lowered = MRI.createGenericVirtualRegister(Ty: F32);
3574 legalizeFExpUnsafe(B, Dst: Lowered, X: Ext.getReg(Idx: 0), Flags);
3575 B.buildFPTrunc(Res: Dst, Op: Lowered, Flags);
3576 MI.eraseFromParent();
3577 return true;
3578 }
3579
3580 assert(Ty == F32);
3581
3582 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3583 // library behavior. Also, is known-not-daz source sufficient?
3584 if (allowApproxFunc(MF, Flags)) {
3585 legalizeFExpUnsafe(B, Dst, X, Flags);
3586 MI.eraseFromParent();
3587 return true;
3588 }
3589
3590 // Algorithm:
3591 //
3592 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3593 //
3594 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3595 // n = 64*m + j, 0 <= j < 64
3596 //
3597 // e^x = 2^((64*m + j + f)/64)
3598 // = (2^m) * (2^(j/64)) * 2^(f/64)
3599 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3600 //
3601 // f = x*(64/ln(2)) - n
3602 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3603 //
3604 // e^x = (2^m) * (2^(j/64)) * e^r
3605 //
3606 // (2^(j/64)) is precomputed
3607 //
3608 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3609 // e^r = 1 + q
3610 //
3611 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3612 //
3613 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
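 //
 // The code below roughly follows this outline with the 2^(j/64) table folded
 // away: PH + PL approximates x * log2(e) (or x * log2(10) for exp10) in
 // extended precision, E = roundeven(PH) plays the role of n, and the result
 // is computed as ldexp(exp2(PH - E + PL), E), followed by underflow/overflow
 // clamping.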
3614 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3615 Register PH, PL;
3616
3617 if (ST.hasFastFMAF32()) {
3618 const float c_exp = numbers::log2ef;
3619 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3620 const float c_exp10 = 0x1.a934f0p+1f;
3621 const float cc_exp10 = 0x1.2f346ep-24f;
3622
3623 auto C = B.buildFConstant(Res: Ty, Val: IsExp10 ? c_exp10 : c_exp);
3624 PH = B.buildFMul(Dst: Ty, Src0: X, Src1: C, Flags).getReg(Idx: 0);
3625 auto NegPH = B.buildFNeg(Dst: Ty, Src0: PH, Flags);
3626 auto FMA0 = B.buildFMA(Dst: Ty, Src0: X, Src1: C, Src2: NegPH, Flags);
3627
3628 auto CC = B.buildFConstant(Res: Ty, Val: IsExp10 ? cc_exp10 : cc_exp);
3629 PL = B.buildFMA(Dst: Ty, Src0: X, Src1: CC, Src2: FMA0, Flags).getReg(Idx: 0);
3630 } else {
3631 const float ch_exp = 0x1.714000p+0f;
3632 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3633
3634 const float ch_exp10 = 0x1.a92000p+1f;
3635 const float cl_exp10 = 0x1.4f0978p-11f;
3636
3637 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3638 auto XH = B.buildAnd(Dst: Ty, Src0: X, Src1: MaskConst);
3639 auto XL = B.buildFSub(Dst: Ty, Src0: X, Src1: XH, Flags);
3640
3641 auto CH = B.buildFConstant(Res: Ty, Val: IsExp10 ? ch_exp10 : ch_exp);
3642 PH = B.buildFMul(Dst: Ty, Src0: XH, Src1: CH, Flags).getReg(Idx: 0);
3643
3644 auto CL = B.buildFConstant(Res: Ty, Val: IsExp10 ? cl_exp10 : cl_exp);
3645 auto XLCL = B.buildFMul(Dst: Ty, Src0: XL, Src1: CL, Flags);
3646
3647 Register Mad0 =
3648 getMad(B, Ty, X: XL.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: XLCL.getReg(Idx: 0), Flags);
3649 PL = getMad(B, Ty, X: XH.getReg(Idx: 0), Y: CL.getReg(Idx: 0), Z: Mad0, Flags);
3650 }
3651
3652 auto E = B.buildIntrinsicRoundeven(Dst: Ty, Src0: PH, Flags);
3653
3654 // It is unsafe to contract this fsub into the PH multiply.
3655 auto PHSubE = B.buildFSub(Dst: Ty, Src0: PH, Src1: E, Flags: FlagsNoContract);
3656 auto A = B.buildFAdd(Dst: Ty, Src0: PHSubE, Src1: PL, Flags);
3657 auto IntE = B.buildFPTOSI(Dst: LLT::scalar(SizeInBits: 32), Src0: E);
3658
3659 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3660 .addUse(A.getReg(0))
3661 .setMIFlags(Flags);
3662 auto R = B.buildFLdexp(Dst: Ty, Src0: Exp2, Src1: IntE, Flags);
3663
3664 auto UnderflowCheckConst =
3665 B.buildFConstant(Res: Ty, Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3666 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3667 auto Underflow =
3668 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: UnderflowCheckConst);
3669
3670 R = B.buildSelect(Res: Ty, Tst: Underflow, Op0: Zero, Op1: R);
3671
3672 const auto &Options = MF.getTarget().Options;
3673
3674 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3675 auto OverflowCheckConst =
3676 B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3677
3678 auto Overflow =
3679 B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: OverflowCheckConst);
3680 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3681 R = B.buildSelect(Res: Ty, Tst: Overflow, Op0: Inf, Op1: R, Flags);
3682 }
3683
3684 B.buildCopy(Res: Dst, Op: R);
3685 MI.eraseFromParent();
3686 return true;
3687}
3688
3689bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3690 MachineIRBuilder &B) const {
3691 Register Dst = MI.getOperand(i: 0).getReg();
3692 Register Src0 = MI.getOperand(i: 1).getReg();
3693 Register Src1 = MI.getOperand(i: 2).getReg();
3694 unsigned Flags = MI.getFlags();
3695 LLT Ty = B.getMRI()->getType(Reg: Dst);
3696 const LLT F16 = LLT::float16();
3697 const LLT F32 = LLT::float32();
3698
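 // pow(x, y) is expanded as exp2(y * log2(x)). The legacy multiply is used so
 // that pow(x, 0) still yields 1 even when log2(x) is infinite, since
 // fmul_legacy treats 0 * inf as 0.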
3699 if (Ty == F32) {
3700 auto Log = B.buildFLog2(Dst: F32, Src: Src0, Flags);
3701 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3702 .addUse(Log.getReg(0))
3703 .addUse(Src1)
3704 .setMIFlags(Flags);
3705 B.buildFExp2(Dst, Src: Mul, Flags);
3706 } else if (Ty == F16) {
3707 // There's no f16 fmul_legacy, so we need to convert for it.
3708 auto Log = B.buildFLog2(Dst: F16, Src: Src0, Flags);
3709 auto Ext0 = B.buildFPExt(Res: F32, Op: Log, Flags);
3710 auto Ext1 = B.buildFPExt(Res: F32, Op: Src1, Flags);
3711 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3712 .addUse(Ext0.getReg(0))
3713 .addUse(Ext1.getReg(0))
3714 .setMIFlags(Flags);
3715 B.buildFExp2(Dst, Src: B.buildFPTrunc(Res: F16, Op: Mul), Flags);
3716 } else
3717 return false;
3718
3719 MI.eraseFromParent();
3720 return true;
3721}
3722
3723// Find a source register, ignoring any possible source modifiers.
3724static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3725 Register ModSrc = OrigSrc;
3726 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3727 ModSrc = SrcFNeg->getOperand(i: 1).getReg();
3728 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3729 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
3730 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3731 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
3732 return ModSrc;
3733}
3734
3735bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3736 MachineRegisterInfo &MRI,
3737 MachineIRBuilder &B) const {
3738
3739 const LLT S1 = LLT::scalar(SizeInBits: 1);
3740 const LLT F64 = LLT::float64();
3741 Register Dst = MI.getOperand(i: 0).getReg();
3742 Register OrigSrc = MI.getOperand(i: 1).getReg();
3743 unsigned Flags = MI.getFlags();
3744 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3745 "this should not have been custom lowered");
3746
3747 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3748 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3749 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3750 // V_FRACT bug is:
3751 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3752 //
3753 // Convert floor(x) to (x - fract(x))
3754
3755 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3756 .addUse(OrigSrc)
3757 .setMIFlags(Flags);
3758
3759 // Give source modifier matching some assistance before obscuring a foldable
3760 // pattern.
3761
3762 // TODO: We can avoid the neg on the fract? The input sign to fract
3763 // shouldn't matter?
3764 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3765
3766 auto Const =
3767 B.buildFConstant(Res: F64, Val: llvm::bit_cast<double>(from: 0x3fefffffffffffff));
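 // (0x3fefffffffffffff is the largest double strictly less than 1.0, i.e. the
 // 0.99999999999999999 clamp from the formula above.)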
3768
3769 Register Min = MRI.createGenericVirtualRegister(Ty: F64);
3770
3771 // We don't need to concern ourselves with the snan handling difference, so
3772 // use the one which will directly select.
3773 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3774 if (MFI->getMode().IEEE)
3775 B.buildFMinNumIEEE(Dst: Min, Src0: Fract, Src1: Const, Flags);
3776 else
3777 B.buildFMinNum(Dst: Min, Src0: Fract, Src1: Const, Flags);
3778
3779 Register CorrectedFract = Min;
3780 if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
3781 auto IsNan = B.buildFCmp(Pred: CmpInst::FCMP_ORD, Res: S1, Op0: ModSrc, Op1: ModSrc, Flags);
3782 CorrectedFract = B.buildSelect(Res: F64, Tst: IsNan, Op0: ModSrc, Op1: Min, Flags).getReg(Idx: 0);
3783 }
3784
3785 auto NegFract = B.buildFNeg(Dst: F64, Src0: CorrectedFract, Flags);
3786 B.buildFAdd(Dst, Src0: OrigSrc, Src1: NegFract, Flags);
3787
3788 MI.eraseFromParent();
3789 return true;
3790}
3791
3792// Turn an illegal packed v2s16 build vector into bit operations.
3793// TODO: This should probably be a bitcast action in LegalizerHelper.
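// The lowering is simply a bitcast of the 32-bit merge of the two 16-bit
// sources, e.g. (<2 x s16>) = G_BITCAST (s32 G_MERGE_VALUES lo16, hi16),
// truncating the sources first in the G_BUILD_VECTOR_TRUNC case.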
3794bool AMDGPULegalizerInfo::legalizeBuildVector(
3795 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3796 Register Dst = MI.getOperand(i: 0).getReg();
3797 const LLT S32 = LLT::scalar(SizeInBits: 32);
3798 const LLT S16 = LLT::scalar(SizeInBits: 16);
3799 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3800
3801 Register Src0 = MI.getOperand(i: 1).getReg();
3802 Register Src1 = MI.getOperand(i: 2).getReg();
3803
3804 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3805 assert(MRI.getType(Src0) == S32);
3806 Src0 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 1).getReg()).getReg(Idx: 0);
3807 Src1 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 2).getReg()).getReg(Idx: 0);
3808 }
3809
3810 auto Merge = B.buildMergeLikeInstr(Res: S32, Ops: {Src0, Src1});
3811 B.buildBitcast(Dst, Src: Merge);
3812
3813 MI.eraseFromParent();
3814 return true;
3815}
3816
3817// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3818//
3819// Source and accumulation registers must all be 32-bits.
3820//
3821// TODO: When the multiply is uniform, we should produce a code sequence
3822// that is better suited to instruction selection on the SALU. Instead of
3823// the outer loop going over parts of the result, the outer loop should go
3824// over parts of one of the factors. This should result in instruction
3825// selection that makes full use of S_ADDC_U32 instructions.
3826void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3827 MutableArrayRef<Register> Accum,
3828 ArrayRef<Register> Src0,
3829 ArrayRef<Register> Src1,
3830 bool UsePartialMad64_32,
3831 bool SeparateOddAlignedProducts) const {
3832 // Use (possibly empty) vectors of S1 registers to represent the set of
3833 // carries from one pair of positions to the next.
3834 using Carry = SmallVector<Register, 2>;
3835
3836 MachineIRBuilder &B = Helper.MIRBuilder;
3837 GISelKnownBits &KB = *Helper.getKnownBits();
3838
3839 const LLT S1 = LLT::scalar(SizeInBits: 1);
3840 const LLT S32 = LLT::scalar(SizeInBits: 32);
3841 const LLT S64 = LLT::scalar(SizeInBits: 64);
3842
3843 Register Zero32;
3844 Register Zero64;
3845
3846 auto getZero32 = [&]() -> Register {
3847 if (!Zero32)
3848 Zero32 = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
3849 return Zero32;
3850 };
3851 auto getZero64 = [&]() -> Register {
3852 if (!Zero64)
3853 Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
3854 return Zero64;
3855 };
3856
3857 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3858 for (unsigned i = 0; i < Src0.size(); ++i) {
3859 Src0KnownZeros.push_back(Elt: KB.getKnownBits(R: Src0[i]).isZero());
3860 Src1KnownZeros.push_back(Elt: KB.getKnownBits(R: Src1[i]).isZero());
3861 }
3862
3863 // Merge the given carries into the 32-bit LocalAccum, which is modified
3864 // in-place.
3865 //
3866 // Returns the carry-out, which is a single S1 register or null.
3867 auto mergeCarry =
3868 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3869 if (CarryIn.empty())
3870 return Register();
3871
3872 bool HaveCarryOut = true;
3873 Register CarryAccum;
3874 if (CarryIn.size() == 1) {
3875 if (!LocalAccum) {
3876 LocalAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
3877 return Register();
3878 }
3879
3880 CarryAccum = getZero32();
3881 } else {
3882 CarryAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
3883 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3884 CarryAccum =
3885 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: getZero32(), CarryIn: CarryIn[i])
3886 .getReg(Idx: 0);
3887 }
3888
3889 if (!LocalAccum) {
3890 LocalAccum = getZero32();
3891 HaveCarryOut = false;
3892 }
3893 }
3894
3895 auto Add =
3896 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: LocalAccum, CarryIn: CarryIn.back());
3897 LocalAccum = Add.getReg(Idx: 0);
3898 return HaveCarryOut ? Add.getReg(Idx: 1) : Register();
3899 };
3900
3901 // Build a multiply-add chain to compute
3902 //
3903 // LocalAccum + (partial products at DstIndex)
3904 // + (opportunistic subset of CarryIn)
3905 //
3906 // LocalAccum is an array of one or two 32-bit registers that are updated
3907 // in-place. The incoming registers may be null.
3908 //
3909 // In some edge cases, carry-ins can be consumed "for free". In that case,
3910 // the consumed carry bits are removed from CarryIn in-place.
3911 auto buildMadChain =
3912 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3913 -> Carry {
3914 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3915 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3916
3917 Carry CarryOut;
3918 unsigned j0 = 0;
3919
3920 // Use plain 32-bit multiplication for the most significant part of the
3921 // result by default.
3922 if (LocalAccum.size() == 1 &&
3923 (!UsePartialMad64_32 || !CarryIn.empty())) {
3924 do {
3925 // Skip multiplication if one of the operands is 0
3926 unsigned j1 = DstIndex - j0;
3927 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3928 ++j0;
3929 continue;
3930 }
3931 auto Mul = B.buildMul(Dst: S32, Src0: Src0[j0], Src1: Src1[j1]);
3932 if (!LocalAccum[0] || KB.getKnownBits(R: LocalAccum[0]).isZero()) {
3933 LocalAccum[0] = Mul.getReg(Idx: 0);
3934 } else {
3935 if (CarryIn.empty()) {
3936 LocalAccum[0] = B.buildAdd(Dst: S32, Src0: LocalAccum[0], Src1: Mul).getReg(Idx: 0);
3937 } else {
3938 LocalAccum[0] =
3939 B.buildUAdde(Res: S32, CarryOut: S1, Op0: LocalAccum[0], Op1: Mul, CarryIn: CarryIn.back())
3940 .getReg(Idx: 0);
3941 CarryIn.pop_back();
3942 }
3943 }
3944 ++j0;
3945 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3946 }
3947
3948 // Build full 64-bit multiplies.
3949 if (j0 <= DstIndex) {
3950 bool HaveSmallAccum = false;
3951 Register Tmp;
3952
3953 if (LocalAccum[0]) {
3954 if (LocalAccum.size() == 1) {
3955 Tmp = B.buildAnyExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
3956 HaveSmallAccum = true;
3957 } else if (LocalAccum[1]) {
3958 Tmp = B.buildMergeLikeInstr(Res: S64, Ops: LocalAccum).getReg(Idx: 0);
3959 HaveSmallAccum = false;
3960 } else {
3961 Tmp = B.buildZExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
3962 HaveSmallAccum = true;
3963 }
3964 } else {
3965 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3966 Tmp = getZero64();
3967 HaveSmallAccum = true;
3968 }
3969
3970 do {
3971 unsigned j1 = DstIndex - j0;
3972 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3973 ++j0;
3974 continue;
3975 }
3976 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3977 {Src0[j0], Src1[j1], Tmp});
3978 Tmp = Mad.getReg(0);
3979 if (!HaveSmallAccum)
3980 CarryOut.push_back(Elt: Mad.getReg(1));
3981 HaveSmallAccum = false;
3982
3983 ++j0;
3984 } while (j0 <= DstIndex);
3985
3986 auto Unmerge = B.buildUnmerge(Res: S32, Op: Tmp);
3987 LocalAccum[0] = Unmerge.getReg(Idx: 0);
3988 if (LocalAccum.size() > 1)
3989 LocalAccum[1] = Unmerge.getReg(Idx: 1);
3990 }
3991
3992 return CarryOut;
3993 };
3994
3995 // Outer multiply loop, iterating over destination parts from least
3996 // significant to most significant parts.
3997 //
3998 // The columns of the following diagram correspond to the destination parts
3999 // affected by one iteration of the outer loop (ignoring boundary
4000 // conditions).
4001 //
4002 //   Dest index relative to 2 * i:      1 0     -1
4003 //                                      ------
4004 //   Carries from previous iteration:       e o
4005 //   Even-aligned partial product sum:    E E .
4006 //   Odd-aligned partial product sum:      O O
4007 //
4008 // 'o' is OddCarry, 'e' is EvenCarry.
4009 // EE and OO are computed from partial products via buildMadChain and use
4010 // accumulation where possible and appropriate.
4011 //
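 // For example, for a 64-bit multiply (Accum.size() == 2) this roughly builds
 //   {Accum[1], Accum[0]} = mad_u64_u32(Src0[0], Src1[0], 0)          // i == 0
 //   Accum[1] += lo32(Src0[0] * Src1[1]) + lo32(Src0[1] * Src1[0])    // i == 1
 // with the exact instruction mix depending on UsePartialMad64_32 and
 // SeparateOddAlignedProducts.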
4012 Register SeparateOddCarry;
4013 Carry EvenCarry;
4014 Carry OddCarry;
4015
4016 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4017 Carry OddCarryIn = std::move(OddCarry);
4018 Carry EvenCarryIn = std::move(EvenCarry);
4019 OddCarry.clear();
4020 EvenCarry.clear();
4021
4022 // Partial products at offset 2 * i.
4023 if (2 * i < Accum.size()) {
4024 auto LocalAccum = Accum.drop_front(N: 2 * i).take_front(N: 2);
4025 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4026 }
4027
4028 // Partial products at offset 2 * i - 1.
4029 if (i > 0) {
4030 if (!SeparateOddAlignedProducts) {
4031 auto LocalAccum = Accum.drop_front(N: 2 * i - 1).take_front(N: 2);
4032 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4033 } else {
4034 bool IsHighest = 2 * i >= Accum.size();
4035 Register SeparateOddOut[2];
4036 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4037 .take_front(N: IsHighest ? 1 : 2);
4038 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4039
4040 MachineInstr *Lo;
4041
4042 if (i == 1) {
4043 if (!IsHighest)
4044 Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0]);
4045 else
4046 Lo = B.buildAdd(Dst: S32, Src0: Accum[2 * i - 1], Src1: SeparateOddOut[0]);
4047 } else {
4048 Lo = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0],
4049 CarryIn: SeparateOddCarry);
4050 }
4051 Accum[2 * i - 1] = Lo->getOperand(i: 0).getReg();
4052
4053 if (!IsHighest) {
4054 auto Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i], Op1: SeparateOddOut[1],
4055 CarryIn: Lo->getOperand(i: 1).getReg());
4056 Accum[2 * i] = Hi.getReg(Idx: 0);
4057 SeparateOddCarry = Hi.getReg(Idx: 1);
4058 }
4059 }
4060 }
4061
4062 // Add in the carries from the previous iteration
4063 if (i > 0) {
4064 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4065 EvenCarryIn.push_back(Elt: CarryOut);
4066
4067 if (2 * i < Accum.size()) {
4068 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4069 OddCarry.push_back(Elt: CarryOut);
4070 }
4071 }
4072 }
4073}
4074
4075// Custom narrowing of wide multiplies using wide multiply-add instructions.
4076//
4077// TODO: If the multiply is followed by an addition, we should attempt to
4078// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4079bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4080 MachineInstr &MI) const {
4081 assert(ST.hasMad64_32());
4082 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4083
4084 MachineIRBuilder &B = Helper.MIRBuilder;
4085 MachineRegisterInfo &MRI = *B.getMRI();
4086
4087 Register DstReg = MI.getOperand(i: 0).getReg();
4088 Register Src0 = MI.getOperand(i: 1).getReg();
4089 Register Src1 = MI.getOperand(i: 2).getReg();
4090
4091 LLT Ty = MRI.getType(Reg: DstReg);
4092 assert(Ty.isScalar());
4093
4094 unsigned Size = Ty.getSizeInBits();
4095 unsigned NumParts = Size / 32;
4096 assert((Size % 32) == 0);
4097 assert(NumParts >= 2);
4098
4099 // Whether to use MAD_64_32 for partial products whose high half is
4100 // discarded. This avoids some ADD instructions but risks false dependency
4101 // stalls on some subtargets in some cases.
4102 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4103
4104 // Whether to compute odd-aligned partial products separately. This is
4105 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4106 // in an even-aligned VGPR.
4107 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4108
4109 LLT S32 = LLT::scalar(SizeInBits: 32);
4110 SmallVector<Register, 2> Src0Parts, Src1Parts;
4111 for (unsigned i = 0; i < NumParts; ++i) {
4112 Src0Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4113 Src1Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4114 }
4115 B.buildUnmerge(Res: Src0Parts, Op: Src0);
4116 B.buildUnmerge(Res: Src1Parts, Op: Src1);
4117
4118 SmallVector<Register, 2> AccumRegs(NumParts);
4119 buildMultiply(Helper, Accum: AccumRegs, Src0: Src0Parts, Src1: Src1Parts, UsePartialMad64_32,
4120 SeparateOddAlignedProducts);
4121
4122 B.buildMergeLikeInstr(Res: DstReg, Ops: AccumRegs);
4123 MI.eraseFromParent();
4124 return true;
4125}
4126
4127// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4128// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4129// case with a single min instruction instead of a compare+select.
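// For example, G_CTLZ of an s32 value x becomes umin(ffbh_u32(x), 32): the
// ffbh/ffbl pseudos return -1 for a zero input, which the unsigned min clamps
// to the bit width.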
4130bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4131 MachineRegisterInfo &MRI,
4132 MachineIRBuilder &B) const {
4133 Register Dst = MI.getOperand(i: 0).getReg();
4134 Register Src = MI.getOperand(i: 1).getReg();
4135 LLT DstTy = MRI.getType(Reg: Dst);
4136 LLT SrcTy = MRI.getType(Reg: Src);
4137
4138 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4139 ? AMDGPU::G_AMDGPU_FFBH_U32
4140 : AMDGPU::G_AMDGPU_FFBL_B32;
4141 auto Tmp = B.buildInstr(Opc: NewOpc, DstOps: {DstTy}, SrcOps: {Src});
4142 B.buildUMin(Dst, Src0: Tmp, Src1: B.buildConstant(Res: DstTy, Val: SrcTy.getSizeInBits()));
4143
4144 MI.eraseFromParent();
4145 return true;
4146}
4147
4148// Check that this is a G_XOR x, -1
4149static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4150 if (MI.getOpcode() != TargetOpcode::G_XOR)
4151 return false;
4152 auto ConstVal = getIConstantVRegSExtVal(VReg: MI.getOperand(i: 2).getReg(), MRI);
4153 return ConstVal && *ConstVal == -1;
4154}
4155
4156 // Return the use branch instruction, or null if the usage is invalid.
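// The expected shape is roughly:
//   %c:_(s1) = G_INTRINSIC ...
//   G_BRCOND %c(s1), %bb.then
//   G_BR %bb.else
// optionally with a single G_XOR %c, -1 between the intrinsic and the
// G_BRCOND, and with the trailing G_BR possibly replaced by a fall-through
// into the next block.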
4157static MachineInstr *
4158verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4159 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4160 Register CondDef = MI.getOperand(i: 0).getReg();
4161 if (!MRI.hasOneNonDBGUse(RegNo: CondDef))
4162 return nullptr;
4163
4164 MachineBasicBlock *Parent = MI.getParent();
4165 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(RegNo: CondDef);
4166
4167 if (isNot(MRI, MI: *UseMI)) {
4168 Register NegatedCond = UseMI->getOperand(i: 0).getReg();
4169 if (!MRI.hasOneNonDBGUse(RegNo: NegatedCond))
4170 return nullptr;
4171
4172 // We're deleting the def of this value, so we need to remove it.
4173 eraseInstr(MI&: *UseMI, MRI);
4174
4175 UseMI = &*MRI.use_instr_nodbg_begin(RegNo: NegatedCond);
4176 Negated = true;
4177 }
4178
4179 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4180 return nullptr;
4181
4182 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4183 MachineBasicBlock::iterator Next = std::next(x: UseMI->getIterator());
4184 if (Next == Parent->end()) {
4185 MachineFunction::iterator NextMBB = std::next(x: Parent->getIterator());
4186 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4187 return nullptr;
4188 UncondBrTarget = &*NextMBB;
4189 } else {
4190 if (Next->getOpcode() != AMDGPU::G_BR)
4191 return nullptr;
4192 Br = &*Next;
4193 UncondBrTarget = Br->getOperand(i: 0).getMBB();
4194 }
4195
4196 return UseMI;
4197}
4198
4199bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4200 const ArgDescriptor *Arg,
4201 const TargetRegisterClass *ArgRC,
4202 LLT ArgTy) const {
4203 MCRegister SrcReg = Arg->getRegister();
4204 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4205 assert(DstReg.isVirtual() && "Virtual register expected");
4206
4207 Register LiveIn = getFunctionLiveInPhysReg(MF&: B.getMF(), TII: B.getTII(), PhysReg: SrcReg,
4208 RC: *ArgRC, DL: B.getDebugLoc(), RegTy: ArgTy);
4209 if (Arg->isMasked()) {
4210 // TODO: Should we try to emit this once in the entry block?
4211 const LLT S32 = LLT::scalar(SizeInBits: 32);
4212 const unsigned Mask = Arg->getMask();
4213 const unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
4214
4215 Register AndMaskSrc = LiveIn;
4216
4217 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4218 // 0.
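 // For example, with the workitem ID Y packed at bits [19:10] of the input
 // register, Mask == 0x000FFC00, Shift == 10, and the value is extracted as
 // (LiveIn >> 10) & 0x3ff.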
4219 if (Shift != 0) {
4220 auto ShiftAmt = B.buildConstant(Res: S32, Val: Shift);
4221 AndMaskSrc = B.buildLShr(Dst: S32, Src0: LiveIn, Src1: ShiftAmt).getReg(Idx: 0);
4222 }
4223
4224 B.buildAnd(Dst: DstReg, Src0: AndMaskSrc, Src1: B.buildConstant(Res: S32, Val: Mask >> Shift));
4225 } else {
4226 B.buildCopy(Res: DstReg, Op: LiveIn);
4227 }
4228
4229 return true;
4230}
4231
4232bool AMDGPULegalizerInfo::loadInputValue(
4233 Register DstReg, MachineIRBuilder &B,
4234 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4235 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4236 const ArgDescriptor *Arg = nullptr;
4237 const TargetRegisterClass *ArgRC;
4238 LLT ArgTy;
4239
4240 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4241 const ArgDescriptor WorkGroupIDX =
4242 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4243 // If GridZ is not programmed in an entry function then the hardware will set
4244 // it to all zeros, so there is no need to mask the GridY value in the low
4245 // order bits.
4246 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4247 AMDGPU::TTMP7,
4248 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4249 const ArgDescriptor WorkGroupIDZ =
4250 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4251 if (ST.hasArchitectedSGPRs() &&
4252 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4253 switch (ArgType) {
4254 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4255 Arg = &WorkGroupIDX;
4256 ArgRC = &AMDGPU::SReg_32RegClass;
4257 ArgTy = LLT::scalar(SizeInBits: 32);
4258 break;
4259 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4260 Arg = &WorkGroupIDY;
4261 ArgRC = &AMDGPU::SReg_32RegClass;
4262 ArgTy = LLT::scalar(SizeInBits: 32);
4263 break;
4264 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4265 Arg = &WorkGroupIDZ;
4266 ArgRC = &AMDGPU::SReg_32RegClass;
4267 ArgTy = LLT::scalar(SizeInBits: 32);
4268 break;
4269 default:
4270 break;
4271 }
4272 }
4273
4274 if (!Arg)
4275 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
4276
4277 if (!Arg) {
4278 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4279 // The intrinsic may appear when we have a zero-sized kernarg segment, in
4280 // which case the pointer argument may be missing and we use null.
4281 B.buildConstant(Res: DstReg, Val: 0);
4282 return true;
4283 }
4284
4285 // It's undefined behavior if a function marked with the amdgpu-no-*
4286 // attributes uses the corresponding intrinsic.
4287 B.buildUndef(Res: DstReg);
4288 return true;
4289 }
4290
4291 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4292 return false; // TODO: Handle these
4293 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4294}
4295
4296bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4297 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4298 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4299 if (!loadInputValue(DstReg: MI.getOperand(i: 0).getReg(), B, ArgType))
4300 return false;
4301
4302 MI.eraseFromParent();
4303 return true;
4304}
4305
4306static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4307 int64_t C) {
4308 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: C);
4309 MI.eraseFromParent();
4310 return true;
4311}
4312
4313bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4314 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4315 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4316 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4317 if (MaxID == 0)
4318 return replaceWithConstant(B, MI, C: 0);
4319
4320 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4321 const ArgDescriptor *Arg;
4322 const TargetRegisterClass *ArgRC;
4323 LLT ArgTy;
4324 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
4325
4326 Register DstReg = MI.getOperand(i: 0).getReg();
4327 if (!Arg) {
4328 // It's undefined behavior if a function marked with the amdgpu-no-*
4329 // attributes uses the corresponding intrinsic.
4330 B.buildUndef(Res: DstReg);
4331 MI.eraseFromParent();
4332 return true;
4333 }
4334
4335 if (Arg->isMasked()) {
4336 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4337 // masking operations anyway.
4338 //
4339 // TODO: We could assert the top bit is 0 for the source copy.
4340 if (!loadInputValue(DstReg, B, ArgType))
4341 return false;
4342 } else {
4343 Register TmpReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
4344 if (!loadInputValue(DstReg: TmpReg, B, ArgType))
4345 return false;
4346 B.buildAssertZExt(Res: DstReg, Op: TmpReg, Size: llvm::bit_width(Value: MaxID));
4347 }
4348
4349 MI.eraseFromParent();
4350 return true;
4351}
4352
4353Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4354 int64_t Offset) const {
4355 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
4356 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);
4357
4358 // TODO: If we passed in the base kernel offset we could have a better
4359 // alignment than 4, but we don't really need it.
4360 if (!loadInputValue(DstReg: KernArgReg, B,
4361 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4362 llvm_unreachable("failed to find kernarg segment ptr");
4363
4364 auto COffset = B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset);
4365 // TODO: Should get nuw
4366 return B.buildPtrAdd(Res: PtrTy, Op0: KernArgReg, Op1: COffset).getReg(Idx: 0);
4367}
4368
4369/// Legalize a value that's loaded from kernel arguments. This is only used by
4370/// legacy intrinsics.
4371bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4372 MachineIRBuilder &B,
4373 uint64_t Offset,
4374 Align Alignment) const {
4375 Register DstReg = MI.getOperand(i: 0).getReg();
4376
4377 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4378 "unexpected kernarg parameter type");
4379
4380 Register Ptr = getKernargParameterPtr(B, Offset);
4381 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4382 B.buildLoad(Res: DstReg, Addr: Ptr, PtrInfo, Alignment: Align(4),
4383 MMOFlags: MachineMemOperand::MODereferenceable |
4384 MachineMemOperand::MOInvariant);
4385 MI.eraseFromParent();
4386 return true;
4387}
4388
4389bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4390 MachineRegisterInfo &MRI,
4391 MachineIRBuilder &B) const {
4392 Register Dst = MI.getOperand(i: 0).getReg();
4393 LLT DstTy = MRI.getType(Reg: Dst);
4394 LLT S16 = LLT::scalar(SizeInBits: 16);
4395 LLT S32 = LLT::scalar(SizeInBits: 32);
4396 LLT S64 = LLT::scalar(SizeInBits: 64);
4397
4398 if (DstTy == S16)
4399 return legalizeFDIV16(MI, MRI, B);
4400 if (DstTy == S32)
4401 return legalizeFDIV32(MI, MRI, B);
4402 if (DstTy == S64)
4403 return legalizeFDIV64(MI, MRI, B);
4404
4405 return false;
4406}
4407
4408void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4409 Register DstDivReg,
4410 Register DstRemReg,
4411 Register X,
4412 Register Y) const {
4413 const LLT S1 = LLT::scalar(SizeInBits: 1);
4414 const LLT S32 = LLT::scalar(SizeInBits: 32);
4415
4416 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4417 // algorithm used here.
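  //
  // As an illustrative scalar sketch only (umulh() here stands for the high
  // 32 bits of the 64-bit product and is not a real helper in this file):
  //
  //   float rcp  = 1.0f / (float)y;          // V_RCP_IFLAG_F32 estimate
  //   uint32_t z = (uint32_t)(rcp * 0x1.fffffcp31f); // 0x4f7ffffe, ~2**32
  //   z += umulh(z, 0u - y * z);             // one Newton-Raphson step
  //   uint32_t q = umulh(x, z);              // quotient estimate
  //   uint32_t r = x - q * y;                // remainder estimate
  //   if (r >= y) { ++q; r -= y; }           // first refinement
  //   if (r >= y) { ++q; r -= y; }           // second refinement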
4418
4419 // Initial estimate of inv(y).
4420 auto FloatY = B.buildUITOFP(Dst: S32, Src0: Y);
4421 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4422 auto Scale = B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f7ffffe));
4423 auto ScaledY = B.buildFMul(Dst: S32, Src0: RcpIFlag, Src1: Scale);
4424 auto Z = B.buildFPTOUI(Dst: S32, Src0: ScaledY);
4425
4426 // One round of UNR.
4427 auto NegY = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 0), Src1: Y);
4428 auto NegYZ = B.buildMul(Dst: S32, Src0: NegY, Src1: Z);
4429 Z = B.buildAdd(Dst: S32, Src0: Z, Src1: B.buildUMulH(Dst: S32, Src0: Z, Src1: NegYZ));
4430
4431 // Quotient/remainder estimate.
4432 auto Q = B.buildUMulH(Dst: S32, Src0: X, Src1: Z);
4433 auto R = B.buildSub(Dst: S32, Src0: X, Src1: B.buildMul(Dst: S32, Src0: Q, Src1: Y));
4434
4435 // First quotient/remainder refinement.
4436 auto One = B.buildConstant(Res: S32, Val: 1);
4437 auto Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
4438 if (DstDivReg)
4439 Q = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
4440 R = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
4441
4442 // Second quotient/remainder refinement.
4443 Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
4444 if (DstDivReg)
4445 B.buildSelect(Res: DstDivReg, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
4446
4447 if (DstRemReg)
4448 B.buildSelect(Res: DstRemReg, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
4449}
4450
4451// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4452//
4453// Return lo, hi of result
4454//
4455// %cvt.lo = G_UITOFP Val.lo
4456// %cvt.hi = G_UITOFP Val.hi
4457// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4458// %rcp = G_AMDGPU_RCP_IFLAG %mad
4459// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4460// %mul2 = G_FMUL %mul1, 2**(-32)
4461// %trunc = G_INTRINSIC_TRUNC %mul2
4462// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4463// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
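//
// The returned pair is, in effect, a fixed-point approximation of
// 2**64 / Val, split into its low and high 32-bit halves.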
4464static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4465 Register Val) {
4466 const LLT S32 = LLT::scalar(SizeInBits: 32);
4467 auto Unmerge = B.buildUnmerge(Res: S32, Op: Val);
4468
4469 auto CvtLo = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 0));
4470 auto CvtHi = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
4471
4472 auto Mad = B.buildFMAD(
4473 Dst: S32, Src0: CvtHi, // 2**32
4474 Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f800000)), Src2: CvtLo);
4475
4476 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4477 auto Mul1 = B.buildFMul(
4478 Dst: S32, Src0: Rcp, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x5f7ffffc)));
4479
4480 // 2**(-32)
4481 auto Mul2 = B.buildFMul(
4482 Dst: S32, Src0: Mul1, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x2f800000)));
4483 auto Trunc = B.buildIntrinsicTrunc(Dst: S32, Src0: Mul2);
4484
4485 // -(2**32)
4486 auto Mad2 = B.buildFMAD(
4487 Dst: S32, Src0: Trunc, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0xcf800000)),
4488 Src2: Mul1);
4489
4490 auto ResultLo = B.buildFPTOUI(Dst: S32, Src0: Mad2);
4491 auto ResultHi = B.buildFPTOUI(Dst: S32, Src0: Trunc);
4492
4493 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4494}
4495
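// 64-bit unsigned division/remainder expansion. This mirrors the 32-bit
// expansion above: start from the fixed-point reciprocal produced by
// emitReciprocalU64, apply two Newton-Raphson style refinements using 64-bit
// multiply-high, form a quotient/remainder estimate, and then conditionally
// apply up to two +1 / -Denom corrections.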
4496void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4497 Register DstDivReg,
4498 Register DstRemReg,
4499 Register Numer,
4500 Register Denom) const {
4501 const LLT S32 = LLT::scalar(SizeInBits: 32);
4502 const LLT S64 = LLT::scalar(SizeInBits: 64);
4503 const LLT S1 = LLT::scalar(SizeInBits: 1);
4504 Register RcpLo, RcpHi;
4505
4506 std::tie(args&: RcpLo, args&: RcpHi) = emitReciprocalU64(B, Val: Denom);
4507
4508 auto Rcp = B.buildMergeLikeInstr(Res: S64, Ops: {RcpLo, RcpHi});
4509
4510 auto Zero64 = B.buildConstant(Res: S64, Val: 0);
4511 auto NegDenom = B.buildSub(Dst: S64, Src0: Zero64, Src1: Denom);
4512
4513 auto MulLo1 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Rcp);
4514 auto MulHi1 = B.buildUMulH(Dst: S64, Src0: Rcp, Src1: MulLo1);
4515
4516 auto UnmergeMulHi1 = B.buildUnmerge(Res: S32, Op: MulHi1);
4517 Register MulHi1_Lo = UnmergeMulHi1.getReg(Idx: 0);
4518 Register MulHi1_Hi = UnmergeMulHi1.getReg(Idx: 1);
4519
4520 auto Add1_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: RcpLo, Op1: MulHi1_Lo);
4521 auto Add1_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: RcpHi, Op1: MulHi1_Hi, CarryIn: Add1_Lo.getReg(Idx: 1));
4522 auto Add1 = B.buildMergeLikeInstr(Res: S64, Ops: {Add1_Lo, Add1_Hi});
4523
4524 auto MulLo2 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Add1);
4525 auto MulHi2 = B.buildUMulH(Dst: S64, Src0: Add1, Src1: MulLo2);
4526 auto UnmergeMulHi2 = B.buildUnmerge(Res: S32, Op: MulHi2);
4527 Register MulHi2_Lo = UnmergeMulHi2.getReg(Idx: 0);
4528 Register MulHi2_Hi = UnmergeMulHi2.getReg(Idx: 1);
4529
4530 auto Zero32 = B.buildConstant(Res: S32, Val: 0);
4531 auto Add2_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Add1_Lo, Op1: MulHi2_Lo);
4532 auto Add2_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Add1_Hi, Op1: MulHi2_Hi, CarryIn: Add2_Lo.getReg(Idx: 1));
4533 auto Add2 = B.buildMergeLikeInstr(Res: S64, Ops: {Add2_Lo, Add2_Hi});
4534
4535 auto UnmergeNumer = B.buildUnmerge(Res: S32, Op: Numer);
4536 Register NumerLo = UnmergeNumer.getReg(Idx: 0);
4537 Register NumerHi = UnmergeNumer.getReg(Idx: 1);
4538
4539 auto MulHi3 = B.buildUMulH(Dst: S64, Src0: Numer, Src1: Add2);
4540 auto Mul3 = B.buildMul(Dst: S64, Src0: Denom, Src1: MulHi3);
4541 auto UnmergeMul3 = B.buildUnmerge(Res: S32, Op: Mul3);
4542 Register Mul3_Lo = UnmergeMul3.getReg(Idx: 0);
4543 Register Mul3_Hi = UnmergeMul3.getReg(Idx: 1);
4544 auto Sub1_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: NumerLo, Op1: Mul3_Lo);
4545 auto Sub1_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: NumerHi, Op1: Mul3_Hi, CarryIn: Sub1_Lo.getReg(Idx: 1));
4546 auto Sub1_Mi = B.buildSub(Dst: S32, Src0: NumerHi, Src1: Mul3_Hi);
4547 auto Sub1 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub1_Lo, Sub1_Hi});
4548
4549 auto UnmergeDenom = B.buildUnmerge(Res: S32, Op: Denom);
4550 Register DenomLo = UnmergeDenom.getReg(Idx: 0);
4551 Register DenomHi = UnmergeDenom.getReg(Idx: 1);
4552
4553 auto CmpHi = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
4554 auto C1 = B.buildSExt(Res: S32, Op: CmpHi);
4555
4556 auto CmpLo = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Lo, Op1: DenomLo);
4557 auto C2 = B.buildSExt(Res: S32, Op: CmpLo);
4558
4559 auto CmpEq = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
4560 auto C3 = B.buildSelect(Res: S32, Tst: CmpEq, Op0: C2, Op1: C1);
4561
4562 // TODO: Here and below, portions of the code could be enclosed in if/endif
4563 // blocks. Currently the control flow is unconditional and we have 4 selects
4564 // after the potential endif to substitute for PHIs.
4565
4566 // if C3 != 0 ...
4567 auto Sub2_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub1_Lo, Op1: DenomLo);
4568 auto Sub2_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub1_Mi, Op1: DenomHi, CarryIn: Sub1_Lo.getReg(Idx: 1));
4569 auto Sub2_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: Zero32, CarryIn: Sub2_Lo.getReg(Idx: 1));
4570 auto Sub2 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub2_Lo, Sub2_Hi});
4571
4572 auto One64 = B.buildConstant(Res: S64, Val: 1);
4573 auto Add3 = B.buildAdd(Dst: S64, Src0: MulHi3, Src1: One64);
4574
4575 auto C4 =
4576 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Hi, Op1: DenomHi));
4577 auto C5 =
4578 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Lo, Op1: DenomLo));
4579 auto C6 = B.buildSelect(
4580 Res: S32, Tst: B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub2_Hi, Op1: DenomHi), Op0: C5, Op1: C4);
4581
4582 // if (C6 != 0)
4583 auto Add4 = B.buildAdd(Dst: S64, Src0: Add3, Src1: One64);
4584 auto Sub3_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub2_Lo, Op1: DenomLo);
4585
4586 auto Sub3_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: DenomHi, CarryIn: Sub2_Lo.getReg(Idx: 1));
4587 auto Sub3_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub3_Mi, Op1: Zero32, CarryIn: Sub3_Lo.getReg(Idx: 1));
4588 auto Sub3 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub3_Lo, Sub3_Hi});
4589
4590 // endif C6
4591 // endif C3
4592
4593 if (DstDivReg) {
4594 auto Sel1 = B.buildSelect(
4595 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Add4, Op1: Add3);
4596 B.buildSelect(Res: DstDivReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
4597 Op0: Sel1, Op1: MulHi3);
4598 }
4599
4600 if (DstRemReg) {
4601 auto Sel2 = B.buildSelect(
4602 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Sub3, Op1: Sub2);
4603 B.buildSelect(Res: DstRemReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
4604 Op0: Sel2, Op1: Sub1);
4605 }
4606}
4607
4608bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4609 MachineRegisterInfo &MRI,
4610 MachineIRBuilder &B) const {
4611 Register DstDivReg, DstRemReg;
4612 switch (MI.getOpcode()) {
4613 default:
4614 llvm_unreachable("Unexpected opcode!");
4615 case AMDGPU::G_UDIV: {
4616 DstDivReg = MI.getOperand(i: 0).getReg();
4617 break;
4618 }
4619 case AMDGPU::G_UREM: {
4620 DstRemReg = MI.getOperand(i: 0).getReg();
4621 break;
4622 }
4623 case AMDGPU::G_UDIVREM: {
4624 DstDivReg = MI.getOperand(i: 0).getReg();
4625 DstRemReg = MI.getOperand(i: 1).getReg();
4626 break;
4627 }
4628 }
4629
4630 const LLT S64 = LLT::scalar(SizeInBits: 64);
4631 const LLT S32 = LLT::scalar(SizeInBits: 32);
4632 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4633 Register Num = MI.getOperand(i: FirstSrcOpIdx).getReg();
4634 Register Den = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
4635 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4636
4637 if (Ty == S32)
4638 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, X: Num, Y: Den);
4639 else if (Ty == S64)
4640 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Numer: Num, Denom: Den);
4641 else
4642 return false;
4643
4644 MI.eraseFromParent();
4645 return true;
4646}
4647
4648bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4649 MachineRegisterInfo &MRI,
4650 MachineIRBuilder &B) const {
4651 const LLT S64 = LLT::scalar(SizeInBits: 64);
4652 const LLT S32 = LLT::scalar(SizeInBits: 32);
4653
4654 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4655 if (Ty != S32 && Ty != S64)
4656 return false;
4657
4658 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4659 Register LHS = MI.getOperand(i: FirstSrcOpIdx).getReg();
4660 Register RHS = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
4661
4662 auto SignBitOffset = B.buildConstant(Res: S32, Val: Ty.getSizeInBits() - 1);
4663 auto LHSign = B.buildAShr(Dst: Ty, Src0: LHS, Src1: SignBitOffset);
4664 auto RHSign = B.buildAShr(Dst: Ty, Src0: RHS, Src1: SignBitOffset);
4665
4666 LHS = B.buildAdd(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
4667 RHS = B.buildAdd(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
4668
4669 LHS = B.buildXor(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
4670 RHS = B.buildXor(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
4671
4672 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4673 switch (MI.getOpcode()) {
4674 default:
4675 llvm_unreachable("Unexpected opcode!");
4676 case AMDGPU::G_SDIV: {
4677 DstDivReg = MI.getOperand(i: 0).getReg();
4678 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4679 break;
4680 }
4681 case AMDGPU::G_SREM: {
4682 DstRemReg = MI.getOperand(i: 0).getReg();
4683 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4684 break;
4685 }
4686 case AMDGPU::G_SDIVREM: {
4687 DstDivReg = MI.getOperand(i: 0).getReg();
4688 DstRemReg = MI.getOperand(i: 1).getReg();
4689 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4690 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4691 break;
4692 }
4693 }
4694
4695 if (Ty == S32)
4696 legalizeUnsignedDIV_REM32Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, X: LHS, Y: RHS);
4697 else
4698 legalizeUnsignedDIV_REM64Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, Numer: LHS, Denom: RHS);
4699
4700 if (DstDivReg) {
4701 auto Sign = B.buildXor(Dst: Ty, Src0: LHSign, Src1: RHSign).getReg(Idx: 0);
4702 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpDivReg, Src1: Sign).getReg(Idx: 0);
4703 B.buildSub(Dst: DstDivReg, Src0: SignXor, Src1: Sign);
4704 }
4705
4706 if (DstRemReg) {
4707 auto Sign = LHSign.getReg(Idx: 0); // Remainder sign is the same as LHS
4708 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpRemReg, Src1: Sign).getReg(Idx: 0);
4709 B.buildSub(Dst: DstRemReg, Src0: SignXor, Src1: Sign);
4710 }
4711
4712 MI.eraseFromParent();
4713 return true;
4714}
4715
4716bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4717 MachineRegisterInfo &MRI,
4718 MachineIRBuilder &B) const {
4719 Register Res = MI.getOperand(i: 0).getReg();
4720 Register LHS = MI.getOperand(i: 1).getReg();
4721 Register RHS = MI.getOperand(i: 2).getReg();
4722 uint16_t Flags = MI.getFlags();
4723 LLT ResTy = MRI.getType(Reg: Res);
4724
4725 const MachineFunction &MF = B.getMF();
4726 bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn) ||
4727 MF.getTarget().Options.UnsafeFPMath;
4728
4729 if (auto CLHS = getConstantFPVRegVal(VReg: LHS, MRI)) {
4730 if (!AllowInaccurateRcp && ResTy != LLT::scalar(SizeInBits: 16))
4731 return false;
4732
4733 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4734 // the CI documentation they have a worst-case error of 1 ulp.
4735 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4736 // use them as long as we aren't trying to use denormals.
4737 //
4738 // v_rcp_f16 and v_rsq_f16 DO support denormals, with a 0.51 ulp error.
4739
4740 // 1 / x -> RCP(x)
4741 if (CLHS->isExactlyValue(V: 1.0)) {
4742 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4743 .addUse(RHS)
4744 .setMIFlags(Flags);
4745
4746 MI.eraseFromParent();
4747 return true;
4748 }
4749
4750 // -1 / x -> RCP( FNEG(x) )
4751 if (CLHS->isExactlyValue(V: -1.0)) {
4752 auto FNeg = B.buildFNeg(Dst: ResTy, Src0: RHS, Flags);
4753 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4754 .addUse(FNeg.getReg(0))
4755 .setMIFlags(Flags);
4756
4757 MI.eraseFromParent();
4758 return true;
4759 }
4760 }
4761
4762 // For f16 require afn or arcp.
4763 // For f32 require afn.
4764 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(SizeInBits: 16) ||
4765 !MI.getFlag(Flag: MachineInstr::FmArcp)))
4766 return false;
4767
4768 // x / y -> x * (1.0 / y)
4769 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4770 .addUse(RHS)
4771 .setMIFlags(Flags);
4772 B.buildFMul(Dst: Res, Src0: LHS, Src1: RCP, Flags);
4773
4774 MI.eraseFromParent();
4775 return true;
4776}
4777
4778bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4779 MachineRegisterInfo &MRI,
4780 MachineIRBuilder &B) const {
4781 Register Res = MI.getOperand(i: 0).getReg();
4782 Register X = MI.getOperand(i: 1).getReg();
4783 Register Y = MI.getOperand(i: 2).getReg();
4784 uint16_t Flags = MI.getFlags();
4785 LLT ResTy = MRI.getType(Reg: Res);
4786
4787 const MachineFunction &MF = B.getMF();
4788 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4789 MI.getFlag(Flag: MachineInstr::FmAfn);
4790
4791 if (!AllowInaccurateRcp)
4792 return false;
4793
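  // x / y is expanded as r = rcp(y) refined by two FMA-based Newton-Raphson
  // steps (r += r * (1 - y * r)), followed by ret = x * r and one final
  // correction ret += r * (x - y * ret).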
4794 auto NegY = B.buildFNeg(Dst: ResTy, Src0: Y);
4795 auto One = B.buildFConstant(Res: ResTy, Val: 1.0);
4796
4797 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4798 .addUse(Y)
4799 .setMIFlags(Flags);
4800
4801 auto Tmp0 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
4802 R = B.buildFMA(Dst: ResTy, Src0: Tmp0, Src1: R, Src2: R);
4803
4804 auto Tmp1 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
4805 R = B.buildFMA(Dst: ResTy, Src0: Tmp1, Src1: R, Src2: R);
4806
4807 auto Ret = B.buildFMul(Dst: ResTy, Src0: X, Src1: R);
4808 auto Tmp2 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: Ret, Src2: X);
4809
4810 B.buildFMA(Dst: Res, Src0: Tmp2, Src1: R, Src2: Ret);
4811 MI.eraseFromParent();
4812 return true;
4813}
4814
4815bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4816 MachineRegisterInfo &MRI,
4817 MachineIRBuilder &B) const {
4818 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4819 return true;
4820
4821 Register Res = MI.getOperand(i: 0).getReg();
4822 Register LHS = MI.getOperand(i: 1).getReg();
4823 Register RHS = MI.getOperand(i: 2).getReg();
4824
4825 uint16_t Flags = MI.getFlags();
4826
4827 LLT S16 = LLT::scalar(SizeInBits: 16);
4828 LLT S32 = LLT::scalar(SizeInBits: 32);
4829
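  // Expand at f32 precision: take the rcp of the extended RHS, multiply by
  // the extended LHS, truncate back to f16, and let amdgcn.div.fixup handle
  // the final rounding and special cases.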
4830 auto LHSExt = B.buildFPExt(Res: S32, Op: LHS, Flags);
4831 auto RHSExt = B.buildFPExt(Res: S32, Op: RHS, Flags);
4832
4833 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4834 .addUse(RHSExt.getReg(0))
4835 .setMIFlags(Flags);
4836
4837 auto QUOT = B.buildFMul(Dst: S32, Src0: LHSExt, Src1: RCP, Flags);
4838 auto RDst = B.buildFPTrunc(Res: S16, Op: QUOT, Flags);
4839
4840 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4841 .addUse(RDst.getReg(0))
4842 .addUse(RHS)
4843 .addUse(LHS)
4844 .setMIFlags(Flags);
4845
4846 MI.eraseFromParent();
4847 return true;
4848}
4849
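// Hardware register field for the FP32 denormal mode: bits [5:4] of the MODE
// register (the next two bits, [7:6], hold the FP64/FP16 denormal mode).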
4850static constexpr unsigned SPDenormModeBitField =
4851 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
4852
4853// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4854// to enable denorm mode; otherwise restore the function's default FP32 mode.
4855static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4856 const GCNSubtarget &ST,
4857 SIModeRegisterDefaults Mode) {
4858 // Set SP denorm mode to this value.
4859 unsigned SPDenormMode =
4860 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4861
4862 if (ST.hasDenormModeInst()) {
4863 // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
4864 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4865
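    // S_DENORM_MODE takes a 4-bit immediate: bits [1:0] select the FP32
    // denorm mode and bits [3:2] the FP64/FP16 mode, hence the shift by 2.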
4866 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4867 B.buildInstr(AMDGPU::S_DENORM_MODE)
4868 .addImm(NewDenormModeValue);
4869
4870 } else {
4871 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4872 .addImm(SPDenormMode)
4873 .addImm(SPDenormModeBitField);
4874 }
4875}
4876
4877bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4878 MachineRegisterInfo &MRI,
4879 MachineIRBuilder &B) const {
4880 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4881 return true;
4882
4883 Register Res = MI.getOperand(i: 0).getReg();
4884 Register LHS = MI.getOperand(i: 1).getReg();
4885 Register RHS = MI.getOperand(i: 2).getReg();
4886 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4887 SIModeRegisterDefaults Mode = MFI->getMode();
4888
4889 uint16_t Flags = MI.getFlags();
4890
4891 LLT S32 = LLT::scalar(SizeInBits: 32);
4892 LLT S1 = LLT::scalar(SizeInBits: 1);
4893
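  // Full-precision expansion: scale the operands with amdgcn.div.scale,
  // refine an rcp estimate with FMA-based iterations (temporarily enabling
  // FP32 denormals if the mode doesn't already preserve them), then combine
  // the pieces with amdgcn.div.fmas and amdgcn.div.fixup.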
4894 auto One = B.buildFConstant(Res: S32, Val: 1.0f);
4895
4896 auto DenominatorScaled =
4897 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4898 .addUse(LHS)
4899 .addUse(RHS)
4900 .addImm(0)
4901 .setMIFlags(Flags);
4902 auto NumeratorScaled =
4903 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4904 .addUse(LHS)
4905 .addUse(RHS)
4906 .addImm(1)
4907 .setMIFlags(Flags);
4908
4909 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4910 .addUse(DenominatorScaled.getReg(0))
4911 .setMIFlags(Flags);
4912 auto NegDivScale0 = B.buildFNeg(Dst: S32, Src0: DenominatorScaled, Flags);
4913
4914 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4915 const bool HasDynamicDenormals =
4916 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4917 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4918
4919 Register SavedSPDenormMode;
4920 if (!PreservesDenormals) {
4921 if (HasDynamicDenormals) {
4922 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4923 B.buildInstr(AMDGPU::S_GETREG_B32)
4924 .addDef(SavedSPDenormMode)
4925 .addImm(SPDenormModeBitField);
4926 }
4927 toggleSPDenormMode(Enable: true, B, ST, Mode);
4928 }
4929
4930 auto Fma0 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: ApproxRcp, Src2: One, Flags);
4931 auto Fma1 = B.buildFMA(Dst: S32, Src0: Fma0, Src1: ApproxRcp, Src2: ApproxRcp, Flags);
4932 auto Mul = B.buildFMul(Dst: S32, Src0: NumeratorScaled, Src1: Fma1, Flags);
4933 auto Fma2 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Mul, Src2: NumeratorScaled, Flags);
4934 auto Fma3 = B.buildFMA(Dst: S32, Src0: Fma2, Src1: Fma1, Src2: Mul, Flags);
4935 auto Fma4 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Fma3, Src2: NumeratorScaled, Flags);
4936
4937 if (!PreservesDenormals) {
4938 if (HasDynamicDenormals) {
4939 assert(SavedSPDenormMode);
4940 B.buildInstr(AMDGPU::S_SETREG_B32)
4941 .addReg(SavedSPDenormMode)
4942 .addImm(SPDenormModeBitField);
4943 } else
4944 toggleSPDenormMode(Enable: false, B, ST, Mode);
4945 }
4946
4947 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
4948 .addUse(Fma4.getReg(0))
4949 .addUse(Fma1.getReg(0))
4950 .addUse(Fma3.getReg(0))
4951 .addUse(NumeratorScaled.getReg(1))
4952 .setMIFlags(Flags);
4953
4954 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4955 .addUse(Fmas.getReg(0))
4956 .addUse(RHS)
4957 .addUse(LHS)
4958 .setMIFlags(Flags);
4959
4960 MI.eraseFromParent();
4961 return true;
4962}
4963
4964bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
4965 MachineRegisterInfo &MRI,
4966 MachineIRBuilder &B) const {
4967 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
4968 return true;
4969
4970 Register Res = MI.getOperand(i: 0).getReg();
4971 Register LHS = MI.getOperand(i: 1).getReg();
4972 Register RHS = MI.getOperand(i: 2).getReg();
4973
4974 uint16_t Flags = MI.getFlags();
4975
4976 LLT S64 = LLT::scalar(SizeInBits: 64);
4977 LLT S1 = LLT::scalar(SizeInBits: 1);
4978
4979 auto One = B.buildFConstant(Res: S64, Val: 1.0);
4980
4981 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4982 .addUse(LHS)
4983 .addUse(RHS)
4984 .addImm(0)
4985 .setMIFlags(Flags);
4986
4987 auto NegDivScale0 = B.buildFNeg(Dst: S64, Src0: DivScale0.getReg(0), Flags);
4988
4989 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
4990 .addUse(DivScale0.getReg(0))
4991 .setMIFlags(Flags);
4992
4993 auto Fma0 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Rcp, Src2: One, Flags);
4994 auto Fma1 = B.buildFMA(Dst: S64, Src0: Rcp, Src1: Fma0, Src2: Rcp, Flags);
4995 auto Fma2 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Fma1, Src2: One, Flags);
4996
4997 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4998 .addUse(LHS)
4999 .addUse(RHS)
5000 .addImm(1)
5001 .setMIFlags(Flags);
5002
5003 auto Fma3 = B.buildFMA(Dst: S64, Src0: Fma1, Src1: Fma2, Src2: Fma1, Flags);
5004 auto Mul = B.buildFMul(Dst: S64, Src0: DivScale1.getReg(0), Src1: Fma3, Flags);
5005 auto Fma4 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Mul, Src2: DivScale1.getReg(0), Flags);
5006
5007 Register Scale;
5008 if (!ST.hasUsableDivScaleConditionOutput()) {
5009 // Workaround a hardware bug on SI where the condition output from div_scale
5010 // is not usable.
5011
5012 LLT S32 = LLT::scalar(SizeInBits: 32);
5013
5014 auto NumUnmerge = B.buildUnmerge(Res: S32, Op: LHS);
5015 auto DenUnmerge = B.buildUnmerge(Res: S32, Op: RHS);
5016 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5017 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5018
5019 auto CmpNum = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: NumUnmerge.getReg(Idx: 1),
5020 Op1: Scale1Unmerge.getReg(1));
5021 auto CmpDen = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: DenUnmerge.getReg(Idx: 1),
5022 Op1: Scale0Unmerge.getReg(1));
5023 Scale = B.buildXor(Dst: S1, Src0: CmpNum, Src1: CmpDen).getReg(0);
5024 } else {
5025 Scale = DivScale1.getReg(1);
5026 }
5027
5028 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5029 .addUse(Fma4.getReg(0))
5030 .addUse(Fma3.getReg(0))
5031 .addUse(Mul.getReg(0))
5032 .addUse(Scale)
5033 .setMIFlags(Flags);
5034
5035 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5036 .addUse(Fmas.getReg(0))
5037 .addUse(RHS)
5038 .addUse(LHS)
5039 .setMIFlags(Flags);
5040
5041 MI.eraseFromParent();
5042 return true;
5043}
5044
5045bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5046 MachineRegisterInfo &MRI,
5047 MachineIRBuilder &B) const {
5048 Register Res0 = MI.getOperand(i: 0).getReg();
5049 Register Res1 = MI.getOperand(i: 1).getReg();
5050 Register Val = MI.getOperand(i: 2).getReg();
5051 uint16_t Flags = MI.getFlags();
5052
5053 LLT Ty = MRI.getType(Reg: Res0);
5054 LLT InstrExpTy = Ty == LLT::scalar(SizeInBits: 16) ? LLT::scalar(SizeInBits: 16) : LLT::scalar(SizeInBits: 32);
5055
5056 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5057 .addUse(Val)
5058 .setMIFlags(Flags);
5059 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5060 .addUse(Val)
5061 .setMIFlags(Flags);
5062
5063 if (ST.hasFractBug()) {
5064 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Val);
5065 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: getFltSemanticForLLT(Ty)));
5066 auto IsFinite =
5067 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
5068 auto Zero = B.buildConstant(Res: InstrExpTy, Val: 0);
5069 Exp = B.buildSelect(Res: InstrExpTy, Tst: IsFinite, Op0: Exp, Op1: Zero);
5070 Mant = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: Mant, Op1: Val);
5071 }
5072
5073 B.buildCopy(Res: Res0, Op: Mant);
5074 B.buildSExtOrTrunc(Res: Res1, Op: Exp);
5075
5076 MI.eraseFromParent();
5077 return true;
5078}
5079
5080bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5081 MachineRegisterInfo &MRI,
5082 MachineIRBuilder &B) const {
5083 Register Res = MI.getOperand(i: 0).getReg();
5084 Register LHS = MI.getOperand(i: 2).getReg();
5085 Register RHS = MI.getOperand(i: 3).getReg();
5086 uint16_t Flags = MI.getFlags();
5087
5088 LLT S32 = LLT::scalar(SizeInBits: 32);
5089 LLT S1 = LLT::scalar(SizeInBits: 1);
5090
5091 auto Abs = B.buildFAbs(Dst: S32, Src0: RHS, Flags);
5092 const APFloat C0Val(1.0f);
5093
5094 auto C0 = B.buildFConstant(Res: S32, Val: 0x1p+96f);
5095 auto C1 = B.buildFConstant(Res: S32, Val: 0x1p-32f);
5096 auto C2 = B.buildFConstant(Res: S32, Val: 1.0f);
5097
5098 auto CmpRes = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Abs, Op1: C0, Flags);
5099 auto Sel = B.buildSelect(Res: S32, Tst: CmpRes, Op0: C1, Op1: C2, Flags);
5100
5101 auto Mul0 = B.buildFMul(Dst: S32, Src0: RHS, Src1: Sel, Flags);
5102
5103 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5104 .addUse(Mul0.getReg(0))
5105 .setMIFlags(Flags);
5106
5107 auto Mul1 = B.buildFMul(Dst: S32, Src0: LHS, Src1: RCP, Flags);
5108
5109 B.buildFMul(Dst: Res, Src0: Sel, Src1: Mul1, Flags);
5110
5111 MI.eraseFromParent();
5112 return true;
5113}
5114
5115bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5116 MachineRegisterInfo &MRI,
5117 MachineIRBuilder &B) const {
5118 // Bypass the correct expansion that a standard promotion through G_FSQRT
5119 // would get. The f32 op is accurate enough for the f16 case.
5120 unsigned Flags = MI.getFlags();
5121 assert(!ST.has16BitInsts());
5122 const LLT F32 = LLT::scalar(SizeInBits: 32);
5123 auto Ext = B.buildFPExt(Res: F32, Op: MI.getOperand(i: 1), Flags);
5124 auto Sqrt = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5125 .addUse(Ext.getReg(0))
5126 .setMIFlags(Flags);
5127 B.buildFPTrunc(Res: MI.getOperand(i: 0), Op: Sqrt, Flags);
5128 MI.eraseFromParent();
5129 return true;
5130}
5131
5132bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5133 MachineRegisterInfo &MRI,
5134 MachineIRBuilder &B) const {
5135 MachineFunction &MF = B.getMF();
5136 Register Dst = MI.getOperand(i: 0).getReg();
5137 Register X = MI.getOperand(i: 1).getReg();
5138 const unsigned Flags = MI.getFlags();
5139 const LLT S1 = LLT::scalar(SizeInBits: 1);
5140 const LLT F32 = LLT::scalar(SizeInBits: 32);
5141 const LLT I32 = LLT::scalar(SizeInBits: 32);
5142
5143 if (allowApproxFunc(MF, Flags)) {
5144 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5145 .addUse(X)
5146 .setMIFlags(Flags);
5147 MI.eraseFromParent();
5148 return true;
5149 }
5150
5151 auto ScaleThreshold = B.buildFConstant(Res: F32, Val: 0x1.0p-96f);
5152 auto NeedScale = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: ScaleThreshold, Op1: X, Flags);
5153 auto ScaleUpFactor = B.buildFConstant(Res: F32, Val: 0x1.0p+32f);
5154 auto ScaledX = B.buildFMul(Dst: F32, Src0: X, Src1: ScaleUpFactor, Flags);
5155 auto SqrtX = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledX, Op1: X, Flags);
5156
5157 Register SqrtS = MRI.createGenericVirtualRegister(Ty: F32);
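  // When FP32 denormals must be handled, refine the hardware sqrt result by
  // forming the one-ulp-down and one-ulp-up neighbors (integer +/-1 on the
  // bit pattern), checking their residuals against SqrtX, and selecting the
  // best candidate; otherwise use an rsq-based Newton-Raphson refinement.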
5158 if (needsDenormHandlingF32(MF, Src: X, Flags)) {
5159 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5160 .addUse(SqrtX.getReg(0))
5161 .setMIFlags(Flags);
5162
5163 auto NegOne = B.buildConstant(Res: I32, Val: -1);
5164 auto SqrtSNextDown = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: NegOne);
5165
5166 auto NegSqrtSNextDown = B.buildFNeg(Dst: F32, Src0: SqrtSNextDown, Flags);
5167 auto SqrtVP = B.buildFMA(Dst: F32, Src0: NegSqrtSNextDown, Src1: SqrtS, Src2: SqrtX, Flags);
5168
5169 auto PosOne = B.buildConstant(Res: I32, Val: 1);
5170 auto SqrtSNextUp = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: PosOne);
5171
5172 auto NegSqrtSNextUp = B.buildFNeg(Dst: F32, Src0: SqrtSNextUp, Flags);
5173 auto SqrtVS = B.buildFMA(Dst: F32, Src0: NegSqrtSNextUp, Src1: SqrtS, Src2: SqrtX, Flags);
5174
5175 auto Zero = B.buildFConstant(Res: F32, Val: 0.0f);
5176 auto SqrtVPLE0 = B.buildFCmp(Pred: CmpInst::FCMP_OLE, Res: S1, Op0: SqrtVP, Op1: Zero, Flags);
5177
5178 SqrtS =
5179 B.buildSelect(Res: F32, Tst: SqrtVPLE0, Op0: SqrtSNextDown, Op1: SqrtS, Flags).getReg(Idx: 0);
5180
5181 auto SqrtVPVSGT0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: SqrtVS, Op1: Zero, Flags);
5182 SqrtS =
5183 B.buildSelect(Res: F32, Tst: SqrtVPVSGT0, Op0: SqrtSNextUp, Op1: SqrtS, Flags).getReg(Idx: 0);
5184 } else {
5185 auto SqrtR =
5186 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5187 B.buildFMul(Dst: SqrtS, Src0: SqrtX, Src1: SqrtR, Flags);
5188
5189 auto Half = B.buildFConstant(Res: F32, Val: 0.5f);
5190 auto SqrtH = B.buildFMul(Dst: F32, Src0: SqrtR, Src1: Half, Flags);
5191 auto NegSqrtH = B.buildFNeg(Dst: F32, Src0: SqrtH, Flags);
5192 auto SqrtE = B.buildFMA(Dst: F32, Src0: NegSqrtH, Src1: SqrtS, Src2: Half, Flags);
5193 SqrtH = B.buildFMA(Dst: F32, Src0: SqrtH, Src1: SqrtE, Src2: SqrtH, Flags);
5194 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtS, Src1: SqrtE, Src2: SqrtS, Flags).getReg(0);
5195 auto NegSqrtS = B.buildFNeg(Dst: F32, Src0: SqrtS, Flags);
5196 auto SqrtD = B.buildFMA(Dst: F32, Src0: NegSqrtS, Src1: SqrtS, Src2: SqrtX, Flags);
5197 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtD, Src1: SqrtH, Src2: SqrtS, Flags).getReg(0);
5198 }
5199
5200 auto ScaleDownFactor = B.buildFConstant(Res: F32, Val: 0x1.0p-16f);
5201
5202 auto ScaledDown = B.buildFMul(Dst: F32, Src0: SqrtS, Src1: ScaleDownFactor, Flags);
5203
5204 SqrtS = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledDown, Op1: SqrtS, Flags).getReg(Idx: 0);
5205
5206 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
5207 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtS, Flags);
5208
5209 MI.eraseFromParent();
5210 return true;
5211}
5212
5213bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5214 MachineRegisterInfo &MRI,
5215 MachineIRBuilder &B) const {
5216 // For double type, the SQRT and RSQ instructions don't have the required
5217 // precision, so we apply Goldschmidt's algorithm to improve the result:
5218 //
5219 // y0 = rsq(x)
5220 // g0 = x * y0
5221 // h0 = 0.5 * y0
5222 //
5223 // r0 = 0.5 - h0 * g0
5224 // g1 = g0 * r0 + g0
5225 // h1 = h0 * r0 + h0
5226 //
5227 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5228 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5229 // h2 = h1 * r1 + h1
5230 //
5231 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5232 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5233 //
5234 // sqrt(x) = g3
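  //
  // The expansion below additionally rescales very small inputs: values below
  // 2**-767 are multiplied by 2**256 before the iteration and the result is
  // multiplied by 2**-128 afterwards, since sqrt(x * 2**256) == sqrt(x) * 2**128.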
5235
5236 const LLT S1 = LLT::scalar(SizeInBits: 1);
5237 const LLT S32 = LLT::scalar(SizeInBits: 32);
5238 const LLT F64 = LLT::scalar(SizeInBits: 64);
5239
5240 Register Dst = MI.getOperand(i: 0).getReg();
5241 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5242
5243 Register X = MI.getOperand(i: 1).getReg();
5244 unsigned Flags = MI.getFlags();
5245
5246 auto ScaleConstant = B.buildFConstant(Res: F64, Val: 0x1.0p-767);
5247
5248 auto ZeroInt = B.buildConstant(Res: S32, Val: 0);
5249 auto Scaling = B.buildFCmp(Pred: FCmpInst::FCMP_OLT, Res: S1, Op0: X, Op1: ScaleConstant);
5250
5251 // Scale up input if it is too small.
5252 auto ScaleUpFactor = B.buildConstant(Res: S32, Val: 256);
5253 auto ScaleUp = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleUpFactor, Op1: ZeroInt);
5254 auto SqrtX = B.buildFLdexp(Dst: F64, Src0: X, Src1: ScaleUp, Flags);
5255
5256 auto SqrtY =
5257 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5258
5259 auto Half = B.buildFConstant(Res: F64, Val: 0.5);
5260 auto SqrtH0 = B.buildFMul(Dst: F64, Src0: SqrtY, Src1: Half);
5261 auto SqrtS0 = B.buildFMul(Dst: F64, Src0: SqrtX, Src1: SqrtY);
5262
5263 auto NegSqrtH0 = B.buildFNeg(Dst: F64, Src0: SqrtH0);
5264 auto SqrtR0 = B.buildFMA(Dst: F64, Src0: NegSqrtH0, Src1: SqrtS0, Src2: Half);
5265
5266 auto SqrtS1 = B.buildFMA(Dst: F64, Src0: SqrtS0, Src1: SqrtR0, Src2: SqrtS0);
5267 auto SqrtH1 = B.buildFMA(Dst: F64, Src0: SqrtH0, Src1: SqrtR0, Src2: SqrtH0);
5268
5269 auto NegSqrtS1 = B.buildFNeg(Dst: F64, Src0: SqrtS1);
5270 auto SqrtD0 = B.buildFMA(Dst: F64, Src0: NegSqrtS1, Src1: SqrtS1, Src2: SqrtX);
5271
5272 auto SqrtS2 = B.buildFMA(Dst: F64, Src0: SqrtD0, Src1: SqrtH1, Src2: SqrtS1);
5273
5274 auto NegSqrtS2 = B.buildFNeg(Dst: F64, Src0: SqrtS2);
5275 auto SqrtD1 = B.buildFMA(Dst: F64, Src0: NegSqrtS2, Src1: SqrtS2, Src2: SqrtX);
5276
5277 auto SqrtRet = B.buildFMA(Dst: F64, Src0: SqrtD1, Src1: SqrtH1, Src2: SqrtS2);
5278
5279 // Scale down the result.
5280 auto ScaleDownFactor = B.buildConstant(Res: S32, Val: -128);
5281 auto ScaleDown = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleDownFactor, Op1: ZeroInt);
5282 SqrtRet = B.buildFLdexp(Dst: F64, Src0: SqrtRet, Src1: ScaleDown, Flags);
5283
5284 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5285 // with finite only or nsz because rsq(+/-0) = +/-inf
5286
5287 // TODO: Check for DAZ and expand to subnormals
5288 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
5289
5290 // If x is +INF, +0, or -0, use its original value
5291 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtRet, Flags);
5292
5293 MI.eraseFromParent();
5294 return true;
5295}
5296
5297bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5298 MachineRegisterInfo &MRI,
5299 MachineIRBuilder &B) const {
5300 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5301 if (Ty == LLT::scalar(SizeInBits: 32))
5302 return legalizeFSQRTF32(MI, MRI, B);
5303 if (Ty == LLT::scalar(SizeInBits: 64))
5304 return legalizeFSQRTF64(MI, MRI, B);
5305 if (Ty == LLT::scalar(SizeInBits: 16))
5306 return legalizeFSQRTF16(MI, MRI, B);
5307 return false;
5308}
5309
5310// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5311// FIXME: Why do we handle this one but not other removed instructions?
5312//
5313// Reciprocal square root. The clamp prevents infinite results, clamping
5314// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5315// +-max_float.
5316bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5317 MachineRegisterInfo &MRI,
5318 MachineIRBuilder &B) const {
5319 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5320 return true;
5321
5322 Register Dst = MI.getOperand(i: 0).getReg();
5323 Register Src = MI.getOperand(i: 2).getReg();
5324 auto Flags = MI.getFlags();
5325
5326 LLT Ty = MRI.getType(Reg: Dst);
5327
5328 const fltSemantics *FltSemantics;
5329 if (Ty == LLT::scalar(SizeInBits: 32))
5330 FltSemantics = &APFloat::IEEEsingle();
5331 else if (Ty == LLT::scalar(SizeInBits: 64))
5332 FltSemantics = &APFloat::IEEEdouble();
5333 else
5334 return false;
5335
5336 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5337 .addUse(Src)
5338 .setMIFlags(Flags);
5339
5340 // We don't need to concern ourselves with the sNaN handling difference,
5341 // since rsq already quieted it (or not); use the form that directly selects.
5342 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5343 const bool UseIEEE = MFI->getMode().IEEE;
5344
5345 auto MaxFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics));
5346 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags) :
5347 B.buildFMinNum(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags);
5348
5349 auto MinFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics, Negative: true));
5350
5351 if (UseIEEE)
5352 B.buildFMaxNumIEEE(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5353 else
5354 B.buildFMaxNum(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5355 MI.eraseFromParent();
5356 return true;
5357}
5358
5359static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
5360 switch (IID) {
5361 case Intrinsic::amdgcn_ds_fadd:
5362 return AMDGPU::G_ATOMICRMW_FADD;
5363 case Intrinsic::amdgcn_ds_fmin:
5364 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
5365 case Intrinsic::amdgcn_ds_fmax:
5366 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
5367 default:
5368 llvm_unreachable("not a DS FP intrinsic");
5369 }
5370}
5371
5372bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
5373 MachineInstr &MI,
5374 Intrinsic::ID IID) const {
5375 GISelChangeObserver &Observer = Helper.Observer;
5376 Observer.changingInstr(MI);
5377
5378 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
5379
5380 // The remaining operands were used to set fields in the MemOperand on
5381 // construction.
5382 for (int I = 6; I > 3; --I)
5383 MI.removeOperand(OpNo: I);
5384
5385 MI.removeOperand(OpNo: 1); // Remove the intrinsic ID.
5386 Observer.changedInstr(MI);
5387 return true;
5388}
5389
5390bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5391 MachineRegisterInfo &MRI,
5392 MachineIRBuilder &B) const {
5393 uint64_t Offset =
5394 ST.getTargetLowering()->getImplicitParameterOffset(
5395 MF: B.getMF(), Param: AMDGPUTargetLowering::FIRST_IMPLICIT);
5396 LLT DstTy = MRI.getType(Reg: DstReg);
5397 LLT IdxTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
5398
5399 Register KernargPtrReg = MRI.createGenericVirtualRegister(Ty: DstTy);
5400 if (!loadInputValue(DstReg: KernargPtrReg, B,
5401 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5402 return false;
5403
5404 // FIXME: This should be nuw
5405 B.buildPtrAdd(Res: DstReg, Op0: KernargPtrReg, Op1: B.buildConstant(Res: IdxTy, Val: Offset).getReg(Idx: 0));
5406 return true;
5407}
5408
5409/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5410/// bits of the pointer and replace them with the stride argument, then
5411/// merge_values everything together. In the common case of a raw buffer (the
5412/// stride component is 0), we can just AND off the upper half.
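///
/// The resulting descriptor is, dword by dword (illustrative layout based on
/// the merge below):
///   { ptr[31:0], ptr[47:32] | (stride << 16), numRecords, flags }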
5413bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5414 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5415 Register Result = MI.getOperand(i: 0).getReg();
5416 Register Pointer = MI.getOperand(i: 2).getReg();
5417 Register Stride = MI.getOperand(i: 3).getReg();
5418 Register NumRecords = MI.getOperand(i: 4).getReg();
5419 Register Flags = MI.getOperand(i: 5).getReg();
5420
5421 LLT S32 = LLT::scalar(SizeInBits: 32);
5422
5423 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
5424 auto Unmerge = B.buildUnmerge(Res: S32, Op: Pointer);
5425 Register LowHalf = Unmerge.getReg(Idx: 0);
5426 Register HighHalf = Unmerge.getReg(Idx: 1);
5427
5428 auto AndMask = B.buildConstant(Res: S32, Val: 0x0000ffff);
5429 auto Masked = B.buildAnd(Dst: S32, Src0: HighHalf, Src1: AndMask);
5430
5431 MachineInstrBuilder NewHighHalf = Masked;
5432 std::optional<ValueAndVReg> StrideConst =
5433 getIConstantVRegValWithLookThrough(VReg: Stride, MRI);
5434 if (!StrideConst || !StrideConst->Value.isZero()) {
5435 MachineInstrBuilder ShiftedStride;
5436 if (StrideConst) {
5437 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5438 uint32_t ShiftedStrideVal = StrideVal << 16;
5439 ShiftedStride = B.buildConstant(Res: S32, Val: ShiftedStrideVal);
5440 } else {
5441 auto ExtStride = B.buildAnyExt(Res: S32, Op: Stride);
5442 auto ShiftConst = B.buildConstant(Res: S32, Val: 16);
5443 ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: ShiftConst);
5444 }
5445 NewHighHalf = B.buildOr(Dst: S32, Src0: Masked, Src1: ShiftedStride);
5446 }
5447 Register NewHighHalfReg = NewHighHalf.getReg(Idx: 0);
5448 B.buildMergeValues(Res: Result, Ops: {LowHalf, NewHighHalfReg, NumRecords, Flags});
5449 MI.eraseFromParent();
5450 return true;
5451}
5452
5453bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5454 MachineRegisterInfo &MRI,
5455 MachineIRBuilder &B) const {
5456 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5457 if (!MFI->isEntryFunction()) {
5458 return legalizePreloadedArgIntrin(MI, MRI, B,
5459 ArgType: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5460 }
5461
5462 Register DstReg = MI.getOperand(i: 0).getReg();
5463 if (!getImplicitArgPtr(DstReg, MRI, B))
5464 return false;
5465
5466 MI.eraseFromParent();
5467 return true;
5468}
5469
5470bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5471 MachineRegisterInfo &MRI,
5472 MachineIRBuilder &B) const {
5473 Function &F = B.getMF().getFunction();
5474 std::optional<uint32_t> KnownSize =
5475 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5476 if (KnownSize.has_value())
5477 B.buildConstant(Res: DstReg, Val: *KnownSize);
5478 return false;
5479}
5480
5481bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5482 MachineRegisterInfo &MRI,
5483 MachineIRBuilder &B) const {
5484
5485 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5486 if (!MFI->isEntryFunction()) {
5487 return legalizePreloadedArgIntrin(MI, MRI, B,
5488 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5489 }
5490
5491 Register DstReg = MI.getOperand(i: 0).getReg();
5492 if (!getLDSKernelId(DstReg, MRI, B))
5493 return false;
5494
5495 MI.eraseFromParent();
5496 return true;
5497}
5498
5499bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5500 MachineRegisterInfo &MRI,
5501 MachineIRBuilder &B,
5502 unsigned AddrSpace) const {
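  // A generic (flat) pointer addresses the given segment iff the high 32 bits
  // of the pointer value equal that segment's aperture base.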
5503 Register ApertureReg = getSegmentAperture(AS: AddrSpace, MRI, B);
5504 auto Unmerge = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: MI.getOperand(i: 2).getReg());
5505 Register Hi32 = Unmerge.getReg(Idx: 1);
5506
5507 B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: MI.getOperand(i: 0), Op0: Hi32, Op1: ApertureReg);
5508 MI.eraseFromParent();
5509 return true;
5510}
5511
5512// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5513// offset (the offset that is included in bounds checking and swizzling, to be
5514// split between the instruction's voffset and immoffset fields) and soffset
5515// (the offset that is excluded from bounds checking and swizzling, to go in
5516// the instruction's soffset field). This function takes the first kind of
5517// offset and figures out how to split it between voffset and immoffset.
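//
// Worked example (assuming a 4095-byte maximum immediate offset, the usual
// MUBUF limit; the actual value comes from getMaxMUBUFImmOffset):
//   an incoming offset of %base + 8195 is split into a voffset of
//   %base + 8192 and an immoffset of 3.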
5518std::pair<Register, unsigned>
5519AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5520 Register OrigOffset) const {
5521 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5522 Register BaseReg;
5523 unsigned ImmOffset;
5524 const LLT S32 = LLT::scalar(SizeInBits: 32);
5525 MachineRegisterInfo &MRI = *B.getMRI();
5526
5527 std::tie(args&: BaseReg, args&: ImmOffset) =
5528 AMDGPU::getBaseWithConstantOffset(MRI, Reg: OrigOffset);
5529
5530 // If BaseReg is a pointer, convert it to int.
5531 if (MRI.getType(Reg: BaseReg).isPointer())
5532 BaseReg = B.buildPtrToInt(Dst: MRI.getType(Reg: OrigOffset), Src: BaseReg).getReg(Idx: 0);
5533
5534 // If the immediate value is too big for the immoffset field, put only bits
5535 // that would normally fit in the immoffset field. The remaining value that
5536 // is copied/added for the voffset field is a large power of 2, and it
5537 // stands more chance of being CSEd with the copy/add for another similar
5538 // load/store.
5539 // However, do not do that rounding down if that is a negative
5540 // number, as it appears to be illegal to have a negative offset in the
5541 // vgpr, even if adding the immediate offset makes it positive.
5542 unsigned Overflow = ImmOffset & ~MaxImm;
5543 ImmOffset -= Overflow;
5544 if ((int32_t)Overflow < 0) {
5545 Overflow += ImmOffset;
5546 ImmOffset = 0;
5547 }
5548
5549 if (Overflow != 0) {
5550 if (!BaseReg) {
5551 BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
5552 } else {
5553 auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
5554 BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
5555 }
5556 }
5557
5558 if (!BaseReg)
5559 BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5560
5561 return std::pair(BaseReg, ImmOffset);
5562}
5563
5564/// Handle register layout difference for f16 images for some subtargets.
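/// For example, on subtargets with unpacked D16 VMEM, a <4 x s16> source is
/// any-extended element by element so the store operates on a <4 x s32>
/// value instead.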
5565Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5566 MachineRegisterInfo &MRI,
5567 Register Reg,
5568 bool ImageStore) const {
5569 const LLT S16 = LLT::scalar(SizeInBits: 16);
5570 const LLT S32 = LLT::scalar(SizeInBits: 32);
5571 LLT StoreVT = MRI.getType(Reg);
5572 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5573
5574 if (ST.hasUnpackedD16VMem()) {
5575 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
5576
5577 SmallVector<Register, 4> WideRegs;
5578 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5579 WideRegs.push_back(Elt: B.buildAnyExt(Res: S32, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
5580
5581 int NumElts = StoreVT.getNumElements();
5582
5583 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
5584 .getReg(Idx: 0);
5585 }
5586
5587 if (ImageStore && ST.hasImageStoreD16Bug()) {
5588 if (StoreVT.getNumElements() == 2) {
5589 SmallVector<Register, 4> PackedRegs;
5590 Reg = B.buildBitcast(Dst: S32, Src: Reg).getReg(Idx: 0);
5591 PackedRegs.push_back(Elt: Reg);
5592 PackedRegs.resize(N: 2, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
5593 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Ops: PackedRegs)
5594 .getReg(Idx: 0);
5595 }
5596
5597 if (StoreVT.getNumElements() == 3) {
5598 SmallVector<Register, 4> PackedRegs;
5599 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
5600 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5601 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
5602 PackedRegs.resize(N: 6, NV: B.buildUndef(Res: S16).getReg(Idx: 0));
5603 Reg = B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 6, ScalarTy: S16), Ops: PackedRegs).getReg(Idx: 0);
5604 return B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 3, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
5605 }
5606
5607 if (StoreVT.getNumElements() == 4) {
5608 SmallVector<Register, 4> PackedRegs;
5609 Reg = B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
5610 auto Unmerge = B.buildUnmerge(Res: S32, Op: Reg);
5611 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5612 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
5613 PackedRegs.resize(N: 4, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
5614 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S32), Ops: PackedRegs)
5615 .getReg(Idx: 0);
5616 }
5617
5618 llvm_unreachable("invalid data type");
5619 }
5620
5621 if (StoreVT == LLT::fixed_vector(NumElements: 3, ScalarTy: S16)) {
5622 Reg = B.buildPadVectorWithUndefElements(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S16), Op0: Reg)
5623 .getReg(Idx: 0);
5624 }
5625 return Reg;
5626}
5627
5628Register AMDGPULegalizerInfo::fixStoreSourceType(
5629 MachineIRBuilder &B, Register VData, bool IsFormat) const {
5630 MachineRegisterInfo *MRI = B.getMRI();
5631 LLT Ty = MRI->getType(Reg: VData);
5632
5633 const LLT S16 = LLT::scalar(SizeInBits: 16);
5634
5635 // Fixup buffer resources themselves, which need to be lowered to v4i32.
5636 if (hasBufferRsrcWorkaround(Ty))
5637 return castBufferRsrcToV4I32(Pointer: VData, B);
5638
5639 // Fixup illegal register types for 8-bit and 16-bit stores.
5640 if (Ty == LLT::scalar(SizeInBits: 8) || Ty == S16) {
5641 Register AnyExt = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: VData).getReg(Idx: 0);
5642 return AnyExt;
5643 }
5644
5645 if (Ty.isVector()) {
5646 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5647 if (IsFormat)
5648 return handleD16VData(B, MRI&: *MRI, Reg: VData);
5649 }
5650 }
5651
5652 return VData;
5653}
5654
5655bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5656 MachineRegisterInfo &MRI,
5657 MachineIRBuilder &B,
5658 bool IsTyped,
5659 bool IsFormat) const {
5660 Register VData = MI.getOperand(i: 1).getReg();
5661 LLT Ty = MRI.getType(Reg: VData);
5662 LLT EltTy = Ty.getScalarType();
5663 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5664 const LLT S32 = LLT::scalar(SizeInBits: 32);
5665
5666 VData = fixStoreSourceType(B, VData, IsFormat);
5667 castBufferRsrcArgToV4I32(MI, B, Idx: 2);
5668 Register RSrc = MI.getOperand(i: 2).getReg();
5669
5670 MachineMemOperand *MMO = *MI.memoperands_begin();
5671 const int MemSize = MMO->getSize().getValue();
5672
5673 unsigned ImmOffset;
5674
5675 // The typed intrinsics add an immediate after the registers.
5676 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5677
5678 // The struct intrinsic variants add one additional operand over raw.
5679 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5680 Register VIndex;
5681 int OpOffset = 0;
5682 if (HasVIndex) {
5683 VIndex = MI.getOperand(i: 3).getReg();
5684 OpOffset = 1;
5685 } else {
5686 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5687 }
5688
5689 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
5690 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
5691
5692 unsigned Format = 0;
5693 if (IsTyped) {
5694 Format = MI.getOperand(i: 5 + OpOffset).getImm();
5695 ++OpOffset;
5696 }
5697
5698 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
5699
5700 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
5701
5702 unsigned Opc;
5703 if (IsTyped) {
5704 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5705 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5706 } else if (IsFormat) {
5707 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5708 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5709 } else {
5710 switch (MemSize) {
5711 case 1:
5712 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5713 break;
5714 case 2:
5715 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5716 break;
5717 default:
5718 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5719 break;
5720 }
5721 }
5722
5723 auto MIB = B.buildInstr(Opcode: Opc)
5724 .addUse(RegNo: VData) // vdata
5725 .addUse(RegNo: RSrc) // rsrc
5726 .addUse(RegNo: VIndex) // vindex
5727 .addUse(RegNo: VOffset) // voffset
5728 .addUse(RegNo: SOffset) // soffset
5729 .addImm(Val: ImmOffset); // offset(imm)
5730
5731 if (IsTyped)
5732 MIB.addImm(Val: Format);
5733
5734 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5735 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
5736 .addMemOperand(MMO);
5737
5738 MI.eraseFromParent();
5739 return true;
5740}
5741
5742static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5743 Register VIndex, Register VOffset, Register SOffset,
5744 unsigned ImmOffset, unsigned Format,
5745 unsigned AuxiliaryData, MachineMemOperand *MMO,
5746 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5747 auto MIB = B.buildInstr(Opcode: Opc)
5748 .addDef(RegNo: LoadDstReg) // vdata
5749 .addUse(RegNo: RSrc) // rsrc
5750 .addUse(RegNo: VIndex) // vindex
5751 .addUse(RegNo: VOffset) // voffset
5752 .addUse(RegNo: SOffset) // soffset
5753 .addImm(Val: ImmOffset); // offset(imm)
5754
5755 if (IsTyped)
5756 MIB.addImm(Val: Format);
5757
5758 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5759 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
5760 .addMemOperand(MMO);
5761}
5762
5763bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5764 MachineRegisterInfo &MRI,
5765 MachineIRBuilder &B,
5766 bool IsFormat,
5767 bool IsTyped) const {
5768 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5769 MachineMemOperand *MMO = *MI.memoperands_begin();
5770 const LLT MemTy = MMO->getMemoryType();
5771 const LLT S32 = LLT::scalar(SizeInBits: 32);
5772
5773 Register Dst = MI.getOperand(i: 0).getReg();
5774
5775 Register StatusDst;
5776 int OpOffset = 0;
5777 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5778 bool IsTFE = MI.getNumExplicitDefs() == 2;
5779 if (IsTFE) {
5780 StatusDst = MI.getOperand(i: 1).getReg();
5781 ++OpOffset;
5782 }
5783
5784 castBufferRsrcArgToV4I32(MI, B, Idx: 2 + OpOffset);
5785 Register RSrc = MI.getOperand(i: 2 + OpOffset).getReg();
5786
5787 // The typed intrinsics add an immediate after the registers.
5788 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5789
5790 // The struct intrinsic variants add one additional operand over raw.
5791 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5792 Register VIndex;
5793 if (HasVIndex) {
5794 VIndex = MI.getOperand(i: 3 + OpOffset).getReg();
5795 ++OpOffset;
5796 } else {
5797 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5798 }
5799
5800 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
5801 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
5802
5803 unsigned Format = 0;
5804 if (IsTyped) {
5805 Format = MI.getOperand(i: 5 + OpOffset).getImm();
5806 ++OpOffset;
5807 }
5808
5809 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
5810 unsigned ImmOffset;
5811
5812 LLT Ty = MRI.getType(Reg: Dst);
5813  // Turn loads of addrspace 8 pointers into 4 x s32 loads here, so the rest of
5814  // the logic doesn't have to handle that case.
5815 if (hasBufferRsrcWorkaround(Ty)) {
5816 Ty = castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
5817 Dst = MI.getOperand(i: 0).getReg();
5818 }
5819 LLT EltTy = Ty.getScalarType();
5820 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5821 const bool Unpacked = ST.hasUnpackedD16VMem();
5822
5823 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
5824
5825 unsigned Opc;
5826
5827 // TODO: Support TFE for typed and narrow loads.
5828 if (IsTyped) {
5829 if (IsTFE)
5830 return false;
5831 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5832 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5833 } else if (IsFormat) {
5834 if (IsD16) {
5835 if (IsTFE)
5836 return false;
5837 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5838 } else {
5839 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5840 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5841 }
5842 } else {
5843 if (IsTFE)
5844 return false;
5845 switch (MemTy.getSizeInBits()) {
5846 case 8:
5847 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5848 break;
5849 case 16:
5850 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
5851 break;
5852 default:
5853 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
5854 break;
5855 }
5856 }
5857
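  // With TFE the hardware writes an extra status dword after the loaded data,
  // so load into a vector with one additional s32 element and then split the
  // status value off from the data.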
5858 if (IsTFE) {
5859 unsigned NumValueDWords = divideCeil(Numerator: Ty.getSizeInBits(), Denominator: 32);
5860 unsigned NumLoadDWords = NumValueDWords + 1;
5861 LLT LoadTy = LLT::fixed_vector(NumElements: NumLoadDWords, ScalarTy: S32);
5862 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: LoadTy);
5863 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5864 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5865 if (NumValueDWords == 1) {
5866 B.buildUnmerge(Res: {Dst, StatusDst}, Op: LoadDstReg);
5867 } else {
5868 SmallVector<Register, 5> LoadElts;
5869 for (unsigned I = 0; I != NumValueDWords; ++I)
5870 LoadElts.push_back(Elt: B.getMRI()->createGenericVirtualRegister(Ty: S32));
5871 LoadElts.push_back(Elt: StatusDst);
5872 B.buildUnmerge(Res: LoadElts, Op: LoadDstReg);
5873 LoadElts.truncate(N: NumValueDWords);
5874 B.buildMergeLikeInstr(Res: Dst, Ops: LoadElts);
5875 }
5876 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
5877 (IsD16 && !Ty.isVector())) {
5878 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: S32);
5879 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5880 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5881 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
5882 B.buildTrunc(Res: Dst, Op: LoadDstReg);
5883 } else if (Unpacked && IsD16 && Ty.isVector()) {
5884 LLT UnpackedTy = Ty.changeElementSize(NewEltSize: 32);
5885 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: UnpackedTy);
5886 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5887 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5888 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
5889 // FIXME: G_TRUNC should work, but legalization currently fails
5890 auto Unmerge = B.buildUnmerge(Res: S32, Op: LoadDstReg);
5891 SmallVector<Register, 4> Repack;
5892 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
5893 Repack.push_back(Elt: B.buildTrunc(Res: EltTy, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
5894 B.buildMergeLikeInstr(Res: Dst, Ops: Repack);
5895 } else {
5896 buildBufferLoad(Opc, LoadDstReg: Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
5897 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5898 }
5899
5900 MI.eraseFromParent();
5901 return true;
5902}
5903
5904static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
5905 switch (IntrID) {
5906 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5907 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
5908 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5909 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
5910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
5911 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
5913 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5914 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
5915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
5916 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
5918 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
5920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
5921 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
5923 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
5925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
5926 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
5928 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5929 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
5930 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
5931 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5932 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
5933 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
5935 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
5936 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5937 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
5938 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5939 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
5940 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
5941 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5942 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
5943 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
5945 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
5946 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5947 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
5948 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5949 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
5950 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
5951 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5952 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
5953 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5954 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
5955 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
5956 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5957 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
5958 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
5960 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
5961 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5962 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
5963 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5964 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
5965 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
5966 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5967 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
5968 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5969 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
5970 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
5971 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5972 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
5973 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5974 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
5975 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
5976 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
5977 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
5978 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
5979 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5980 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
5981 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5982 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
5983 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
5984 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5985 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
5986 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5987 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5988 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
5989 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
5990 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
5991 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
5992 default:
5993 llvm_unreachable("unhandled atomic opcode");
5994 }
5995}
5996
5997bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
5998 MachineIRBuilder &B,
5999 Intrinsic::ID IID) const {
6000 const bool IsCmpSwap =
6001 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6002 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6003 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6004 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6005
6006 Register Dst = MI.getOperand(i: 0).getReg();
6007  // Since we don't have 128-bit atomics, we don't need to handle the case of
6008  // p8 arguments to the atomic itself.
6009 Register VData = MI.getOperand(i: 2).getReg();
6010
6011 Register CmpVal;
6012 int OpOffset = 0;
6013
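  // The cmpswap variants carry an extra compare operand, which shifts the
  // indices of all following operands by one (tracked with OpOffset).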
6014 if (IsCmpSwap) {
6015 CmpVal = MI.getOperand(i: 3).getReg();
6016 ++OpOffset;
6017 }
6018
6019 castBufferRsrcArgToV4I32(MI, B, Idx: 3 + OpOffset);
6020 Register RSrc = MI.getOperand(i: 3 + OpOffset).getReg();
6021 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6022
6023 // The struct intrinsic variants add one additional operand over raw.
6024 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6025 Register VIndex;
6026 if (HasVIndex) {
6027 VIndex = MI.getOperand(i: 4 + OpOffset).getReg();
6028 ++OpOffset;
6029 } else {
6030 VIndex = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 0).getReg(Idx: 0);
6031 }
6032
6033 Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
6034 Register SOffset = MI.getOperand(i: 5 + OpOffset).getReg();
6035 unsigned AuxiliaryData = MI.getOperand(i: 6 + OpOffset).getImm();
6036
6037 MachineMemOperand *MMO = *MI.memoperands_begin();
6038
6039 unsigned ImmOffset;
6040 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
6041
6042 auto MIB = B.buildInstr(Opcode: getBufferAtomicPseudo(IntrID: IID))
6043 .addDef(RegNo: Dst)
6044 .addUse(RegNo: VData); // vdata
6045
6046 if (IsCmpSwap)
6047 MIB.addReg(RegNo: CmpVal);
6048
6049 MIB.addUse(RegNo: RSrc) // rsrc
6050 .addUse(RegNo: VIndex) // vindex
6051 .addUse(RegNo: VOffset) // voffset
6052 .addUse(RegNo: SOffset) // soffset
6053 .addImm(Val: ImmOffset) // offset(imm)
6054 .addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6055 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6056 .addMemOperand(MMO);
6057
6058 MI.eraseFromParent();
6059 return true;
6060}
6061
6062/// Pack the s16 typed address operands of \p MI into dword sized registers
6063/// with two s16 elements each, appending them to \p PackedAddrs.
6064static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6065 SmallVectorImpl<Register> &PackedAddrs,
6066 unsigned ArgOffset,
6067 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6068 bool IsA16, bool IsG16) {
6069 const LLT S16 = LLT::scalar(SizeInBits: 16);
6070 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6071 auto EndIdx = Intr->VAddrEnd;
6072
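  // Operands before the gradients (bias, offsets, compare) are normally full
  // dwords and are just bitcast to v2s16; gradients are pair-packed only under
  // G16 and coordinates only under A16.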
6073 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6074 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6075 if (!SrcOp.isReg())
6076 continue; // _L to _LZ may have eliminated this.
6077
6078 Register AddrReg = SrcOp.getReg();
6079
6080 if ((I < Intr->GradientStart) ||
6081 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6082 (I >= Intr->CoordStart && !IsA16)) {
6083 if ((I < Intr->GradientStart) && IsA16 &&
6084 (B.getMRI()->getType(Reg: AddrReg) == S16)) {
6085 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6086        // Special handling of bias when A16 is on. Bias is of type half but
6087        // occupies a full 32-bit slot.
6088 PackedAddrs.push_back(
6089 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
6090 .getReg(Idx: 0));
6091 } else {
6092 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6093 "Bias needs to be converted to 16 bit in A16 mode");
6094 // Handle any gradient or coordinate operands that should not be packed
6095 AddrReg = B.buildBitcast(Dst: V2S16, Src: AddrReg).getReg(Idx: 0);
6096 PackedAddrs.push_back(Elt: AddrReg);
6097 }
6098 } else {
6099 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6100 // derivatives dx/dh and dx/dv are packed with undef.
6101 if (((I + 1) >= EndIdx) ||
6102 ((Intr->NumGradients / 2) % 2 == 1 &&
6103 (I == static_cast<unsigned>(Intr->GradientStart +
6104 (Intr->NumGradients / 2) - 1) ||
6105 I == static_cast<unsigned>(Intr->GradientStart +
6106 Intr->NumGradients - 1))) ||
6107 // Check for _L to _LZ optimization
6108 !MI.getOperand(i: ArgOffset + I + 1).isReg()) {
6109 PackedAddrs.push_back(
6110 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
6111 .getReg(Idx: 0));
6112 } else {
6113 PackedAddrs.push_back(
6114 Elt: B.buildBuildVector(
6115 Res: V2S16, Ops: {AddrReg, MI.getOperand(i: ArgOffset + I + 1).getReg()})
6116 .getReg(Idx: 0));
6117 ++I;
6118 }
6119 }
6120 }
6121}
6122
6123/// Convert from separate vaddr components to a single vector address register,
6124/// and replace the remaining operands with $noreg.
6125static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6126 int DimIdx, int NumVAddrs) {
6127 const LLT S32 = LLT::scalar(SizeInBits: 32);
6128 (void)S32;
6129 SmallVector<Register, 8> AddrRegs;
6130 for (int I = 0; I != NumVAddrs; ++I) {
6131 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6132 if (SrcOp.isReg()) {
6133 AddrRegs.push_back(Elt: SrcOp.getReg());
6134 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6135 }
6136 }
6137
6138 int NumAddrRegs = AddrRegs.size();
6139 if (NumAddrRegs != 1) {
6140 auto VAddr =
6141 B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumAddrRegs, ScalarSizeInBits: 32), Ops: AddrRegs);
6142 MI.getOperand(i: DimIdx).setReg(VAddr.getReg(Idx: 0));
6143 }
6144
6145 for (int I = 1; I != NumVAddrs; ++I) {
6146 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6147 if (SrcOp.isReg())
6148 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6149 }
6150}
6151
6152/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6153///
6154/// Depending on the subtarget, a load/store with 16-bit element data needs to
6155/// be rewritten to use the low half of 32-bit registers, or to directly use a
6156/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
6157/// registers.
6158///
6159/// We don't want to directly select image instructions just yet, but we also
6160/// want to expose all register repacking to the legalizer/combiners. We also
6161/// don't want a selected instruction entering RegBankSelect. In order to avoid
6162/// defining a multitude of intermediate image instructions, directly hack on
6163/// the intrinsic's arguments. In cases like a16 addresses, this requires
6164/// padding now unnecessary arguments with $noreg.
6165bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6166 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6167 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6168
6169 const MachineFunction &MF = *MI.getMF();
6170 const unsigned NumDefs = MI.getNumExplicitDefs();
6171 const unsigned ArgOffset = NumDefs + 1;
6172 bool IsTFE = NumDefs == 2;
6173 // We are only processing the operands of d16 image operations on subtargets
6174 // that use the unpacked register layout, or need to repack the TFE result.
6175
6176 // TODO: Do we need to guard against already legalized intrinsics?
6177 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6178 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
6179
6180 MachineRegisterInfo *MRI = B.getMRI();
6181 const LLT S32 = LLT::scalar(SizeInBits: 32);
6182 const LLT S16 = LLT::scalar(SizeInBits: 16);
6183 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6184
6185 unsigned DMask = 0;
6186 Register VData = MI.getOperand(i: NumDefs == 0 ? 1 : 0).getReg();
6187 LLT Ty = MRI->getType(Reg: VData);
6188
6189 const bool IsAtomicPacked16Bit =
6190 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6191 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6192
6193 // Check for 16 bit addresses and pack if true.
6194 LLT GradTy =
6195 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->GradientStart).getReg());
6196 LLT AddrTy =
6197 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->CoordStart).getReg());
6198 const bool IsG16 =
6199 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6200 const bool IsA16 = AddrTy == S16;
6201 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6202
6203 int DMaskLanes = 0;
6204 if (!BaseOpcode->Atomic) {
6205 DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
6206 if (BaseOpcode->Gather4) {
6207 DMaskLanes = 4;
6208 } else if (DMask != 0) {
6209 DMaskLanes = llvm::popcount(Value: DMask);
6210 } else if (!IsTFE && !BaseOpcode->Store) {
6211 // If dmask is 0, this is a no-op load. This can be eliminated.
6212 B.buildUndef(Res: MI.getOperand(i: 0));
6213 MI.eraseFromParent();
6214 return true;
6215 }
6216 }
6217
6218 Observer.changingInstr(MI);
6219 auto ChangedInstr = make_scope_exit(F: [&] { Observer.changedInstr(MI); });
6220
6221 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6222 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6223 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6224 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6225 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
6226
6227 // Track that we legalized this
6228 MI.setDesc(B.getTII().get(Opcode: NewOpcode));
6229
6230  // Expecting to get an error flag since TFE is on and dmask is 0. Force
6231  // dmask to be at least 1, otherwise the instruction will fail.
6232 if (IsTFE && DMask == 0) {
6233 DMask = 0x1;
6234 DMaskLanes = 1;
6235 MI.getOperand(i: ArgOffset + Intr->DMaskIndex).setImm(DMask);
6236 }
6237
6238 if (BaseOpcode->Atomic) {
6239 Register VData0 = MI.getOperand(i: 2).getReg();
6240 LLT Ty = MRI->getType(Reg: VData0);
6241
6242 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6243 if (Ty.isVector() && !IsAtomicPacked16Bit)
6244 return false;
6245
6246 if (BaseOpcode->AtomicX2) {
6247 Register VData1 = MI.getOperand(i: 3).getReg();
6248 // The two values are packed in one register.
6249 LLT PackedTy = LLT::fixed_vector(NumElements: 2, ScalarTy: Ty);
6250 auto Concat = B.buildBuildVector(Res: PackedTy, Ops: {VData0, VData1});
6251 MI.getOperand(i: 2).setReg(Concat.getReg(Idx: 0));
6252 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6253 }
6254 }
6255
6256 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6257
6258 // Rewrite the addressing register layout before doing anything else.
6259 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6260    // 16-bit gradients are supported, but are tied to the A16 control, so
6261    // both gradients and addresses must be 16 bit.
6262 return false;
6263 }
6264
6265 if (IsA16 && !ST.hasA16()) {
6266 // A16 not supported
6267 return false;
6268 }
6269
6270 const unsigned NSAMaxSize = ST.getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
6271 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6272
6273 if (IsA16 || IsG16) {
6274 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6275 // instructions expect VGPR_32
6276 SmallVector<Register, 4> PackedRegs;
6277
6278 packImage16bitOpsToDwords(B, MI, PackedAddrs&: PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6279
6280 // See also below in the non-a16 branch
6281 const bool UseNSA = ST.hasNSAEncoding() &&
6282 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6283 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6284 const bool UsePartialNSA =
6285 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6286
6287 if (UsePartialNSA) {
6288 // Pack registers that would go over NSAMaxSize into last VAddr register
6289 LLT PackedAddrTy =
6290 LLT::fixed_vector(NumElements: 2 * (PackedRegs.size() - NSAMaxSize + 1), ScalarSizeInBits: 16);
6291 auto Concat = B.buildConcatVectors(
6292 Res: PackedAddrTy, Ops: ArrayRef(PackedRegs).slice(N: NSAMaxSize - 1));
6293 PackedRegs[NSAMaxSize - 1] = Concat.getReg(Idx: 0);
6294 PackedRegs.resize(N: NSAMaxSize);
6295 } else if (!UseNSA && PackedRegs.size() > 1) {
6296 LLT PackedAddrTy = LLT::fixed_vector(NumElements: 2 * PackedRegs.size(), ScalarSizeInBits: 16);
6297 auto Concat = B.buildConcatVectors(Res: PackedAddrTy, Ops: PackedRegs);
6298 PackedRegs[0] = Concat.getReg(Idx: 0);
6299 PackedRegs.resize(N: 1);
6300 }
6301
6302 const unsigned NumPacked = PackedRegs.size();
6303 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6304 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6305 if (!SrcOp.isReg()) {
6306 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6307 continue;
6308 }
6309
6310 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6311
6312 if (I - Intr->VAddrStart < NumPacked)
6313 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6314 else
6315 SrcOp.setReg(AMDGPU::NoRegister);
6316 }
6317 } else {
6318 // If the register allocator cannot place the address registers contiguously
6319 // without introducing moves, then using the non-sequential address encoding
6320 // is always preferable, since it saves VALU instructions and is usually a
6321 // wash in terms of code size or even better.
6322 //
6323 // However, we currently have no way of hinting to the register allocator
6324 // that MIMG addresses should be placed contiguously when it is possible to
6325 // do so, so force non-NSA for the common 2-address case as a heuristic.
6326 //
6327 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6328 // allocation when possible.
6329 //
6330 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6331 // set of the remaining addresses.
6332 const bool UseNSA = ST.hasNSAEncoding() &&
6333 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6334 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6335 const bool UsePartialNSA =
6336 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6337
6338 if (UsePartialNSA) {
6339 convertImageAddrToPacked(B, MI,
6340 DimIdx: ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6341 NumVAddrs: Intr->NumVAddrs - NSAMaxSize + 1);
6342 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6343 convertImageAddrToPacked(B, MI, DimIdx: ArgOffset + Intr->VAddrStart,
6344 NumVAddrs: Intr->NumVAddrs);
6345 }
6346 }
6347
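  // Record the A16/G16 state in an extra immediate operand (bit 0 = A16,
  // bit 1 = G16) so the selector can pick the right encoding.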
6348 int Flags = 0;
6349 if (IsA16)
6350 Flags |= 1;
6351 if (IsG16)
6352 Flags |= 2;
6353 MI.addOperand(Op: MachineOperand::CreateImm(Val: Flags));
6354
6355 if (BaseOpcode->Store) { // No TFE for stores?
6356 // TODO: Handle dmask trim
6357 if (!Ty.isVector() || !IsD16)
6358 return true;
6359
6360 Register RepackedReg = handleD16VData(B, MRI&: *MRI, Reg: VData, ImageStore: true);
6361 if (RepackedReg != VData) {
6362 MI.getOperand(i: 1).setReg(RepackedReg);
6363 }
6364
6365 return true;
6366 }
6367
6368 Register DstReg = MI.getOperand(i: 0).getReg();
6369 const LLT EltTy = Ty.getScalarType();
6370 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6371
6372 // Confirm that the return type is large enough for the dmask specified
6373 if (NumElts < DMaskLanes)
6374 return false;
6375
6376 if (NumElts > 4 || DMaskLanes > 4)
6377 return false;
6378
6379  // Image atomic instructions use DMask to specify how many bits the
6380  // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6381  // DMaskLanes has the default value '0' for image atomics.
6382  // We must make sure that atomic variants (especially packed ones) are not
6383  // truncated from v2s16 or v4s16 to s16.
6384  //
6385  // ChangeElementCount will be needed for image loads, where Ty is always scalar.
6386 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6387 const LLT AdjustedTy =
6388 DMaskLanes == 0
6389 ? Ty
6390 : Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: AdjustedNumElts));
6391
6392 // The raw dword aligned data component of the load. The only legal cases
6393 // where this matters should be when using the packed D16 format, for
6394  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
6395 LLT RoundedTy;
6396
6397 // S32 vector to cover all data, plus TFE result element.
6398 LLT TFETy;
6399
6400 // Register type to use for each loaded component. Will be S32 or V2S16.
6401 LLT RegTy;
6402
6403 if (IsD16 && ST.hasUnpackedD16VMem()) {
6404 RoundedTy =
6405 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: AdjustedNumElts), ScalarSize: 32);
6406 TFETy = LLT::fixed_vector(NumElements: AdjustedNumElts + 1, ScalarSizeInBits: 32);
6407 RegTy = S32;
6408 } else {
6409 unsigned EltSize = EltTy.getSizeInBits();
6410 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6411 unsigned RoundedSize = 32 * RoundedElts;
6412 RoundedTy = LLT::scalarOrVector(
6413 EC: ElementCount::getFixed(MinVal: RoundedSize / EltSize), ScalarSize: EltSize);
6414 TFETy = LLT::fixed_vector(NumElements: RoundedSize / 32 + 1, ScalarTy: S32);
6415 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6416 }
6417
6418 // The return type does not need adjustment.
6419 // TODO: Should we change s16 case to s32 or <2 x s16>?
6420 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6421 return true;
6422
6423 Register Dst1Reg;
6424
6425 // Insert after the instruction.
6426 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
6427
6428 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6429 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6430 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6431 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6432
6433 Register NewResultReg = MRI->createGenericVirtualRegister(Ty: LoadResultTy);
6434
6435 MI.getOperand(i: 0).setReg(NewResultReg);
6436
6437 // In the IR, TFE is supposed to be used with a 2 element struct return
6438 // type. The instruction really returns these two values in one contiguous
6439 // register, with one additional dword beyond the loaded data. Rewrite the
6440 // return type to use a single register result.
6441
6442 if (IsTFE) {
6443 Dst1Reg = MI.getOperand(i: 1).getReg();
6444 if (MRI->getType(Reg: Dst1Reg) != S32)
6445 return false;
6446
6447 // TODO: Make sure the TFE operand bit is set.
6448 MI.removeOperand(OpNo: 1);
6449
6450 // Handle the easy case that requires no repack instructions.
6451 if (Ty == S32) {
6452 B.buildUnmerge(Res: {DstReg, Dst1Reg}, Op: NewResultReg);
6453 return true;
6454 }
6455 }
6456
6457 // Now figure out how to copy the new result register back into the old
6458 // result.
6459 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6460
6461 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6462
6463 if (ResultNumRegs == 1) {
6464 assert(!IsTFE);
6465 ResultRegs[0] = NewResultReg;
6466 } else {
6467 // We have to repack into a new vector of some kind.
6468 for (int I = 0; I != NumDataRegs; ++I)
6469 ResultRegs[I] = MRI->createGenericVirtualRegister(Ty: RegTy);
6470 B.buildUnmerge(Res: ResultRegs, Op: NewResultReg);
6471
6472 // Drop the final TFE element to get the data part. The TFE result is
6473 // directly written to the right place already.
6474 if (IsTFE)
6475 ResultRegs.resize(N: NumDataRegs);
6476 }
6477
6478 // For an s16 scalar result, we form an s32 result with a truncate regardless
6479 // of packed vs. unpacked.
6480 if (IsD16 && !Ty.isVector()) {
6481 B.buildTrunc(Res: DstReg, Op: ResultRegs[0]);
6482 return true;
6483 }
6484
6485 // Avoid a build/concat_vector of 1 entry.
6486 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6487 B.buildBitcast(Dst: DstReg, Src: ResultRegs[0]);
6488 return true;
6489 }
6490
6491 assert(Ty.isVector());
6492
6493 if (IsD16) {
6494 // For packed D16 results with TFE enabled, all the data components are
6495 // S32. Cast back to the expected type.
6496 //
6497    // TODO: We don't really need to load s32 elements. We would only need one
6498 // cast for the TFE result if a multiple of v2s16 was used.
6499 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6500 for (Register &Reg : ResultRegs)
6501 Reg = B.buildBitcast(Dst: V2S16, Src: Reg).getReg(Idx: 0);
6502 } else if (ST.hasUnpackedD16VMem()) {
6503 for (Register &Reg : ResultRegs)
6504 Reg = B.buildTrunc(Res: S16, Op: Reg).getReg(Idx: 0);
6505 }
6506 }
6507
6508 auto padWithUndef = [&](LLT Ty, int NumElts) {
6509 if (NumElts == 0)
6510 return;
6511 Register Undef = B.buildUndef(Res: Ty).getReg(Idx: 0);
6512 for (int I = 0; I != NumElts; ++I)
6513 ResultRegs.push_back(Elt: Undef);
6514 };
6515
6516 // Pad out any elements eliminated due to the dmask.
6517 LLT ResTy = MRI->getType(Reg: ResultRegs[0]);
6518 if (!ResTy.isVector()) {
6519 padWithUndef(ResTy, NumElts - ResultRegs.size());
6520 B.buildBuildVector(Res: DstReg, Ops: ResultRegs);
6521 return true;
6522 }
6523
6524 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6525 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6526
6527 // Deal with the one annoying legal case.
6528 const LLT V3S16 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 16);
6529 if (Ty == V3S16) {
6530 if (IsTFE) {
6531 if (ResultRegs.size() == 1) {
6532 NewResultReg = ResultRegs[0];
6533 } else if (ResultRegs.size() == 2) {
6534 LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
6535 NewResultReg = B.buildConcatVectors(Res: V4S16, Ops: ResultRegs).getReg(Idx: 0);
6536 } else {
6537 return false;
6538 }
6539 }
6540
6541 if (MRI->getType(Reg: DstReg).getNumElements() <
6542 MRI->getType(Reg: NewResultReg).getNumElements()) {
6543 B.buildDeleteTrailingVectorElements(Res: DstReg, Op0: NewResultReg);
6544 } else {
6545 B.buildPadVectorWithUndefElements(Res: DstReg, Op0: NewResultReg);
6546 }
6547 return true;
6548 }
6549
6550 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6551 B.buildConcatVectors(Res: DstReg, Ops: ResultRegs);
6552 return true;
6553}
6554
6555bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6556 MachineInstr &MI) const {
6557 MachineIRBuilder &B = Helper.MIRBuilder;
6558 GISelChangeObserver &Observer = Helper.Observer;
6559
6560 Register OrigDst = MI.getOperand(i: 0).getReg();
6561 Register Dst;
6562 LLT Ty = B.getMRI()->getType(Reg: OrigDst);
6563 unsigned Size = Ty.getSizeInBits();
6564 MachineFunction &MF = B.getMF();
6565 unsigned Opc = 0;
6566 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6567 assert(Size == 8 || Size == 16);
6568 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6569 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6570    // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6571 // destination register.
6572 Dst = B.getMRI()->createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
6573 } else {
6574 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6575 Dst = OrigDst;
6576 }
6577
6578 Observer.changingInstr(MI);
6579
6580 // Handle needing to s.buffer.load() a p8 value.
6581 if (hasBufferRsrcWorkaround(Ty)) {
6582 Ty = castBufferRsrcFromV4I32(MI, B, MRI&: *B.getMRI(), Idx: 0);
6583 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6584 }
6585 if (shouldBitcastLoadStoreType(ST, Ty, MemTy: LLT::scalar(SizeInBits: Size))) {
6586 Ty = getBitcastRegisterType(Ty);
6587 Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
6588 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6589 }
6590
6591 // FIXME: We don't really need this intermediate instruction. The intrinsic
6592 // should be fixed to have a memory operand. Since it's readnone, we're not
6593 // allowed to add one.
6594 MI.setDesc(B.getTII().get(Opcode: Opc));
6595 MI.removeOperand(OpNo: 1); // Remove intrinsic ID
6596
6597 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6598 // TODO: Should this use datalayout alignment?
6599 const unsigned MemSize = (Size + 7) / 8;
6600 const Align MemAlign(std::min(a: MemSize, b: 4u));
6601 MachineMemOperand *MMO = MF.getMachineMemOperand(
6602 PtrInfo: MachinePointerInfo(),
6603 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6604 MachineMemOperand::MOInvariant,
6605 Size: MemSize, BaseAlignment: MemAlign);
6606 MI.addMemOperand(MF, MO: MMO);
6607 if (Dst != OrigDst) {
6608 MI.getOperand(i: 0).setReg(Dst);
6609 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6610 B.buildTrunc(Res: OrigDst, Op: Dst);
6611 }
6612
6613 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6614 // always be legal. We may need to restore this to a 96-bit result if it turns
6615 // out this needs to be converted to a vector load during RegBankSelect.
6616 if (!isPowerOf2_32(Value: Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6617 if (Ty.isVector())
6618 Helper.moreElementsVectorDst(MI, MoreTy: getPow2VectorType(Ty), OpIdx: 0);
6619 else
6620 Helper.widenScalarDst(MI, WideTy: getPow2ScalarType(Ty), OpIdx: 0);
6621 }
6622
6623 Observer.changedInstr(MI);
6624 return true;
6625}
6626
6627// TODO: Move to selection
6628bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6629 MachineRegisterInfo &MRI,
6630 MachineIRBuilder &B) const {
6631 if (!ST.isTrapHandlerEnabled() ||
6632 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6633 return legalizeTrapEndpgm(MI, MRI, B);
6634
6635 return ST.supportsGetDoorbellID() ?
6636 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6637}
6638
6639bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6640 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6641 const DebugLoc &DL = MI.getDebugLoc();
6642 MachineBasicBlock &BB = B.getMBB();
6643 MachineFunction *MF = BB.getParent();
6644
6645 if (BB.succ_empty() && std::next(x: MI.getIterator()) == BB.end()) {
6646 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6647 .addImm(0);
6648 MI.eraseFromParent();
6649 return true;
6650 }
6651
6652 // We need a block split to make the real endpgm a terminator. We also don't
6653 // want to break phis in successor blocks, so we can't just delete to the
6654 // end of the block.
6655 BB.splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
6656 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6657 MF->push_back(MBB: TrapBB);
6658 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6659 .addImm(0);
6660 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6661 .addMBB(TrapBB);
6662
6663 BB.addSuccessor(Succ: TrapBB);
6664 MI.eraseFromParent();
6665 return true;
6666}
6667
6668bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6669 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6670 MachineFunction &MF = B.getMF();
6671 const LLT S64 = LLT::scalar(SizeInBits: 64);
6672
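  // The HSA trap handler ABI expects the queue pointer in SGPR0/SGPR1.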
6673 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6674 // For code object version 5, queue_ptr is passed through implicit kernarg.
6675 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
6676 AMDGPU::AMDHSA_COV5) {
6677 AMDGPUTargetLowering::ImplicitParameter Param =
6678 AMDGPUTargetLowering::QUEUE_PTR;
6679 uint64_t Offset =
6680 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
6681
6682 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6683 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
6684
6685 if (!loadInputValue(DstReg: KernargPtrReg, B,
6686 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6687 return false;
6688
6689 // TODO: can we be smarter about machine pointer info?
6690 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6691 MachineMemOperand *MMO = MF.getMachineMemOperand(
6692 PtrInfo,
6693 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6694 MachineMemOperand::MOInvariant,
6695 MemTy: LLT::scalar(SizeInBits: 64), base_alignment: commonAlignment(A: Align(64), Offset));
6696
6697 // Pointer address
6698 Register LoadAddr = MRI.createGenericVirtualRegister(
6699 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
6700 B.buildPtrAdd(Res: LoadAddr, Op0: KernargPtrReg,
6701 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
6702 // Load address
6703 Register Temp = B.buildLoad(Res: S64, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
6704 B.buildCopy(Res: SGPR01, Op: Temp);
6705 B.buildInstr(AMDGPU::S_TRAP)
6706 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6707 .addReg(SGPR01, RegState::Implicit);
6708 MI.eraseFromParent();
6709 return true;
6710 }
6711
6712 // Pass queue pointer to trap handler as input, and insert trap instruction
6713 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6714 Register LiveIn =
6715 MRI.createGenericVirtualRegister(Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
6716 if (!loadInputValue(DstReg: LiveIn, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
6717 return false;
6718
6719 B.buildCopy(Res: SGPR01, Op: LiveIn);
6720 B.buildInstr(AMDGPU::S_TRAP)
6721 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6722 .addReg(SGPR01, RegState::Implicit);
6723
6724 MI.eraseFromParent();
6725 return true;
6726}
6727
6728bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6729 MachineRegisterInfo &MRI,
6730 MachineIRBuilder &B) const {
6731 // We need to simulate the 's_trap 2' instruction on targets that run in
6732 // PRIV=1 (where it is treated as a nop).
6733 if (ST.hasPrivEnabledTrap2NopBug()) {
6734 ST.getInstrInfo()->insertSimulatedTrap(MRI, MBB&: B.getMBB(), MI,
6735 DL: MI.getDebugLoc());
6736 MI.eraseFromParent();
6737 return true;
6738 }
6739
6740 B.buildInstr(AMDGPU::S_TRAP)
6741 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6742 MI.eraseFromParent();
6743 return true;
6744}
6745
6746bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
6747 MachineRegisterInfo &MRI,
6748 MachineIRBuilder &B) const {
6749  // If this is a non-HSA path or the trap handler is disabled, report a
6750  // warning accordingly.
6751 if (!ST.isTrapHandlerEnabled() ||
6752 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6753 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6754 "debugtrap handler not supported",
6755 MI.getDebugLoc(), DS_Warning);
6756 LLVMContext &Ctx = B.getMF().getFunction().getContext();
6757 Ctx.diagnose(DI: NoTrap);
6758 } else {
6759 // Insert debug-trap instruction
6760 B.buildInstr(AMDGPU::S_TRAP)
6761 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6762 }
6763
6764 MI.eraseFromParent();
6765 return true;
6766}
6767
6768bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6769 MachineIRBuilder &B) const {
6770 MachineRegisterInfo &MRI = *B.getMRI();
6771 const LLT S16 = LLT::scalar(SizeInBits: 16);
6772 const LLT S32 = LLT::scalar(SizeInBits: 32);
6773 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6774 const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
6775
6776 Register DstReg = MI.getOperand(i: 0).getReg();
6777 Register NodePtr = MI.getOperand(i: 2).getReg();
6778 Register RayExtent = MI.getOperand(i: 3).getReg();
6779 Register RayOrigin = MI.getOperand(i: 4).getReg();
6780 Register RayDir = MI.getOperand(i: 5).getReg();
6781 Register RayInvDir = MI.getOperand(i: 6).getReg();
6782 Register TDescr = MI.getOperand(i: 7).getReg();
6783
6784 if (!ST.hasGFX10_AEncoding()) {
6785 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6786 "intrinsic not supported on subtarget",
6787 MI.getDebugLoc());
6788 B.getMF().getFunction().getContext().diagnose(DI: BadIntrin);
6789 return false;
6790 }
6791
6792 const bool IsGFX11 = AMDGPU::isGFX11(ST);
6793 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6794 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
6795 const bool IsA16 = MRI.getType(Reg: RayDir).getElementType().getSizeInBits() == 16;
6796 const bool Is64 = MRI.getType(Reg: NodePtr).getSizeInBits() == 64;
6797 const unsigned NumVDataDwords = 4;
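  // vaddr layout: node_ptr (1 or 2 dwords), ray_extent (1 dword), ray_origin
  // (3 dwords), then ray_dir and ray_inv_dir (3 dwords each, or 3 dwords total
  // when both are packed as 16-bit values).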
6798 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6799 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6800 const bool UseNSA =
6801 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6802
6803 const unsigned BaseOpcodes[2][2] = {
6804 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6805 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6806 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6807 int Opcode;
6808 if (UseNSA) {
6809 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6810 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6811 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
6812 : AMDGPU::MIMGEncGfx10NSA,
6813 NumVDataDwords, NumVAddrDwords);
6814 } else {
6815 assert(!IsGFX12Plus);
6816 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6817 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6818 : AMDGPU::MIMGEncGfx10Default,
6819 NumVDataDwords, NumVAddrDwords);
6820 }
6821 assert(Opcode != -1);
6822
6823 SmallVector<Register, 12> Ops;
6824 if (UseNSA && IsGFX11Plus) {
6825 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6826 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
6827 auto Merged = B.buildMergeLikeInstr(
6828 Res: V3S32, Ops: {Unmerge.getReg(Idx: 0), Unmerge.getReg(Idx: 1), Unmerge.getReg(Idx: 2)});
6829 Ops.push_back(Elt: Merged.getReg(Idx: 0));
6830 };
6831
6832 Ops.push_back(Elt: NodePtr);
6833 Ops.push_back(Elt: RayExtent);
6834 packLanes(RayOrigin);
6835
6836 if (IsA16) {
6837 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
6838 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
6839 auto MergedDir = B.buildMergeLikeInstr(
6840 Res: V3S32,
6841 Ops: {B.buildBitcast(
6842 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 0),
6843 UnmergeRayDir.getReg(Idx: 0)}))
6844 .getReg(Idx: 0),
6845 B.buildBitcast(
6846 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 1),
6847 UnmergeRayDir.getReg(Idx: 1)}))
6848 .getReg(Idx: 0),
6849 B.buildBitcast(
6850 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 2),
6851 UnmergeRayDir.getReg(Idx: 2)}))
6852 .getReg(Idx: 0)});
6853 Ops.push_back(Elt: MergedDir.getReg(Idx: 0));
6854 } else {
6855 packLanes(RayDir);
6856 packLanes(RayInvDir);
6857 }
6858 } else {
6859 if (Is64) {
6860 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: NodePtr);
6861 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
6862 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
6863 } else {
6864 Ops.push_back(Elt: NodePtr);
6865 }
6866 Ops.push_back(Elt: RayExtent);
6867
6868 auto packLanes = [&Ops, &S32, &B](Register Src) {
6869 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
6870 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
6871 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
6872 Ops.push_back(Elt: Unmerge.getReg(Idx: 2));
6873 };
6874
6875 packLanes(RayOrigin);
6876 if (IsA16) {
6877 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
6878 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
6879 Register R1 = MRI.createGenericVirtualRegister(Ty: S32);
6880 Register R2 = MRI.createGenericVirtualRegister(Ty: S32);
6881 Register R3 = MRI.createGenericVirtualRegister(Ty: S32);
6882 B.buildMergeLikeInstr(Res: R1,
6883 Ops: {UnmergeRayDir.getReg(Idx: 0), UnmergeRayDir.getReg(Idx: 1)});
6884 B.buildMergeLikeInstr(
6885 Res: R2, Ops: {UnmergeRayDir.getReg(Idx: 2), UnmergeRayInvDir.getReg(Idx: 0)});
6886 B.buildMergeLikeInstr(
6887 Res: R3, Ops: {UnmergeRayInvDir.getReg(Idx: 1), UnmergeRayInvDir.getReg(Idx: 2)});
6888 Ops.push_back(Elt: R1);
6889 Ops.push_back(Elt: R2);
6890 Ops.push_back(Elt: R3);
6891 } else {
6892 packLanes(RayDir);
6893 packLanes(RayInvDir);
6894 }
6895 }
6896
6897 if (!UseNSA) {
6898    // Build a single vector containing all the operands prepared so far.
6899 LLT OpTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarSizeInBits: 32);
6900 Register MergedOps = B.buildMergeLikeInstr(Res: OpTy, Ops).getReg(Idx: 0);
6901 Ops.clear();
6902 Ops.push_back(Elt: MergedOps);
6903 }
6904
6905 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
6906 .addDef(DstReg)
6907 .addImm(Opcode);
6908
6909 for (Register R : Ops) {
6910 MIB.addUse(R);
6911 }
6912
6913 MIB.addUse(TDescr)
6914 .addImm(IsA16 ? 1 : 0)
6915 .cloneMemRefs(MI);
6916
6917 MI.eraseFromParent();
6918 return true;
6919}
6920
6921bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
6922 MachineIRBuilder &B) const {
6923 unsigned Opc;
6924 int RoundMode = MI.getOperand(i: 2).getImm();
6925
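  // Only the towards-positive and towards-negative rounding modes have
  // dedicated pseudos; any other mode fails legalization here.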
6926 if (RoundMode == (int)RoundingMode::TowardPositive)
6927 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
6928 else if (RoundMode == (int)RoundingMode::TowardNegative)
6929 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
6930 else
6931 return false;
6932
6933 B.buildInstr(Opcode: Opc)
6934 .addDef(RegNo: MI.getOperand(i: 0).getReg())
6935 .addUse(RegNo: MI.getOperand(i: 1).getReg());
6936
6937 MI.eraseFromParent();
6938
6939 return true;
6940}
6941
6942bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
6943 MachineIRBuilder &B) const {
6944 const SITargetLowering *TLI = ST.getTargetLowering();
6945 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
6946 Register DstReg = MI.getOperand(i: 0).getReg();
6947 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
6948 MI.eraseFromParent();
6949 return true;
6950}
6951
6952bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
6953 MachineIRBuilder &B) const {
6954 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
6955 if (!ST.hasArchitectedSGPRs())
6956 return false;
6957 LLT S32 = LLT::scalar(SizeInBits: 32);
6958 Register DstReg = MI.getOperand(i: 0).getReg();
6959 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
6960 auto LSB = B.buildConstant(Res: S32, Val: 25);
6961 auto Width = B.buildConstant(Res: S32, Val: 5);
6962 B.buildUbfx(Dst: DstReg, Src: TTMP8, LSB, Width);
6963 MI.eraseFromParent();
6964 return true;
6965}
6966
6967static constexpr unsigned FPEnvModeBitField =
6968 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
6969
6970static constexpr unsigned FPEnvTrapBitField =
6971 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
6972
6973bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
6974 MachineRegisterInfo &MRI,
6975 MachineIRBuilder &B) const {
6976 Register Src = MI.getOperand(i: 0).getReg();
6977 if (MRI.getType(Reg: Src) != S64)
6978 return false;
6979
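  // The FP environment is returned with the MODE register bits in the low half
  // and the TRAPSTS bits in the high half, each read with s_getreg.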
6980 auto ModeReg =
6981 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
6982 /*HasSideEffects=*/true, /*isConvergent=*/false)
6983 .addImm(FPEnvModeBitField);
6984 auto TrapReg =
6985 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
6986 /*HasSideEffects=*/true, /*isConvergent=*/false)
6987 .addImm(FPEnvTrapBitField);
6988 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
6989 MI.eraseFromParent();
6990 return true;
6991}
6992
6993bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
6994 MachineRegisterInfo &MRI,
6995 MachineIRBuilder &B) const {
6996 Register Src = MI.getOperand(i: 0).getReg();
6997 if (MRI.getType(Reg: Src) != S64)
6998 return false;
6999
7000 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: MI.getOperand(i: 0));
7001 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7002 /*HasSideEffects=*/true, /*isConvergent=*/false)
7003 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7004 .addReg(Unmerge.getReg(0));
7005 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7006 /*HasSideEffects=*/true, /*isConvergent=*/false)
7007 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7008 .addReg(Unmerge.getReg(1));
7009 MI.eraseFromParent();
7010 return true;
7011}
7012
7013bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7014 MachineInstr &MI) const {
7015 MachineIRBuilder &B = Helper.MIRBuilder;
7016 MachineRegisterInfo &MRI = *B.getMRI();
7017
7018  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
7019 auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
7020 switch (IntrID) {
7021 case Intrinsic::amdgcn_if:
7022 case Intrinsic::amdgcn_else: {
7023 MachineInstr *Br = nullptr;
7024 MachineBasicBlock *UncondBrTarget = nullptr;
7025 bool Negated = false;
7026 if (MachineInstr *BrCond =
7027 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7028 const SIRegisterInfo *TRI
7029 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7030
7031 Register Def = MI.getOperand(i: 1).getReg();
7032 Register Use = MI.getOperand(i: 3).getReg();
7033
7034 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
7035
7036 if (Negated)
7037 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
7038
7039 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
7040 if (IntrID == Intrinsic::amdgcn_if) {
7041 B.buildInstr(AMDGPU::SI_IF)
7042 .addDef(Def)
7043 .addUse(Use)
7044 .addMBB(UncondBrTarget);
7045 } else {
7046 B.buildInstr(AMDGPU::SI_ELSE)
7047 .addDef(Def)
7048 .addUse(Use)
7049 .addMBB(UncondBrTarget);
7050 }
7051
7052 if (Br) {
7053 Br->getOperand(i: 0).setMBB(CondBrTarget);
7054 } else {
7055 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7056 // since we're swapping branch targets it needs to be reinserted.
7057 // FIXME: IRTranslator should probably not do this
7058 B.buildBr(Dest&: *CondBrTarget);
7059 }
7060
7061 MRI.setRegClass(Reg: Def, RC: TRI->getWaveMaskRegClass());
7062 MRI.setRegClass(Reg: Use, RC: TRI->getWaveMaskRegClass());
7063 MI.eraseFromParent();
7064 BrCond->eraseFromParent();
7065 return true;
7066 }
7067
7068 return false;
7069 }
7070 case Intrinsic::amdgcn_loop: {
7071 MachineInstr *Br = nullptr;
7072 MachineBasicBlock *UncondBrTarget = nullptr;
7073 bool Negated = false;
7074 if (MachineInstr *BrCond =
7075 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7076 const SIRegisterInfo *TRI
7077 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7078
7079 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
7080 Register Reg = MI.getOperand(i: 2).getReg();
7081
7082 if (Negated)
7083 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
7084
7085 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
7086 B.buildInstr(AMDGPU::SI_LOOP)
7087 .addUse(Reg)
7088 .addMBB(UncondBrTarget);
7089
7090 if (Br)
7091 Br->getOperand(i: 0).setMBB(CondBrTarget);
7092 else
7093 B.buildBr(Dest&: *CondBrTarget);
7094
7095 MI.eraseFromParent();
7096 BrCond->eraseFromParent();
7097 MRI.setRegClass(Reg, RC: TRI->getWaveMaskRegClass());
7098 return true;
7099 }
7100
7101 return false;
7102 }
7103 case Intrinsic::amdgcn_addrspacecast_nonnull:
7104 return legalizeAddrSpaceCast(MI, MRI, B);
7105 case Intrinsic::amdgcn_make_buffer_rsrc:
7106 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7107 case Intrinsic::amdgcn_kernarg_segment_ptr:
7108 if (!AMDGPU::isKernel(CC: B.getMF().getFunction().getCallingConv())) {
7109 // This only makes sense to call in a kernel, so just lower to null.
7110 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: 0);
7111 MI.eraseFromParent();
7112 return true;
7113 }
7114
7115 return legalizePreloadedArgIntrin(
7116 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7117 case Intrinsic::amdgcn_implicitarg_ptr:
7118 return legalizeImplicitArgPtr(MI, MRI, B);
7119 case Intrinsic::amdgcn_workitem_id_x:
7120 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 0,
7121 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7122 case Intrinsic::amdgcn_workitem_id_y:
7123 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 1,
7124 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7125 case Intrinsic::amdgcn_workitem_id_z:
7126 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 2,
7127 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7128 case Intrinsic::amdgcn_workgroup_id_x:
7129 return legalizePreloadedArgIntrin(MI, MRI, B,
7130 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7131 case Intrinsic::amdgcn_workgroup_id_y:
7132 return legalizePreloadedArgIntrin(MI, MRI, B,
7133 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7134 case Intrinsic::amdgcn_workgroup_id_z:
7135 return legalizePreloadedArgIntrin(MI, MRI, B,
7136 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7137 case Intrinsic::amdgcn_wave_id:
7138 return legalizeWaveID(MI, B);
7139 case Intrinsic::amdgcn_lds_kernel_id:
7140 return legalizePreloadedArgIntrin(MI, MRI, B,
7141 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7142 case Intrinsic::amdgcn_dispatch_ptr:
7143 return legalizePreloadedArgIntrin(MI, MRI, B,
7144 ArgType: AMDGPUFunctionArgInfo::DISPATCH_PTR);
7145 case Intrinsic::amdgcn_queue_ptr:
7146 return legalizePreloadedArgIntrin(MI, MRI, B,
7147 ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR);
7148 case Intrinsic::amdgcn_implicit_buffer_ptr:
7149 return legalizePreloadedArgIntrin(
7150 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7151 case Intrinsic::amdgcn_dispatch_id:
7152 return legalizePreloadedArgIntrin(MI, MRI, B,
7153 ArgType: AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
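  // llvm.amdgcn.wavefrontsize folds to a subtarget constant here: 64 on
  // wave64 targets and 32 on wave32 targets.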
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/false,
                               /*IsFormat=*/false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/false,
                               /*IsFormat=*/true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/true,
                               /*IsFormat=*/true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/false,
                              /*IsTyped=*/false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/true,
                              /*IsTyped=*/false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/true,
                              /*IsTyped=*/true);
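  // All raw/struct buffer atomic flavors below (including the *_ptr variants,
  // which take a buffer resource pointer rather than a <4 x i32> descriptor)
  // share a single legalization path; legalizeBufferAtomic is expected to
  // rewrite each to the corresponding G_AMDGPU_BUFFER_ATOMIC_* pseudo.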
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
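  // The SWMMAC sparsity index operand (operand 5 for the FP variants below,
  // operand 7 for the integer variants) is expected as s32; a narrower index,
  // e.g. s16, is widened in place with G_ANYEXT.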
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    Register Index = MI.getOperand(5).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    Register Index = MI.getOperand(7).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;

    // FIXME: This is to workaround the inability of tablegen match combiners to
    // match intrinsics in patterns.
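    // Illustrative effect of the rewrite (register names are hypothetical):
    //   %d:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %a, %b, %c
    // becomes
    //   %d:_(s32) = G_AMDGPU_FMED3 %a, %b, %c
    // i.e. the opcode changes and the intrinsic ID operand (operand 1) is
    // dropped.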
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}