//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit AMDGPU builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//
12
13#include "CGBuiltin.h"
14#include "clang/Basic/TargetBuiltins.h"
15#include "llvm/Analysis/ValueTracking.h"
16#include "llvm/IR/IntrinsicsAMDGPU.h"
17#include "llvm/IR/IntrinsicsR600.h"
18#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
19#include "llvm/Support/AMDGPUAddrSpace.h"
20
21using namespace clang;
22using namespace CodeGen;
23using namespace llvm;
24
25namespace {
26
// Emits an intrinsic whose second argument's type is also mangled into the
// intrinsic name, in addition to the first.
28static Value *
29emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E,
30 Intrinsic::ID IntrinsicID,
31 Intrinsic::ID ConstrainedIntrinsicID) {
32 llvm::Value *Src0 = CGF.EmitScalarExpr(E: E->getArg(Arg: 0));
33 llvm::Value *Src1 = CGF.EmitScalarExpr(E: E->getArg(Arg: 1));
34
35 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
36 if (CGF.Builder.getIsFPConstrained()) {
37 Function *F = CGF.CGM.getIntrinsic(IID: ConstrainedIntrinsicID,
38 Tys: {Src0->getType(), Src1->getType()});
39 return CGF.Builder.CreateConstrainedFPCall(Callee: F, Args: {Src0, Src1});
40 }
41
42 Function *F =
43 CGF.CGM.getIntrinsic(IID: IntrinsicID, Tys: {Src0->getType(), Src1->getType()});
44 return CGF.Builder.CreateCall(Callee: F, Args: {Src0, Src1});
45}
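// Usage sketch (illustrative): this helper backs the scalbn/ldexp-style
// builtins handled near the end of this file. In the default FP environment a
// call such as __builtin_scalbn(x, n) is expected to lower to roughly
//   %r = call double @llvm.ldexp.f64.i32(double %x, i32 %n)
// and, under strict FP semantics, to the corresponding
// llvm.experimental.constrained.ldexp call instead; value names here are
// hypothetical.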
46
// If \p E is not a null pointer, insert an address space cast if necessary to
// match the return type of \p E.
49Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
50 const CallExpr *E = nullptr) {
51 auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
52 auto *Call = CGF.Builder.CreateCall(F);
53 Call->addRetAttr(
54 Attribute::getWithDereferenceableBytes(Context&: Call->getContext(), Bytes: 64));
55 Call->addRetAttr(Attribute::getWithAlignment(Context&: Call->getContext(), Alignment: Align(4)));
56 if (!E)
57 return Call;
58 QualType BuiltinRetType = E->getType();
59 auto *RetTy = cast<llvm::PointerType>(Val: CGF.ConvertType(T: BuiltinRetType));
60 if (RetTy == Call->getType())
61 return Call;
62 return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
63}
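// Illustrative result (value names hypothetical): the call created above looks
// roughly like
//   %dp = call align 4 dereferenceable(64) ptr addrspace(4)
//             @llvm.amdgcn.dispatch.ptr()
// with an address space cast appended only when the builtin's return type uses
// a different address space.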
64
65Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
66 auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
67 auto *Call = CGF.Builder.CreateCall(F);
68 Call->addRetAttr(
69 Attribute::getWithDereferenceableBytes(Context&: Call->getContext(), Bytes: 256));
70 Call->addRetAttr(Attribute::getWithAlignment(Context&: Call->getContext(), Alignment: Align(8)));
71 return Call;
72}
73
/// \p Index is 0, 1, or 2 for the x, y, or z dimension, respectively.
/// Emit code based on the code object ABI version.
/// COV_4    : Emit code that uses the dispatch ptr.
/// COV_5+   : Emit code that uses the implicitarg ptr.
/// COV_NONE : Emit code that loads the global variable "__oclc_ABI_version"
///            and uses its value to select the COV_4 or COV_5+ approach. This
///            is used for compiling device libraries in an ABI-agnostic way.
81Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
82 llvm::LoadInst *LD;
83
84 auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
85
86 if (Cov == CodeObjectVersionKind::COV_None) {
87 StringRef Name = "__oclc_ABI_version";
88 auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
89 if (!ABIVersionC)
90 ABIVersionC = new llvm::GlobalVariable(
91 CGF.CGM.getModule(), CGF.Int32Ty, false,
92 llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
93 llvm::GlobalVariable::NotThreadLocal,
94 CGF.CGM.getContext().getTargetAddressSpace(AS: LangAS::opencl_constant));
95
    // This load will be eliminated by IPSCCP because it is constant weak_odr
    // without externally_initialized. Either changing it to weak or adding
    // externally_initialized will keep the load.
99 Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
100 CGF.CGM.getIntAlign());
101
102 Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
103 LHS: ABIVersion,
104 RHS: llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: CodeObjectVersionKind::COV_5));
105
106 // Indexing the implicit kernarg segment.
107 Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
108 Ty: CGF.Int8Ty, Ptr: EmitAMDGPUImplicitArgPtr(CGF), Idx0: 12 + Index * 2);
109
110 // Indexing the HSA kernel_dispatch_packet struct.
111 Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
112 Ty: CGF.Int8Ty, Ptr: EmitAMDGPUDispatchPtr(CGF), Idx0: 4 + Index * 2);
113
114 auto Result = CGF.Builder.CreateSelect(C: IsCOV5, True: ImplicitGEP, False: DispatchGEP);
115 LD = CGF.Builder.CreateLoad(
116 Addr: Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(Quantity: 2)));
117 } else {
118 Value *GEP = nullptr;
119 if (Cov >= CodeObjectVersionKind::COV_5) {
120 // Indexing the implicit kernarg segment.
121 GEP = CGF.Builder.CreateConstGEP1_32(
122 Ty: CGF.Int8Ty, Ptr: EmitAMDGPUImplicitArgPtr(CGF), Idx0: 12 + Index * 2);
123 } else {
124 // Indexing the HSA kernel_dispatch_packet struct.
125 GEP = CGF.Builder.CreateConstGEP1_32(
126 Ty: CGF.Int8Ty, Ptr: EmitAMDGPUDispatchPtr(CGF), Idx0: 4 + Index * 2);
127 }
128 LD = CGF.Builder.CreateLoad(
129 Addr: Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(Quantity: 2)));
130 }
131
132 llvm::MDBuilder MDHelper(CGF.getLLVMContext());
133 llvm::MDNode *RNode = MDHelper.createRange(Lo: APInt(16, 1),
134 Hi: APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
135 LD->setMetadata(KindID: llvm::LLVMContext::MD_range, Node: RNode);
136 LD->setMetadata(KindID: llvm::LLVMContext::MD_noundef,
137 Node: llvm::MDNode::get(Context&: CGF.getLLVMContext(), MDs: {}));
138 LD->setMetadata(KindID: llvm::LLVMContext::MD_invariant_load,
139 Node: llvm::MDNode::get(Context&: CGF.getLLVMContext(), MDs: {}));
140 return LD;
141}
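// Layout sketch implied by the GEPs above (byte offsets): the three 16-bit
// workgroup sizes live at offsets 4, 6, and 8 of the HSA
// kernel_dispatch_packet (COV_4) and at offsets 12, 14, and 16 of the implicit
// kernarg segment (COV_5+). The COV_NONE path selects between the two
// addresses at run time based on __oclc_ABI_version before doing a single i16
// load.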
142
// \p Index is 0, 1, or 2 for the x, y, or z dimension, respectively.
144Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
145 const unsigned XOffset = 12;
146 auto *DP = EmitAMDGPUDispatchPtr(CGF);
147 // Indexing the HSA kernel_dispatch_packet struct.
148 auto *Offset = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: XOffset + Index * 4);
149 auto *GEP = CGF.Builder.CreateGEP(Ty: CGF.Int8Ty, Ptr: DP, IdxList: Offset);
150 auto *LD = CGF.Builder.CreateLoad(
151 Addr: Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(Quantity: 4)));
152
153 llvm::MDBuilder MDB(CGF.getLLVMContext());
154
155 // Known non-zero.
156 LD->setMetadata(KindID: llvm::LLVMContext::MD_range,
157 Node: MDB.createRange(Lo: APInt(32, 1), Hi: APInt::getZero(numBits: 32)));
158 LD->setMetadata(KindID: llvm::LLVMContext::MD_invariant_load,
159 Node: llvm::MDNode::get(Context&: CGF.getLLVMContext(), MDs: {}));
160 return LD;
161}
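// Corresponding sketch for the grid size: the 32-bit grid_size_x/y/z fields
// are read at byte offsets 12, 16, and 20 of the dispatch packet
// (XOffset + Index * 4 above), and the range metadata records that a grid
// dimension is never zero.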
162} // namespace
163
// Generates the IR for the __builtin_amdgcn_read_exec* builtins by lowering
// them to the amdgcn.ballot intrinsic.
166static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
167 llvm::Type *RegisterType,
168 llvm::Type *ValueType, bool isExecHi) {
169 CodeGen::CGBuilderTy &Builder = CGF.Builder;
170 CodeGen::CodeGenModule &CGM = CGF.CGM;
171
172 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
173 llvm::Value *Call = Builder.CreateCall(Callee: F, Args: {Builder.getInt1(V: true)});
174
175 if (isExecHi) {
176 Value *Rt2 = Builder.CreateLShr(LHS: Call, RHS: 32);
177 Rt2 = Builder.CreateTrunc(V: Rt2, DestTy: CGF.Int32Ty);
178 return Rt2;
179 }
180
181 return Call;
182}
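// Illustrative lowering (value names hypothetical): __builtin_amdgcn_read_exec
// becomes roughly
//   %exec = call i64 @llvm.amdgcn.ballot.i64(i1 true)
// while __builtin_amdgcn_read_exec_hi additionally shifts the result right by
// 32 and truncates it to i32.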
183
// Emit an intrinsic that has one float or double operand and one integer
// operand.
185static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
186 const CallExpr *E,
187 unsigned IntrinsicID) {
188 llvm::Value *Src0 = CGF.EmitScalarExpr(E: E->getArg(Arg: 0));
189 llvm::Value *Src1 = CGF.EmitScalarExpr(E: E->getArg(Arg: 1));
190
191 Function *F = CGF.CGM.getIntrinsic(IID: IntrinsicID, Tys: Src0->getType());
192 return CGF.Builder.CreateCall(Callee: F, Args: {Src0, Src1});
193}
194
// For processing the memory ordering and memory scope arguments of various
// amdgcn builtins.
// \p Order takes a C++11-compatible memory-ordering specifier and converts it
// into LLVM's memory ordering specifier using the atomic C ABI, writing the
// result to \p AO. \p Scope takes a const char * and converts it into an
// AMDGCN-specific SyncScopeID, writing it to \p SSID.
201void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
202 llvm::AtomicOrdering &AO,
203 llvm::SyncScope::ID &SSID) {
204 int ord = cast<llvm::ConstantInt>(Val: Order)->getZExtValue();
205
206 // Map C11/C++11 memory ordering to LLVM memory ordering
207 assert(llvm::isValidAtomicOrderingCABI(ord));
208 switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
209 case llvm::AtomicOrderingCABI::acquire:
210 case llvm::AtomicOrderingCABI::consume:
211 AO = llvm::AtomicOrdering::Acquire;
212 break;
213 case llvm::AtomicOrderingCABI::release:
214 AO = llvm::AtomicOrdering::Release;
215 break;
216 case llvm::AtomicOrderingCABI::acq_rel:
217 AO = llvm::AtomicOrdering::AcquireRelease;
218 break;
219 case llvm::AtomicOrderingCABI::seq_cst:
220 AO = llvm::AtomicOrdering::SequentiallyConsistent;
221 break;
222 case llvm::AtomicOrderingCABI::relaxed:
223 AO = llvm::AtomicOrdering::Monotonic;
224 break;
225 }
226
227 // Some of the atomic builtins take the scope as a string name.
228 StringRef scp;
229 if (llvm::getConstantStringInfo(V: Scope, Str&: scp)) {
230 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: scp);
231 return;
232 }
233
234 // Older builtins had an enum argument for the memory scope.
235 int scope = cast<llvm::ConstantInt>(Val: Scope)->getZExtValue();
236 switch (scope) {
237 case 0: // __MEMORY_SCOPE_SYSTEM
238 SSID = llvm::SyncScope::System;
239 break;
240 case 1: // __MEMORY_SCOPE_DEVICE
241 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "agent");
242 break;
243 case 2: // __MEMORY_SCOPE_WRKGRP
244 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "workgroup");
245 break;
246 case 3: // __MEMORY_SCOPE_WVFRNT
247 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "wavefront");
248 break;
249 case 4: // __MEMORY_SCOPE_SINGLE
250 SSID = llvm::SyncScope::SingleThread;
251 break;
252 default:
253 SSID = llvm::SyncScope::System;
254 break;
255 }
256}
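// For example (illustrative), a call such as
//   __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent")
// maps __ATOMIC_ACQUIRE to llvm::AtomicOrdering::Acquire and the string
// "agent" directly to an "agent" sync scope, so the fence builtin handled
// later in this file emits roughly
//   fence syncscope("agent") acquire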
257
258llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
259 unsigned Idx,
260 const CallExpr *E) {
261 llvm::Value *Arg = nullptr;
262 if ((ICEArguments & (1 << Idx)) == 0) {
263 Arg = EmitScalarExpr(E: E->getArg(Arg: Idx));
264 } else {
265 // If this is required to be a constant, constant fold it so that we
266 // know that the generated intrinsic gets a ConstantInt.
267 std::optional<llvm::APSInt> Result =
268 E->getArg(Arg: Idx)->getIntegerConstantExpr(Ctx: getContext());
269 assert(Result && "Expected argument to be a constant");
270 Arg = llvm::ConstantInt::get(Context&: getLLVMContext(), V: *Result);
271 }
272 return Arg;
273}
274
275void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
276 const CallExpr *E) {
277 constexpr const char *Tag = "amdgpu-as";
278
279 LLVMContext &Ctx = Inst->getContext();
280 SmallVector<MMRAMetadata::TagT, 3> MMRAs;
281 for (unsigned K = 2; K < E->getNumArgs(); ++K) {
282 llvm::Value *V = EmitScalarExpr(E: E->getArg(Arg: K));
283 StringRef AS;
284 if (llvm::getConstantStringInfo(V, Str&: AS)) {
285 MMRAs.push_back(Elt: {Tag, AS});
286 // TODO: Delete the resulting unused constant?
287 continue;
288 }
289 CGM.Error(loc: E->getExprLoc(),
290 error: "expected an address space name as a string literal");
291 }
292
293 llvm::sort(C&: MMRAs);
294 MMRAs.erase(CS: llvm::unique(R&: MMRAs), CE: MMRAs.end());
295 Inst->setMetadata(KindID: LLVMContext::MD_mmra, Node: MMRAMetadata::getMD(Ctx, Tags: MMRAs));
296}
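// Illustrative use (assuming "local" is one of the accepted address space
// names): a call such as
//   __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent", "local")
// attaches !mmra metadata with the tag pair ("amdgpu-as", "local") to the
// emitted fence, limiting the fence to the named address space.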
297
298Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
299 const CallExpr *E) {
300 llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
301 llvm::SyncScope::ID SSID;
302 switch (BuiltinID) {
303 case AMDGPU::BI__builtin_amdgcn_div_scale:
304 case AMDGPU::BI__builtin_amdgcn_div_scalef: {
    // Translate from the intrinsic's struct return to the builtin's out
    // argument.
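    // Illustrative expansion (value names hypothetical): the intrinsic returns
    // a {double, i1} pair, e.g.
    //   %pair = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %x,
    //                                                          double %y, i1 %z)
    // the first element becomes the builtin's return value and the second is
    // zero-extended and stored through the bool* out argument.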
307
308 Address FlagOutPtr = EmitPointerWithAlignment(Addr: E->getArg(Arg: 3));
309
310 llvm::Value *X = EmitScalarExpr(E: E->getArg(Arg: 0));
311 llvm::Value *Y = EmitScalarExpr(E: E->getArg(Arg: 1));
312 llvm::Value *Z = EmitScalarExpr(E: E->getArg(Arg: 2));
313
314 llvm::Function *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
315 X->getType());
316
317 llvm::Value *Tmp = Builder.CreateCall(Callee, Args: {X, Y, Z});
318
319 llvm::Value *Result = Builder.CreateExtractValue(Agg: Tmp, Idxs: 0);
320 llvm::Value *Flag = Builder.CreateExtractValue(Agg: Tmp, Idxs: 1);
321
322 llvm::Type *RealFlagType = FlagOutPtr.getElementType();
323
324 llvm::Value *FlagExt = Builder.CreateZExt(V: Flag, DestTy: RealFlagType);
325 Builder.CreateStore(Val: FlagExt, Addr: FlagOutPtr);
326 return Result;
327 }
328 case AMDGPU::BI__builtin_amdgcn_div_fmas:
329 case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
330 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
331 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
332 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
333 llvm::Value *Src3 = EmitScalarExpr(E: E->getArg(Arg: 3));
334
335 llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
336 Src0->getType());
337 llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Arg: Src3);
338 return Builder.CreateCall(Callee: F, Args: {Src0, Src1, Src2, Src3ToBool});
339 }
340
341 case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
342 return emitBuiltinWithOneOverloadedType<2>(*this, E,
343 Intrinsic::amdgcn_ds_swizzle);
344 case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
345 case AMDGPU::BI__builtin_amdgcn_mov_dpp:
346 case AMDGPU::BI__builtin_amdgcn_update_dpp: {
347 llvm::SmallVector<llvm::Value *, 6> Args;
348 // Find out if any arguments are required to be integer constant
349 // expressions.
350 unsigned ICEArguments = 0;
351 ASTContext::GetBuiltinTypeError Error;
352 getContext().GetBuiltinType(ID: BuiltinID, Error, IntegerConstantArgs: &ICEArguments);
353 assert(Error == ASTContext::GE_None && "Should not codegen an error");
354 llvm::Type *DataTy = ConvertType(T: E->getArg(Arg: 0)->getType());
355 unsigned Size = DataTy->getPrimitiveSizeInBits();
356 llvm::Type *IntTy =
357 llvm::IntegerType::get(C&: Builder.getContext(), NumBits: std::max(a: Size, b: 32u));
358 Function *F =
359 CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
360 ? Intrinsic::amdgcn_mov_dpp8
361 : Intrinsic::amdgcn_update_dpp,
362 IntTy);
363 assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
364 E->getNumArgs() == 2);
365 bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
366 if (InsertOld)
367 Args.push_back(Elt: llvm::PoisonValue::get(T: IntTy));
368 for (unsigned I = 0; I != E->getNumArgs(); ++I) {
369 llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, Idx: I, E);
370 if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
371 Size < 32) {
372 if (!DataTy->isIntegerTy())
373 V = Builder.CreateBitCast(
374 V, DestTy: llvm::IntegerType::get(C&: Builder.getContext(), NumBits: Size));
375 V = Builder.CreateZExtOrBitCast(V, DestTy: IntTy);
376 }
377 llvm::Type *ExpTy =
378 F->getFunctionType()->getFunctionParamType(i: I + InsertOld);
379 Args.push_back(Elt: Builder.CreateTruncOrBitCast(V, DestTy: ExpTy));
380 }
381 Value *V = Builder.CreateCall(Callee: F, Args);
382 if (Size < 32 && !DataTy->isIntegerTy())
383 V = Builder.CreateTrunc(
384 V, DestTy: llvm::IntegerType::get(C&: Builder.getContext(), NumBits: Size));
385 return Builder.CreateTruncOrBitCast(V, DestTy: DataTy);
386 }
387 case AMDGPU::BI__builtin_amdgcn_permlane16:
388 case AMDGPU::BI__builtin_amdgcn_permlanex16:
389 return emitBuiltinWithOneOverloadedType<6>(
390 *this, E,
391 BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
392 ? Intrinsic::amdgcn_permlane16
393 : Intrinsic::amdgcn_permlanex16);
394 case AMDGPU::BI__builtin_amdgcn_permlane64:
395 return emitBuiltinWithOneOverloadedType<1>(*this, E,
396 Intrinsic::amdgcn_permlane64);
397 case AMDGPU::BI__builtin_amdgcn_readlane:
398 return emitBuiltinWithOneOverloadedType<2>(*this, E,
399 Intrinsic::amdgcn_readlane);
400 case AMDGPU::BI__builtin_amdgcn_readfirstlane:
401 return emitBuiltinWithOneOverloadedType<1>(*this, E,
402 Intrinsic::amdgcn_readfirstlane);
403 case AMDGPU::BI__builtin_amdgcn_div_fixup:
404 case AMDGPU::BI__builtin_amdgcn_div_fixupf:
405 case AMDGPU::BI__builtin_amdgcn_div_fixuph:
406 return emitBuiltinWithOneOverloadedType<3>(*this, E,
407 Intrinsic::amdgcn_div_fixup);
408 case AMDGPU::BI__builtin_amdgcn_trig_preop:
409 case AMDGPU::BI__builtin_amdgcn_trig_preopf:
410 return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
411 case AMDGPU::BI__builtin_amdgcn_rcp:
412 case AMDGPU::BI__builtin_amdgcn_rcpf:
413 case AMDGPU::BI__builtin_amdgcn_rcph:
414 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rcp);
415 case AMDGPU::BI__builtin_amdgcn_sqrt:
416 case AMDGPU::BI__builtin_amdgcn_sqrtf:
417 case AMDGPU::BI__builtin_amdgcn_sqrth:
418 return emitBuiltinWithOneOverloadedType<1>(*this, E,
419 Intrinsic::amdgcn_sqrt);
420 case AMDGPU::BI__builtin_amdgcn_rsq:
421 case AMDGPU::BI__builtin_amdgcn_rsqf:
422 case AMDGPU::BI__builtin_amdgcn_rsqh:
423 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq);
424 case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
425 case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
426 return emitBuiltinWithOneOverloadedType<1>(*this, E,
427 Intrinsic::amdgcn_rsq_clamp);
428 case AMDGPU::BI__builtin_amdgcn_sinf:
429 case AMDGPU::BI__builtin_amdgcn_sinh:
430 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin);
431 case AMDGPU::BI__builtin_amdgcn_cosf:
432 case AMDGPU::BI__builtin_amdgcn_cosh:
433 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos);
434 case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
435 return EmitAMDGPUDispatchPtr(CGF&: *this, E);
436 case AMDGPU::BI__builtin_amdgcn_logf:
437 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
438 case AMDGPU::BI__builtin_amdgcn_exp2f:
439 return emitBuiltinWithOneOverloadedType<1>(*this, E,
440 Intrinsic::amdgcn_exp2);
441 case AMDGPU::BI__builtin_amdgcn_log_clampf:
442 return emitBuiltinWithOneOverloadedType<1>(*this, E,
443 Intrinsic::amdgcn_log_clamp);
444 case AMDGPU::BI__builtin_amdgcn_ldexp:
445 case AMDGPU::BI__builtin_amdgcn_ldexpf: {
446 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
447 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
448 llvm::Function *F =
449 CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
450 return Builder.CreateCall(Callee: F, Args: {Src0, Src1});
451 }
452 case AMDGPU::BI__builtin_amdgcn_ldexph: {
    // The raw instruction has different behavior for out-of-bounds exponent
    // values (implicit truncation instead of saturating to short_min/short_max).
455 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
456 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
457 llvm::Function *F =
458 CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
459 return Builder.CreateCall(Callee: F, Args: {Src0, Builder.CreateTrunc(V: Src1, DestTy: Int16Ty)});
460 }
461 case AMDGPU::BI__builtin_amdgcn_frexp_mant:
462 case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
463 case AMDGPU::BI__builtin_amdgcn_frexp_manth:
464 return emitBuiltinWithOneOverloadedType<1>(*this, E,
465 Intrinsic::amdgcn_frexp_mant);
466 case AMDGPU::BI__builtin_amdgcn_frexp_exp:
467 case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
468 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
469 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
470 { Builder.getInt32Ty(), Src0->getType() });
471 return Builder.CreateCall(Callee: F, Args: Src0);
472 }
473 case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
474 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
475 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
476 { Builder.getInt16Ty(), Src0->getType() });
477 return Builder.CreateCall(Callee: F, Args: Src0);
478 }
479 case AMDGPU::BI__builtin_amdgcn_fract:
480 case AMDGPU::BI__builtin_amdgcn_fractf:
481 case AMDGPU::BI__builtin_amdgcn_fracth:
482 return emitBuiltinWithOneOverloadedType<1>(*this, E,
483 Intrinsic::amdgcn_fract);
484 case AMDGPU::BI__builtin_amdgcn_lerp:
485 return emitBuiltinWithOneOverloadedType<3>(*this, E,
486 Intrinsic::amdgcn_lerp);
487 case AMDGPU::BI__builtin_amdgcn_ubfe:
488 return emitBuiltinWithOneOverloadedType<3>(*this, E,
489 Intrinsic::amdgcn_ubfe);
490 case AMDGPU::BI__builtin_amdgcn_sbfe:
491 return emitBuiltinWithOneOverloadedType<3>(*this, E,
492 Intrinsic::amdgcn_sbfe);
493 case AMDGPU::BI__builtin_amdgcn_ballot_w32:
494 case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
495 llvm::Type *ResultType = ConvertType(E->getType());
496 llvm::Value *Src = EmitScalarExpr(E: E->getArg(Arg: 0));
497 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
498 return Builder.CreateCall(Callee: F, Args: { Src });
499 }
500 case AMDGPU::BI__builtin_amdgcn_uicmp:
501 case AMDGPU::BI__builtin_amdgcn_uicmpl:
502 case AMDGPU::BI__builtin_amdgcn_sicmp:
503 case AMDGPU::BI__builtin_amdgcn_sicmpl: {
504 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
505 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
506 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
507
508 // FIXME-GFX10: How should 32 bit mask be handled?
509 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
510 { Builder.getInt64Ty(), Src0->getType() });
511 return Builder.CreateCall(Callee: F, Args: { Src0, Src1, Src2 });
512 }
513 case AMDGPU::BI__builtin_amdgcn_fcmp:
514 case AMDGPU::BI__builtin_amdgcn_fcmpf: {
515 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
516 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
517 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
518
519 // FIXME-GFX10: How should 32 bit mask be handled?
520 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
521 { Builder.getInt64Ty(), Src0->getType() });
522 return Builder.CreateCall(Callee: F, Args: { Src0, Src1, Src2 });
523 }
524 case AMDGPU::BI__builtin_amdgcn_class:
525 case AMDGPU::BI__builtin_amdgcn_classf:
526 case AMDGPU::BI__builtin_amdgcn_classh:
527 return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
528 case AMDGPU::BI__builtin_amdgcn_fmed3f:
529 case AMDGPU::BI__builtin_amdgcn_fmed3h:
530 return emitBuiltinWithOneOverloadedType<3>(*this, E,
531 Intrinsic::amdgcn_fmed3);
532 case AMDGPU::BI__builtin_amdgcn_ds_append:
533 case AMDGPU::BI__builtin_amdgcn_ds_consume: {
534 Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
535 Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
536 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
537 Function *F = CGM.getIntrinsic(IID: Intrin, Tys: { Src0->getType() });
538 return Builder.CreateCall(Callee: F, Args: { Src0, Builder.getFalse() });
539 }
540 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
541 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
542 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
543 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
544 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
545 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
546 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
547 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
548 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
549 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
550 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
551 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
552 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
553 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
554 Intrinsic::ID IID;
555 switch (BuiltinID) {
556 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
557 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
558 IID = Intrinsic::amdgcn_global_load_tr_b64;
559 break;
560 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
561 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
562 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
563 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
564 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
565 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
566 IID = Intrinsic::amdgcn_global_load_tr_b128;
567 break;
568 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
569 IID = Intrinsic::amdgcn_ds_read_tr4_b64;
570 break;
571 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
572 IID = Intrinsic::amdgcn_ds_read_tr8_b64;
573 break;
574 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
575 IID = Intrinsic::amdgcn_ds_read_tr6_b96;
576 break;
577 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
578 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
579 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
580 IID = Intrinsic::amdgcn_ds_read_tr16_b64;
581 break;
582 }
583 llvm::Type *LoadTy = ConvertType(E->getType());
584 llvm::Value *Addr = EmitScalarExpr(E: E->getArg(Arg: 0));
585 llvm::Function *F = CGM.getIntrinsic(IID, Tys: {LoadTy});
586 return Builder.CreateCall(Callee: F, Args: {Addr});
587 }
588 case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
589 // Should this have asan instrumentation?
590 return emitBuiltinWithOneOverloadedType<5>(*this, E,
591 Intrinsic::amdgcn_load_to_lds);
592 }
593 case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
594 Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
595 {llvm::Type::getInt64Ty(getLLVMContext())});
596 return Builder.CreateCall(Callee: F);
597 }
598 case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
599 Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv,
600 {llvm::Type::getInt64Ty(getLLVMContext())});
601 llvm::Value *Env = EmitScalarExpr(E: E->getArg(Arg: 0));
602 return Builder.CreateCall(Callee: F, Args: {Env});
603 }
604 case AMDGPU::BI__builtin_amdgcn_read_exec:
605 return EmitAMDGCNBallotForExec(CGF&: *this, E, RegisterType: Int64Ty, ValueType: Int64Ty, isExecHi: false);
606 case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
607 return EmitAMDGCNBallotForExec(CGF&: *this, E, RegisterType: Int32Ty, ValueType: Int32Ty, isExecHi: false);
608 case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
609 return EmitAMDGCNBallotForExec(CGF&: *this, E, RegisterType: Int64Ty, ValueType: Int64Ty, isExecHi: true);
610 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
611 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
612 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
613 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
614 llvm::Value *NodePtr = EmitScalarExpr(E: E->getArg(Arg: 0));
615 llvm::Value *RayExtent = EmitScalarExpr(E: E->getArg(Arg: 1));
616 llvm::Value *RayOrigin = EmitScalarExpr(E: E->getArg(Arg: 2));
617 llvm::Value *RayDir = EmitScalarExpr(E: E->getArg(Arg: 3));
618 llvm::Value *RayInverseDir = EmitScalarExpr(E: E->getArg(Arg: 4));
619 llvm::Value *TextureDescr = EmitScalarExpr(E: E->getArg(Arg: 5));
620
621 // The builtins take these arguments as vec4 where the last element is
622 // ignored. The intrinsic takes them as vec3.
623 RayOrigin = Builder.CreateShuffleVector(V1: RayOrigin, V2: RayOrigin,
624 Mask: {0, 1, 2});
625 RayDir =
626 Builder.CreateShuffleVector(V1: RayDir, V2: RayDir, Mask: {0, 1, 2});
627 RayInverseDir = Builder.CreateShuffleVector(V1: RayInverseDir, V2: RayInverseDir,
628 Mask: {0, 1, 2});
629
630 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
631 {NodePtr->getType(), RayDir->getType()});
632 return Builder.CreateCall(Callee: F, Args: {NodePtr, RayExtent, RayOrigin, RayDir,
633 RayInverseDir, TextureDescr});
634 }
635 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
636 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
637 Intrinsic::ID IID;
638 switch (BuiltinID) {
639 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
640 IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
641 break;
642 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
643 IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
644 break;
645 }
646 llvm::Value *NodePtr = EmitScalarExpr(E: E->getArg(Arg: 0));
647 llvm::Value *RayExtent = EmitScalarExpr(E: E->getArg(Arg: 1));
648 llvm::Value *InstanceMask = EmitScalarExpr(E: E->getArg(Arg: 2));
649 llvm::Value *RayOrigin = EmitScalarExpr(E: E->getArg(Arg: 3));
650 llvm::Value *RayDir = EmitScalarExpr(E: E->getArg(Arg: 4));
651 llvm::Value *Offset = EmitScalarExpr(E: E->getArg(Arg: 5));
652 llvm::Value *TextureDescr = EmitScalarExpr(E: E->getArg(Arg: 6));
653
654 Address RetRayOriginPtr = EmitPointerWithAlignment(Addr: E->getArg(Arg: 7));
655 Address RetRayDirPtr = EmitPointerWithAlignment(Addr: E->getArg(Arg: 8));
656
657 llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);
658
659 llvm::CallInst *CI = Builder.CreateCall(
660 Callee: IntrinsicFunc, Args: {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
661 Offset, TextureDescr});
662
663 llvm::Value *RetVData = Builder.CreateExtractValue(Agg: CI, Idxs: 0);
664 llvm::Value *RetRayOrigin = Builder.CreateExtractValue(Agg: CI, Idxs: 1);
665 llvm::Value *RetRayDir = Builder.CreateExtractValue(Agg: CI, Idxs: 2);
666
667 Builder.CreateStore(Val: RetRayOrigin, Addr: RetRayOriginPtr);
668 Builder.CreateStore(Val: RetRayDir, Addr: RetRayDirPtr);
669
670 return RetVData;
671 }
672
673 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
674 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
675 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
676 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
677 Intrinsic::ID IID;
678 switch (BuiltinID) {
679 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
680 IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
681 break;
682 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
683 IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
684 break;
685 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
686 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
687 break;
688 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
689 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
690 break;
691 }
692
693 SmallVector<Value *, 4> Args;
694 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
695 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: i)));
696
697 Function *F = CGM.getIntrinsic(IID);
698 Value *Call = Builder.CreateCall(Callee: F, Args);
699 Value *Rtn = Builder.CreateExtractValue(Agg: Call, Idxs: 0);
700 Value *A = Builder.CreateExtractValue(Agg: Call, Idxs: 1);
701 llvm::Type *RetTy = ConvertType(E->getType());
702 Value *I0 = Builder.CreateInsertElement(Vec: PoisonValue::get(T: RetTy), NewElt: Rtn,
703 Idx: (uint64_t)0);
    // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
    // <2 x i64>, so zero-extend the second value.
706 if (A->getType()->getPrimitiveSizeInBits() <
707 RetTy->getScalarType()->getPrimitiveSizeInBits())
708 A = Builder.CreateZExt(V: A, DestTy: RetTy->getScalarType());
709
710 return Builder.CreateInsertElement(Vec: I0, NewElt: A, Idx: 1);
711 }
712 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
713 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
714 llvm::FixedVectorType *VT = FixedVectorType::get(ElementType: Builder.getInt32Ty(), NumElts: 8);
715 Function *F = CGM.getIntrinsic(
716 BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
717 ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
718 : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
719 {VT, VT});
720
721 SmallVector<Value *, 9> Args;
722 for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
723 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: I)));
724 return Builder.CreateCall(Callee: F, Args);
725 }
726 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
727 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
728 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
729 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
730 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
731 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
732 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
733 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
734 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
735 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
736 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
737 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
738 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
739 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
740 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
741 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
742 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
743 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
744 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
745 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
746 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
747 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
748 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
749 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
750 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
751 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
752 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
753 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
754 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
755 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
756 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
757 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
758 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
759 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
760 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
761 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
762 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
763 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
764 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
765 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
766 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
767 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
768 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
769 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
770 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
771 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
772 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
773 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
774 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
775 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
776 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
777 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
778 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
779 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
780 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
781 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
782 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
783 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
784 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
785 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
786
    // These operations perform a matrix multiplication and accumulation of
    // the form:
    //     D = A * B + C
    // We need to specify one type for the A/B matrices and one for the C/D
    // matrices. Sparse matrix operations can have different types for A and B
    // as well as an additional type for the sparsity index.
    // The destination type should be listed before the types used for the
    // source operands.
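    // For example (concrete types are illustrative): for
    // __builtin_amdgcn_wmma_f32_16x16x16_f16_w32 the index list {2, 0} chosen
    // below means the intrinsic is mangled first on the type of argument 2
    // (the C/D accumulator, e.g. <8 x float>) and then on the type of
    // argument 0 (the A/B operands, e.g. <16 x half>).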
794 SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
795 // On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
796 // There is no need for the variable opsel argument, so always set it to
797 // "false".
798 bool AppendFalseForOpselArg = false;
799 unsigned BuiltinWMMAOp;
800
801 switch (BuiltinID) {
802 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
803 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
804 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
805 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
806 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
807 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
808 break;
809 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
810 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
811 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
812 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
813 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
814 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
815 break;
816 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
817 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
818 AppendFalseForOpselArg = true;
819 [[fallthrough]];
820 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
821 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
822 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
823 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
824 break;
825 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
826 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
827 AppendFalseForOpselArg = true;
828 [[fallthrough]];
829 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
830 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
831 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
832 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
833 break;
834 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
835 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
836 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
837 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
838 break;
839 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
840 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
841 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
842 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
843 break;
844 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
845 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
846 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
847 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
848 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
849 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
850 break;
851 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
852 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
853 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
854 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
855 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
856 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
857 break;
858 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
859 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
860 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
861 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
862 break;
863 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
864 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
865 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
866 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
867 break;
868 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
869 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
870 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
871 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
872 break;
873 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
874 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
875 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
876 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
877 break;
878 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
879 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
880 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
881 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
882 break;
883 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
884 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
885 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
886 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
887 break;
888 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
889 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
890 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
891 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
892 break;
893 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
894 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
895 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
896 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
897 break;
898 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
899 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
900 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
901 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
902 break;
903 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
904 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
905 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
906 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
907 break;
908 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
909 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
910 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
911 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
912 break;
913 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
914 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
915 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
916 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
917 break;
918 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
919 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
920 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
921 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
922 break;
923 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
924 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
925 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
926 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
927 break;
928 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
929 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
930 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
931 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
932 break;
933 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
934 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
935 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
936 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
937 break;
938 }
939
940 SmallVector<Value *, 6> Args;
941 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
942 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: i)));
943 if (AppendFalseForOpselArg)
944 Args.push_back(Elt: Builder.getFalse());
945
946 SmallVector<llvm::Type *, 6> ArgTypes;
947 for (auto ArgIdx : ArgsForMatchingMatrixTypes)
948 ArgTypes.push_back(Elt: Args[ArgIdx]->getType());
949
950 Function *F = CGM.getIntrinsic(IID: BuiltinWMMAOp, Tys: ArgTypes);
951 return Builder.CreateCall(Callee: F, Args);
952 }
953 // amdgcn workgroup size
954 case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
955 return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 0);
956 case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
957 return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 1);
958 case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
959 return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 2);
960
961 // amdgcn grid size
962 case AMDGPU::BI__builtin_amdgcn_grid_size_x:
963 return EmitAMDGPUGridSize(CGF&: *this, Index: 0);
964 case AMDGPU::BI__builtin_amdgcn_grid_size_y:
965 return EmitAMDGPUGridSize(CGF&: *this, Index: 1);
966 case AMDGPU::BI__builtin_amdgcn_grid_size_z:
967 return EmitAMDGPUGridSize(CGF&: *this, Index: 2);
968
969 // r600 intrinsics
970 case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
971 case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
972 return emitBuiltinWithOneOverloadedType<1>(*this, E,
973 Intrinsic::r600_recipsqrt_ieee);
974 case AMDGPU::BI__builtin_amdgcn_alignbit: {
975 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
976 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
977 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
978 Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType());
979 return Builder.CreateCall(Callee: F, Args: { Src0, Src1, Src2 });
980 }
981 case AMDGPU::BI__builtin_amdgcn_fence: {
982 ProcessOrderScopeAMDGCN(Order: EmitScalarExpr(E: E->getArg(Arg: 0)),
983 Scope: EmitScalarExpr(E: E->getArg(Arg: 1)), AO, SSID);
984 FenceInst *Fence = Builder.CreateFence(Ordering: AO, SSID);
985 if (E->getNumArgs() > 2)
986 AddAMDGPUFenceAddressSpaceMMRA(Inst: Fence, E);
987 return Fence;
988 }
989 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
990 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
991 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
992 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
993 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
994 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
995 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
996 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
997 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
998 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
999 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1000 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1001 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1002 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1003 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1004 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1005 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1006 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1007 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1008 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1009 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1010 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1011 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
1012 llvm::AtomicRMWInst::BinOp BinOp;
1013 switch (BuiltinID) {
1014 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1015 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1016 BinOp = llvm::AtomicRMWInst::UIncWrap;
1017 break;
1018 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1019 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1020 BinOp = llvm::AtomicRMWInst::UDecWrap;
1021 break;
1022 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1023 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1024 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1025 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1026 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1027 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1028 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1029 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1030 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1031 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1032 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1033 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1034 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1035 BinOp = llvm::AtomicRMWInst::FAdd;
1036 break;
1037 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1038 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1039 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1040 BinOp = llvm::AtomicRMWInst::FMin;
1041 break;
1042 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1043 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
1044 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1045 BinOp = llvm::AtomicRMWInst::FMax;
1046 break;
1047 }
1048
1049 Address Ptr = CheckAtomicAlignment(CGF&: *this, E);
1050 Value *Val = EmitScalarExpr(E: E->getArg(Arg: 1));
1051 llvm::Type *OrigTy = Val->getType();
1052 QualType PtrTy = E->getArg(Arg: 0)->IgnoreImpCasts()->getType();
1053
1054 bool Volatile;
1055
1056 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
1057 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
1058 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
1059 // __builtin_amdgcn_ds_faddf/fminf/fmaxf has an explicit volatile argument
1060 Volatile =
1061 cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 4)))->getZExtValue();
1062 } else {
1063 // Infer volatile from the passed type.
1064 Volatile =
1065 PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
1066 }
1067
1068 if (E->getNumArgs() >= 4) {
1069 // Some of the builtins have explicit ordering and scope arguments.
1070 ProcessOrderScopeAMDGCN(Order: EmitScalarExpr(E: E->getArg(Arg: 2)),
1071 Scope: EmitScalarExpr(E: E->getArg(Arg: 3)), AO, SSID);
1072 } else {
1073 // Most of the builtins do not have syncscope/order arguments. For DS
1074 // atomics the scope doesn't really matter, as they implicitly operate at
1075 // workgroup scope.
1076 //
1077 // The global/flat cases need to use agent scope to consistently produce
1078 // the native instruction instead of a cmpxchg expansion.
1079 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "agent");
1080 AO = AtomicOrdering::Monotonic;
1081
      // The v2bf16 builtins use <2 x i16> instead of a natural bfloat vector
      // type.
1083 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
1084 BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
1085 BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
1086 llvm::Type *V2BF16Ty = FixedVectorType::get(
1087 ElementType: llvm::Type::getBFloatTy(C&: Builder.getContext()), NumElts: 2);
1088 Val = Builder.CreateBitCast(V: Val, DestTy: V2BF16Ty);
1089 }
1090 }
1091
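    // Putting the pieces together (illustrative, value names hypothetical),
    //   __builtin_amdgcn_global_atomic_fadd_f32(p, v)
    // is expected to become roughly
    //   %old = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent") monotonic
    // plus the !amdgpu.no.fine.grained.memory and (for float fadd)
    // !amdgpu.ignore.denormal.mode metadata attached below.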
1092 llvm::AtomicRMWInst *RMW =
1093 Builder.CreateAtomicRMW(Op: BinOp, Addr: Ptr, Val, Ordering: AO, SSID);
1094 if (Volatile)
1095 RMW->setVolatile(true);
1096
1097 unsigned AddrSpace = Ptr.getType()->getAddressSpace();
1098 if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
1099 // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
1100 // instruction for flat and global operations.
1101 llvm::MDTuple *EmptyMD = MDNode::get(Context&: getLLVMContext(), MDs: {});
1102 RMW->setMetadata(Kind: "amdgpu.no.fine.grained.memory", Node: EmptyMD);
1103
1104 // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
1105 // instruction, but this only matters for float fadd.
1106 if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
1107 RMW->setMetadata(Kind: "amdgpu.ignore.denormal.mode", Node: EmptyMD);
1108 }
1109
1110 return Builder.CreateBitCast(V: RMW, DestTy: OrigTy);
1111 }
1112 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
1113 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
1114 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
1115 llvm::Type *ResultType = ConvertType(E->getType());
1116 // s_sendmsg_rtn is mangled using return type only.
1117 Function *F =
1118 CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
1119 return Builder.CreateCall(Callee: F, Args: {Arg});
1120 }
1121 case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
1122 case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
    // Because builtin types are limited and the intrinsic returns a struct
    // (a pair of i32 values), marshal the result into a <2 x i32>.
1125 Value *VDstOld = EmitScalarExpr(E: E->getArg(Arg: 0));
1126 Value *VSrcOld = EmitScalarExpr(E: E->getArg(Arg: 1));
1127 Value *FI = EmitScalarExpr(E: E->getArg(Arg: 2));
1128 Value *BoundCtrl = EmitScalarExpr(E: E->getArg(Arg: 3));
1129 Function *F =
1130 CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
1131 ? Intrinsic::amdgcn_permlane16_swap
1132 : Intrinsic::amdgcn_permlane32_swap);
1133 llvm::CallInst *Call =
1134 Builder.CreateCall(Callee: F, Args: {VDstOld, VSrcOld, FI, BoundCtrl});
1135
1136 llvm::Value *Elt0 = Builder.CreateExtractValue(Agg: Call, Idxs: 0);
1137 llvm::Value *Elt1 = Builder.CreateExtractValue(Agg: Call, Idxs: 1);
1138
1139 llvm::Type *ResultType = ConvertType(E->getType());
1140
1141 llvm::Value *Insert0 = Builder.CreateInsertElement(
1142 Vec: llvm::PoisonValue::get(T: ResultType), NewElt: Elt0, UINT64_C(0));
1143 llvm::Value *AsVector =
1144 Builder.CreateInsertElement(Vec: Insert0, NewElt: Elt1, UINT64_C(1));
1145 return AsVector;
1146 }
1147 case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
1148 case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
1149 return emitBuiltinWithOneOverloadedType<4>(*this, E,
1150 Intrinsic::amdgcn_bitop3);
1151 case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
1152 // TODO: LLVM has this overloaded to allow for fat pointers, but since
1153 // those haven't been plumbed through to Clang yet, default to creating the
1154 // resource type.
1155 SmallVector<Value *, 4> Args;
1156 for (unsigned I = 0; I < 4; ++I)
1157 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: I)));
1158 llvm::PointerType *RetTy = llvm::PointerType::get(
1159 C&: Builder.getContext(), AddressSpace: llvm::AMDGPUAS::BUFFER_RESOURCE);
1160 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_make_buffer_rsrc,
1161 {RetTy, Args[0]->getType()});
1162 return Builder.CreateCall(Callee: F, Args);
1163 }
1164 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
1165 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
1166 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
1167 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64:
1168 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96:
1169 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
1170 return emitBuiltinWithOneOverloadedType<5>(
1171 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store);
1172 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1173 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1174 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1175 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1176 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1177 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
1178 llvm::Type *RetTy = nullptr;
1179 switch (BuiltinID) {
1180 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1181 RetTy = Int8Ty;
1182 break;
1183 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1184 RetTy = Int16Ty;
1185 break;
1186 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1187 RetTy = Int32Ty;
1188 break;
1189 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1190 RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 2);
1191 break;
1192 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1193 RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 3);
1194 break;
1195 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
1196 RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 4);
1197 break;
1198 }
1199 Function *F =
1200 CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load, RetTy);
1201 return Builder.CreateCall(
1202 Callee: F, Args: {EmitScalarExpr(E: E->getArg(Arg: 0)), EmitScalarExpr(E: E->getArg(Arg: 1)),
1203 EmitScalarExpr(E: E->getArg(Arg: 2)), EmitScalarExpr(E: E->getArg(Arg: 3))});
1204 }
1205 case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
1206 return emitBuiltinWithOneOverloadedType<2>(
1207 *this, E, Intrinsic::amdgcn_s_prefetch_data);
1208 case Builtin::BIlogbf:
1209 case Builtin::BI__builtin_logbf: {
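    // Math sketch for the expansion below (and its double counterpart in the
    // next case): for finite, nonzero x, frexp decomposes x as m * 2^ex with
    // 0.5 <= |m| < 1, so logb(x) == ex - 1. The selects then handle the
    // special cases: |x| == +inf yields +inf and x == 0 yields -inf, matching
    // the C library's logb semantics.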
1210 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
1211 Function *FrExpFunc = CGM.getIntrinsic(
1212 Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
1213 CallInst *FrExp = Builder.CreateCall(Callee: FrExpFunc, Args: Src0);
1214 Value *Exp = Builder.CreateExtractValue(Agg: FrExp, Idxs: 1);
1215 Value *Add = Builder.CreateAdd(
1216 LHS: Exp, RHS: ConstantInt::getSigned(Ty: Exp->getType(), V: -1), Name: "", HasNUW: false, HasNSW: true);
1217 Value *SIToFP = Builder.CreateSIToFP(V: Add, DestTy: Builder.getFloatTy());
1218 Value *Fabs =
1219 emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
1220 Value *FCmpONE = Builder.CreateFCmpONE(
1221 LHS: Fabs, RHS: ConstantFP::getInfinity(Ty: Builder.getFloatTy()));
1222 Value *Sel1 = Builder.CreateSelect(C: FCmpONE, True: SIToFP, False: Fabs);
1223 Value *FCmpOEQ =
1224 Builder.CreateFCmpOEQ(LHS: Src0, RHS: ConstantFP::getZero(Ty: Builder.getFloatTy()));
1225 Value *Sel2 = Builder.CreateSelect(
1226 C: FCmpOEQ,
1227 True: ConstantFP::getInfinity(Ty: Builder.getFloatTy(), /*Negative=*/true), False: Sel1);
1228 return Sel2;
1229 }
1230 case Builtin::BIlogb:
1231 case Builtin::BI__builtin_logb: {
1232 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
1233 Function *FrExpFunc = CGM.getIntrinsic(
1234 Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
1235 CallInst *FrExp = Builder.CreateCall(Callee: FrExpFunc, Args: Src0);
1236 Value *Exp = Builder.CreateExtractValue(Agg: FrExp, Idxs: 1);
1237 Value *Add = Builder.CreateAdd(
1238 LHS: Exp, RHS: ConstantInt::getSigned(Ty: Exp->getType(), V: -1), Name: "", HasNUW: false, HasNSW: true);
1239 Value *SIToFP = Builder.CreateSIToFP(V: Add, DestTy: Builder.getDoubleTy());
1240 Value *Fabs =
1241 emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
1242 Value *FCmpONE = Builder.CreateFCmpONE(
1243 LHS: Fabs, RHS: ConstantFP::getInfinity(Ty: Builder.getDoubleTy()));
1244 Value *Sel1 = Builder.CreateSelect(C: FCmpONE, True: SIToFP, False: Fabs);
1245 Value *FCmpOEQ =
1246 Builder.CreateFCmpOEQ(LHS: Src0, RHS: ConstantFP::getZero(Ty: Builder.getDoubleTy()));
1247 Value *Sel2 = Builder.CreateSelect(
1248 C: FCmpOEQ,
1249 True: ConstantFP::getInfinity(Ty: Builder.getDoubleTy(), /*Negative=*/true),
1250 False: Sel1);
1251 return Sel2;
1252 }
1253 case Builtin::BIscalbnf:
1254 case Builtin::BI__builtin_scalbnf:
1255 case Builtin::BIscalbn:
1256 case Builtin::BI__builtin_scalbn:
1257 return emitBinaryExpMaybeConstrainedFPBuiltin(
1258 *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp);
1259 default:
1260 return nullptr;
1261 }
1262}
1263

source code of clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp