//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit AMDGPU builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//
12
13#include "CGBuiltin.h"
14#include "clang/Basic/TargetBuiltins.h"
15#include "llvm/Analysis/ValueTracking.h"
16#include "llvm/IR/IntrinsicsAMDGPU.h"
17#include "llvm/IR/IntrinsicsR600.h"
18#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
19#include "llvm/Support/AMDGPUAddrSpace.h"
20
21using namespace clang;
22using namespace CodeGen;
23using namespace llvm;
24
25namespace {
26
// Emits an intrinsic whose second argument's type is also mangled into the
// intrinsic name, in addition to the first.
28static Value *
29emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E,
30 Intrinsic::ID IntrinsicID,
31 Intrinsic::ID ConstrainedIntrinsicID) {
32 llvm::Value *Src0 = CGF.EmitScalarExpr(E: E->getArg(Arg: 0));
33 llvm::Value *Src1 = CGF.EmitScalarExpr(E: E->getArg(Arg: 1));
34
35 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
36 if (CGF.Builder.getIsFPConstrained()) {
37 Function *F = CGF.CGM.getIntrinsic(IID: ConstrainedIntrinsicID,
38 Tys: {Src0->getType(), Src1->getType()});
39 return CGF.Builder.CreateConstrainedFPCall(Callee: F, Args: {Src0, Src1});
40 }
41
42 Function *F =
43 CGF.CGM.getIntrinsic(IID: IntrinsicID, Tys: {Src0->getType(), Src1->getType()});
44 return CGF.Builder.CreateCall(Callee: F, Args: {Src0, Src1});
45}
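// Usage sketch (illustrative): this helper backs the scalbn/ldexp-style
// builtins handled near the end of this file. In the default FP environment a
// call such as __builtin_scalbn(x, n) is expected to lower to roughly
//   %r = call double @llvm.ldexp.f64.i32(double %x, i32 %n)
// and, under strict FP semantics, to the corresponding
// llvm.experimental.constrained.ldexp call instead; value names here are
// hypothetical.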
46
// If \p E is not a null pointer, insert an address space cast if necessary to
// match the return type of \p E.
49Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
50 const CallExpr *E = nullptr) {
51 auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
52 auto *Call = CGF.Builder.CreateCall(F);
53 Call->addRetAttr(
54 Attribute::getWithDereferenceableBytes(Context&: Call->getContext(), Bytes: 64));
55 Call->addRetAttr(Attribute::getWithAlignment(Context&: Call->getContext(), Alignment: Align(4)));
56 if (!E)
57 return Call;
58 QualType BuiltinRetType = E->getType();
59 auto *RetTy = cast<llvm::PointerType>(Val: CGF.ConvertType(T: BuiltinRetType));
60 if (RetTy == Call->getType())
61 return Call;
62 return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
63}
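// Illustrative result (value names hypothetical): the call created above looks
// roughly like
//   %dp = call align 4 dereferenceable(64) ptr addrspace(4)
//             @llvm.amdgcn.dispatch.ptr()
// with an address space cast appended only when the builtin's return type uses
// a different address space.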
64
65Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
66 auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
67 auto *Call = CGF.Builder.CreateCall(F);
68 Call->addRetAttr(
69 Attribute::getWithDereferenceableBytes(Context&: Call->getContext(), Bytes: 256));
70 Call->addRetAttr(Attribute::getWithAlignment(Context&: Call->getContext(), Alignment: Align(8)));
71 return Call;
72}
73
/// \p Index is 0, 1, or 2 for the x, y, or z dimension, respectively.
/// Emit code based on the code object ABI version.
/// COV_4    : Emit code that uses the dispatch ptr.
/// COV_5+   : Emit code that uses the implicitarg ptr.
/// COV_NONE : Emit code that loads the global variable "__oclc_ABI_version"
///            and uses its value to select the COV_4 or COV_5+ approach. This
///            is used for compiling device libraries in an ABI-agnostic way.
81Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
82 llvm::LoadInst *LD;
83
84 auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
85
86 if (Cov == CodeObjectVersionKind::COV_None) {
87 StringRef Name = "__oclc_ABI_version";
88 auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
89 if (!ABIVersionC)
90 ABIVersionC = new llvm::GlobalVariable(
91 CGF.CGM.getModule(), CGF.Int32Ty, false,
92 llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
93 llvm::GlobalVariable::NotThreadLocal,
94 CGF.CGM.getContext().getTargetAddressSpace(AS: LangAS::opencl_constant));
95
    // This load will be eliminated by IPSCCP because it is constant weak_odr
    // without externally_initialized. Either changing it to weak or adding
    // externally_initialized will keep the load.
99 Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
100 CGF.CGM.getIntAlign());
101
102 Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
103 LHS: ABIVersion,
104 RHS: llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: CodeObjectVersionKind::COV_5));
105
106 // Indexing the implicit kernarg segment.
107 Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
108 Ty: CGF.Int8Ty, Ptr: EmitAMDGPUImplicitArgPtr(CGF), Idx0: 12 + Index * 2);
109
110 // Indexing the HSA kernel_dispatch_packet struct.
111 Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
112 Ty: CGF.Int8Ty, Ptr: EmitAMDGPUDispatchPtr(CGF), Idx0: 4 + Index * 2);
113
114 auto Result = CGF.Builder.CreateSelect(C: IsCOV5, True: ImplicitGEP, False: DispatchGEP);
115 LD = CGF.Builder.CreateLoad(
116 Addr: Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(Quantity: 2)));
117 } else {
118 Value *GEP = nullptr;
119 if (Cov >= CodeObjectVersionKind::COV_5) {
120 // Indexing the implicit kernarg segment.
121 GEP = CGF.Builder.CreateConstGEP1_32(
122 Ty: CGF.Int8Ty, Ptr: EmitAMDGPUImplicitArgPtr(CGF), Idx0: 12 + Index * 2);
123 } else {
124 // Indexing the HSA kernel_dispatch_packet struct.
125 GEP = CGF.Builder.CreateConstGEP1_32(
126 Ty: CGF.Int8Ty, Ptr: EmitAMDGPUDispatchPtr(CGF), Idx0: 4 + Index * 2);
127 }
128 LD = CGF.Builder.CreateLoad(
129 Addr: Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(Quantity: 2)));
130 }
131
132 llvm::MDBuilder MDHelper(CGF.getLLVMContext());
133 llvm::MDNode *RNode = MDHelper.createRange(Lo: APInt(16, 1),
134 Hi: APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
135 LD->setMetadata(KindID: llvm::LLVMContext::MD_range, Node: RNode);
136 LD->setMetadata(KindID: llvm::LLVMContext::MD_noundef,
137 Node: llvm::MDNode::get(Context&: CGF.getLLVMContext(), MDs: {}));
138 LD->setMetadata(KindID: llvm::LLVMContext::MD_invariant_load,
139 Node: llvm::MDNode::get(Context&: CGF.getLLVMContext(), MDs: {}));
140 return LD;
141}
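// Layout sketch implied by the GEPs above (byte offsets): the three 16-bit
// workgroup sizes live at offsets 4, 6, and 8 of the HSA
// kernel_dispatch_packet (COV_4) and at offsets 12, 14, and 16 of the implicit
// kernarg segment (COV_5+). The COV_NONE path selects between the two
// addresses at run time based on __oclc_ABI_version before doing a single i16
// load.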
142
// \p Index is 0, 1, or 2 for the x, y, or z dimension, respectively.
144Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
145 const unsigned XOffset = 12;
146 auto *DP = EmitAMDGPUDispatchPtr(CGF);
147 // Indexing the HSA kernel_dispatch_packet struct.
148 auto *Offset = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: XOffset + Index * 4);
149 auto *GEP = CGF.Builder.CreateGEP(Ty: CGF.Int8Ty, Ptr: DP, IdxList: Offset);
150 auto *LD = CGF.Builder.CreateLoad(
151 Addr: Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(Quantity: 4)));
152
153 llvm::MDBuilder MDB(CGF.getLLVMContext());
154
155 // Known non-zero.
156 LD->setMetadata(KindID: llvm::LLVMContext::MD_range,
157 Node: MDB.createRange(Lo: APInt(32, 1), Hi: APInt::getZero(numBits: 32)));
158 LD->setMetadata(KindID: llvm::LLVMContext::MD_invariant_load,
159 Node: llvm::MDNode::get(Context&: CGF.getLLVMContext(), MDs: {}));
160 return LD;
161}
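// Corresponding sketch for the grid size: the 32-bit grid_size_x/y/z fields
// are read at byte offsets 12, 16, and 20 of the dispatch packet
// (XOffset + Index * 4 above), and the range metadata records that a grid
// dimension is never zero.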
162} // namespace
163
// Generates the IR for the __builtin_amdgcn_read_exec* builtins by lowering
// them to the amdgcn.ballot intrinsic.
166static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
167 llvm::Type *RegisterType,
168 llvm::Type *ValueType, bool isExecHi) {
169 CodeGen::CGBuilderTy &Builder = CGF.Builder;
170 CodeGen::CodeGenModule &CGM = CGF.CGM;
171
172 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
173 llvm::Value *Call = Builder.CreateCall(Callee: F, Args: {Builder.getInt1(V: true)});
174
175 if (isExecHi) {
176 Value *Rt2 = Builder.CreateLShr(LHS: Call, RHS: 32);
177 Rt2 = Builder.CreateTrunc(V: Rt2, DestTy: CGF.Int32Ty);
178 return Rt2;
179 }
180
181 return Call;
182}
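// Illustrative lowering (value names hypothetical): __builtin_amdgcn_read_exec
// becomes roughly
//   %exec = call i64 @llvm.amdgcn.ballot.i64(i1 true)
// while __builtin_amdgcn_read_exec_hi additionally shifts the result right by
// 32 and truncates it to i32.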
183
// Emit an intrinsic that has one float or double operand and one integer
// operand.
185static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
186 const CallExpr *E,
187 unsigned IntrinsicID) {
188 llvm::Value *Src0 = CGF.EmitScalarExpr(E: E->getArg(Arg: 0));
189 llvm::Value *Src1 = CGF.EmitScalarExpr(E: E->getArg(Arg: 1));
190
191 Function *F = CGF.CGM.getIntrinsic(IID: IntrinsicID, Tys: Src0->getType());
192 return CGF.Builder.CreateCall(Callee: F, Args: {Src0, Src1});
193}
194
// For processing the memory ordering and memory scope arguments of various
// amdgcn builtins.
// \p Order takes a C++11-compatible memory-ordering specifier and converts it
// into LLVM's memory ordering specifier using the atomic C ABI, writing the
// result to \p AO. \p Scope takes a const char * and converts it into an
// AMDGCN-specific SyncScopeID, writing it to \p SSID.
201void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
202 llvm::AtomicOrdering &AO,
203 llvm::SyncScope::ID &SSID) {
204 int ord = cast<llvm::ConstantInt>(Val: Order)->getZExtValue();
205
206 // Map C11/C++11 memory ordering to LLVM memory ordering
207 assert(llvm::isValidAtomicOrderingCABI(ord));
208 switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
209 case llvm::AtomicOrderingCABI::acquire:
210 case llvm::AtomicOrderingCABI::consume:
211 AO = llvm::AtomicOrdering::Acquire;
212 break;
213 case llvm::AtomicOrderingCABI::release:
214 AO = llvm::AtomicOrdering::Release;
215 break;
216 case llvm::AtomicOrderingCABI::acq_rel:
217 AO = llvm::AtomicOrdering::AcquireRelease;
218 break;
219 case llvm::AtomicOrderingCABI::seq_cst:
220 AO = llvm::AtomicOrdering::SequentiallyConsistent;
221 break;
222 case llvm::AtomicOrderingCABI::relaxed:
223 AO = llvm::AtomicOrdering::Monotonic;
224 break;
225 }
226
227 // Some of the atomic builtins take the scope as a string name.
228 StringRef scp;
229 if (llvm::getConstantStringInfo(V: Scope, Str&: scp)) {
230 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: scp);
231 return;
232 }
233
234 // Older builtins had an enum argument for the memory scope.
235 int scope = cast<llvm::ConstantInt>(Val: Scope)->getZExtValue();
236 switch (scope) {
237 case 0: // __MEMORY_SCOPE_SYSTEM
238 SSID = llvm::SyncScope::System;
239 break;
240 case 1: // __MEMORY_SCOPE_DEVICE
241 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "agent");
242 break;
243 case 2: // __MEMORY_SCOPE_WRKGRP
244 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "workgroup");
245 break;
246 case 3: // __MEMORY_SCOPE_WVFRNT
247 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "wavefront");
248 break;
249 case 4: // __MEMORY_SCOPE_SINGLE
250 SSID = llvm::SyncScope::SingleThread;
251 break;
252 default:
253 SSID = llvm::SyncScope::System;
254 break;
255 }
256}
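// For example (illustrative), a call such as
//   __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent")
// maps __ATOMIC_ACQUIRE to llvm::AtomicOrdering::Acquire and the string
// "agent" directly to an "agent" sync scope, so the fence builtin handled
// later in this file emits roughly
//   fence syncscope("agent") acquire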
257
258llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
259 unsigned Idx,
260 const CallExpr *E) {
261 llvm::Value *Arg = nullptr;
262 if ((ICEArguments & (1 << Idx)) == 0) {
263 Arg = EmitScalarExpr(E: E->getArg(Arg: Idx));
264 } else {
265 // If this is required to be a constant, constant fold it so that we
266 // know that the generated intrinsic gets a ConstantInt.
267 std::optional<llvm::APSInt> Result =
268 E->getArg(Arg: Idx)->getIntegerConstantExpr(Ctx: getContext());
269 assert(Result && "Expected argument to be a constant");
270 Arg = llvm::ConstantInt::get(Context&: getLLVMContext(), V: *Result);
271 }
272 return Arg;
273}
274
275void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
276 const CallExpr *E) {
277 constexpr const char *Tag = "amdgpu-as";
278
279 LLVMContext &Ctx = Inst->getContext();
280 SmallVector<MMRAMetadata::TagT, 3> MMRAs;
281 for (unsigned K = 2; K < E->getNumArgs(); ++K) {
282 llvm::Value *V = EmitScalarExpr(E: E->getArg(Arg: K));
283 StringRef AS;
284 if (llvm::getConstantStringInfo(V, Str&: AS)) {
285 MMRAs.push_back(Elt: {Tag, AS});
286 // TODO: Delete the resulting unused constant?
287 continue;
288 }
289 CGM.Error(loc: E->getExprLoc(),
290 error: "expected an address space name as a string literal");
291 }
292
293 llvm::sort(C&: MMRAs);
294 MMRAs.erase(CS: llvm::unique(R&: MMRAs), CE: MMRAs.end());
295 Inst->setMetadata(KindID: LLVMContext::MD_mmra, Node: MMRAMetadata::getMD(Ctx, Tags: MMRAs));
296}
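// Illustrative use (assuming "local" is one of the accepted address space
// names): a call such as
//   __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent", "local")
// attaches !mmra metadata with the tag pair ("amdgpu-as", "local") to the
// emitted fence, limiting the fence to the named address space.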
297
298Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
299 const CallExpr *E) {
300 llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
301 llvm::SyncScope::ID SSID;
302 switch (BuiltinID) {
303 case AMDGPU::BI__builtin_amdgcn_div_scale:
304 case AMDGPU::BI__builtin_amdgcn_div_scalef: {
    // Translate from the intrinsic's struct return to the builtin's out
    // argument.
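    // Illustrative expansion (value names hypothetical): the intrinsic returns
    // a {double, i1} pair, e.g.
    //   %pair = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %x,
    //                                                          double %y, i1 %z)
    // the first element becomes the builtin's return value and the second is
    // zero-extended and stored through the bool* out argument.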
307
308 Address FlagOutPtr = EmitPointerWithAlignment(Addr: E->getArg(Arg: 3));
309
310 llvm::Value *X = EmitScalarExpr(E: E->getArg(Arg: 0));
311 llvm::Value *Y = EmitScalarExpr(E: E->getArg(Arg: 1));
312 llvm::Value *Z = EmitScalarExpr(E: E->getArg(Arg: 2));
313
314 llvm::Function *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
315 X->getType());
316
317 llvm::Value *Tmp = Builder.CreateCall(Callee, Args: {X, Y, Z});
318
319 llvm::Value *Result = Builder.CreateExtractValue(Agg: Tmp, Idxs: 0);
320 llvm::Value *Flag = Builder.CreateExtractValue(Agg: Tmp, Idxs: 1);
321
322 llvm::Type *RealFlagType = FlagOutPtr.getElementType();
323
324 llvm::Value *FlagExt = Builder.CreateZExt(V: Flag, DestTy: RealFlagType);
325 Builder.CreateStore(Val: FlagExt, Addr: FlagOutPtr);
326 return Result;
327 }
328 case AMDGPU::BI__builtin_amdgcn_div_fmas:
329 case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
330 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
331 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
332 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
333 llvm::Value *Src3 = EmitScalarExpr(E: E->getArg(Arg: 3));
334
335 llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
336 Src0->getType());
337 llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Arg: Src3);
338 return Builder.CreateCall(Callee: F, Args: {Src0, Src1, Src2, Src3ToBool});
339 }
340
341 case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
342 return emitBuiltinWithOneOverloadedType<2>(*this, E,
343 Intrinsic::amdgcn_ds_swizzle);
344 case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
345 case AMDGPU::BI__builtin_amdgcn_mov_dpp:
346 case AMDGPU::BI__builtin_amdgcn_update_dpp: {
347 llvm::SmallVector<llvm::Value *, 6> Args;
348 // Find out if any arguments are required to be integer constant
349 // expressions.
350 unsigned ICEArguments = 0;
351 ASTContext::GetBuiltinTypeError Error;
352 getContext().GetBuiltinType(ID: BuiltinID, Error, IntegerConstantArgs: &ICEArguments);
353 assert(Error == ASTContext::GE_None && "Should not codegen an error");
354 llvm::Type *DataTy = ConvertType(T: E->getArg(Arg: 0)->getType());
355 unsigned Size = DataTy->getPrimitiveSizeInBits();
356 llvm::Type *IntTy =
357 llvm::IntegerType::get(C&: Builder.getContext(), NumBits: std::max(a: Size, b: 32u));
358 Function *F =
359 CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
360 ? Intrinsic::amdgcn_mov_dpp8
361 : Intrinsic::amdgcn_update_dpp,
362 IntTy);
363 assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
364 E->getNumArgs() == 2);
365 bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
366 if (InsertOld)
367 Args.push_back(Elt: llvm::PoisonValue::get(T: IntTy));
368 for (unsigned I = 0; I != E->getNumArgs(); ++I) {
369 llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, Idx: I, E);
370 if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
371 Size < 32) {
372 if (!DataTy->isIntegerTy())
373 V = Builder.CreateBitCast(
374 V, DestTy: llvm::IntegerType::get(C&: Builder.getContext(), NumBits: Size));
375 V = Builder.CreateZExtOrBitCast(V, DestTy: IntTy);
376 }
377 llvm::Type *ExpTy =
378 F->getFunctionType()->getFunctionParamType(i: I + InsertOld);
379 Args.push_back(Elt: Builder.CreateTruncOrBitCast(V, DestTy: ExpTy));
380 }
381 Value *V = Builder.CreateCall(Callee: F, Args);
382 if (Size < 32 && !DataTy->isIntegerTy())
383 V = Builder.CreateTrunc(
384 V, DestTy: llvm::IntegerType::get(C&: Builder.getContext(), NumBits: Size));
385 return Builder.CreateTruncOrBitCast(V, DestTy: DataTy);
386 }
387 case AMDGPU::BI__builtin_amdgcn_permlane16:
388 case AMDGPU::BI__builtin_amdgcn_permlanex16:
389 return emitBuiltinWithOneOverloadedType<6>(
390 *this, E,
391 BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
392 ? Intrinsic::amdgcn_permlane16
393 : Intrinsic::amdgcn_permlanex16);
394 case AMDGPU::BI__builtin_amdgcn_permlane64:
395 return emitBuiltinWithOneOverloadedType<1>(*this, E,
396 Intrinsic::amdgcn_permlane64);
397 case AMDGPU::BI__builtin_amdgcn_readlane:
398 return emitBuiltinWithOneOverloadedType<2>(*this, E,
399 Intrinsic::amdgcn_readlane);
400 case AMDGPU::BI__builtin_amdgcn_readfirstlane:
401 return emitBuiltinWithOneOverloadedType<1>(*this, E,
402 Intrinsic::amdgcn_readfirstlane);
403 case AMDGPU::BI__builtin_amdgcn_div_fixup:
404 case AMDGPU::BI__builtin_amdgcn_div_fixupf:
405 case AMDGPU::BI__builtin_amdgcn_div_fixuph:
406 return emitBuiltinWithOneOverloadedType<3>(*this, E,
407 Intrinsic::amdgcn_div_fixup);
408 case AMDGPU::BI__builtin_amdgcn_trig_preop:
409 case AMDGPU::BI__builtin_amdgcn_trig_preopf:
410 return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
411 case AMDGPU::BI__builtin_amdgcn_rcp:
412 case AMDGPU::BI__builtin_amdgcn_rcpf:
413 case AMDGPU::BI__builtin_amdgcn_rcph:
414 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rcp);
415 case AMDGPU::BI__builtin_amdgcn_sqrt:
416 case AMDGPU::BI__builtin_amdgcn_sqrtf:
417 case AMDGPU::BI__builtin_amdgcn_sqrth:
418 return emitBuiltinWithOneOverloadedType<1>(*this, E,
419 Intrinsic::amdgcn_sqrt);
420 case AMDGPU::BI__builtin_amdgcn_rsq:
421 case AMDGPU::BI__builtin_amdgcn_rsqf:
422 case AMDGPU::BI__builtin_amdgcn_rsqh:
423 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq);
424 case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
425 case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
426 return emitBuiltinWithOneOverloadedType<1>(*this, E,
427 Intrinsic::amdgcn_rsq_clamp);
428 case AMDGPU::BI__builtin_amdgcn_sinf:
429 case AMDGPU::BI__builtin_amdgcn_sinh:
430 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin);
431 case AMDGPU::BI__builtin_amdgcn_cosf:
432 case AMDGPU::BI__builtin_amdgcn_cosh:
433 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos);
434 case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
435 return EmitAMDGPUDispatchPtr(CGF&: *this, E);
436 case AMDGPU::BI__builtin_amdgcn_logf:
437 return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
438 case AMDGPU::BI__builtin_amdgcn_exp2f:
439 return emitBuiltinWithOneOverloadedType<1>(*this, E,
440 Intrinsic::amdgcn_exp2);
441 case AMDGPU::BI__builtin_amdgcn_log_clampf:
442 return emitBuiltinWithOneOverloadedType<1>(*this, E,
443 Intrinsic::amdgcn_log_clamp);
444 case AMDGPU::BI__builtin_amdgcn_ldexp:
445 case AMDGPU::BI__builtin_amdgcn_ldexpf: {
446 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
447 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
448 llvm::Function *F =
449 CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
450 return Builder.CreateCall(Callee: F, Args: {Src0, Src1});
451 }
452 case AMDGPU::BI__builtin_amdgcn_ldexph: {
    // The raw instruction has different behavior for out-of-bounds exponent
    // values (implicit truncation instead of saturating to short_min/short_max).
455 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
456 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
457 llvm::Function *F =
458 CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
459 return Builder.CreateCall(Callee: F, Args: {Src0, Builder.CreateTrunc(V: Src1, DestTy: Int16Ty)});
460 }
461 case AMDGPU::BI__builtin_amdgcn_frexp_mant:
462 case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
463 case AMDGPU::BI__builtin_amdgcn_frexp_manth:
464 return emitBuiltinWithOneOverloadedType<1>(*this, E,
465 Intrinsic::amdgcn_frexp_mant);
466 case AMDGPU::BI__builtin_amdgcn_frexp_exp:
467 case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
468 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
469 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
470 { Builder.getInt32Ty(), Src0->getType() });
471 return Builder.CreateCall(Callee: F, Args: Src0);
472 }
473 case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
474 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
475 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
476 { Builder.getInt16Ty(), Src0->getType() });
477 return Builder.CreateCall(Callee: F, Args: Src0);
478 }
479 case AMDGPU::BI__builtin_amdgcn_fract:
480 case AMDGPU::BI__builtin_amdgcn_fractf:
481 case AMDGPU::BI__builtin_amdgcn_fracth:
482 return emitBuiltinWithOneOverloadedType<1>(*this, E,
483 Intrinsic::amdgcn_fract);
484 case AMDGPU::BI__builtin_amdgcn_lerp:
485 return emitBuiltinWithOneOverloadedType<3>(*this, E,
486 Intrinsic::amdgcn_lerp);
487 case AMDGPU::BI__builtin_amdgcn_ubfe:
488 return emitBuiltinWithOneOverloadedType<3>(*this, E,
489 Intrinsic::amdgcn_ubfe);
490 case AMDGPU::BI__builtin_amdgcn_sbfe:
491 return emitBuiltinWithOneOverloadedType<3>(*this, E,
492 Intrinsic::amdgcn_sbfe);
493 case AMDGPU::BI__builtin_amdgcn_ballot_w32:
494 case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
495 llvm::Type *ResultType = ConvertType(E->getType());
496 llvm::Value *Src = EmitScalarExpr(E: E->getArg(Arg: 0));
497 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
498 return Builder.CreateCall(Callee: F, Args: { Src });
499 }
500 case AMDGPU::BI__builtin_amdgcn_uicmp:
501 case AMDGPU::BI__builtin_amdgcn_uicmpl:
502 case AMDGPU::BI__builtin_amdgcn_sicmp:
503 case AMDGPU::BI__builtin_amdgcn_sicmpl: {
504 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
505 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
506 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
507
508 // FIXME-GFX10: How should 32 bit mask be handled?
509 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
510 { Builder.getInt64Ty(), Src0->getType() });
511 return Builder.CreateCall(Callee: F, Args: { Src0, Src1, Src2 });
512 }
513 case AMDGPU::BI__builtin_amdgcn_fcmp:
514 case AMDGPU::BI__builtin_amdgcn_fcmpf: {
515 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
516 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
517 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
518
519 // FIXME-GFX10: How should 32 bit mask be handled?
520 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
521 { Builder.getInt64Ty(), Src0->getType() });
522 return Builder.CreateCall(Callee: F, Args: { Src0, Src1, Src2 });
523 }
524 case AMDGPU::BI__builtin_amdgcn_class:
525 case AMDGPU::BI__builtin_amdgcn_classf:
526 case AMDGPU::BI__builtin_amdgcn_classh:
527 return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
528 case AMDGPU::BI__builtin_amdgcn_fmed3f:
529 case AMDGPU::BI__builtin_amdgcn_fmed3h:
530 return emitBuiltinWithOneOverloadedType<3>(*this, E,
531 Intrinsic::amdgcn_fmed3);
532 case AMDGPU::BI__builtin_amdgcn_ds_append:
533 case AMDGPU::BI__builtin_amdgcn_ds_consume: {
534 Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
535 Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
536 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
537 Function *F = CGM.getIntrinsic(IID: Intrin, Tys: { Src0->getType() });
538 return Builder.CreateCall(Callee: F, Args: { Src0, Builder.getFalse() });
539 }
540 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
541 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
542 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
543 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
544 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
545 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
546 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
547 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
548 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
549 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
550 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
551 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
552 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
553 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
554 Intrinsic::ID IID;
555 switch (BuiltinID) {
556 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
557 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
558 IID = Intrinsic::amdgcn_global_load_tr_b64;
559 break;
560 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
561 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
562 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
563 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
564 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
565 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
566 IID = Intrinsic::amdgcn_global_load_tr_b128;
567 break;
568 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
569 IID = Intrinsic::amdgcn_ds_read_tr4_b64;
570 break;
571 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
572 IID = Intrinsic::amdgcn_ds_read_tr8_b64;
573 break;
574 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
575 IID = Intrinsic::amdgcn_ds_read_tr6_b96;
576 break;
577 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
578 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
579 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
580 IID = Intrinsic::amdgcn_ds_read_tr16_b64;
581 break;
582 }
583 llvm::Type *LoadTy = ConvertType(E->getType());
584 llvm::Value *Addr = EmitScalarExpr(E: E->getArg(Arg: 0));
585 llvm::Function *F = CGM.getIntrinsic(IID, Tys: {LoadTy});
586 return Builder.CreateCall(Callee: F, Args: {Addr});
587 }
588 case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
589 // Should this have asan instrumentation?
590 return emitBuiltinWithOneOverloadedType<5>(*this, E,
591 Intrinsic::amdgcn_load_to_lds);
592 }
593 case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
594 Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
595 {llvm::Type::getInt64Ty(getLLVMContext())});
596 return Builder.CreateCall(Callee: F);
597 }
598 case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
599 Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv,
600 {llvm::Type::getInt64Ty(getLLVMContext())});
601 llvm::Value *Env = EmitScalarExpr(E: E->getArg(Arg: 0));
602 return Builder.CreateCall(Callee: F, Args: {Env});
603 }
604 case AMDGPU::BI__builtin_amdgcn_read_exec:
605 return EmitAMDGCNBallotForExec(CGF&: *this, E, RegisterType: Int64Ty, ValueType: Int64Ty, isExecHi: false);
606 case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
607 return EmitAMDGCNBallotForExec(CGF&: *this, E, RegisterType: Int32Ty, ValueType: Int32Ty, isExecHi: false);
608 case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
609 return EmitAMDGCNBallotForExec(CGF&: *this, E, RegisterType: Int64Ty, ValueType: Int64Ty, isExecHi: true);
610 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
611 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
612 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
613 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
614 llvm::Value *NodePtr = EmitScalarExpr(E: E->getArg(Arg: 0));
615 llvm::Value *RayExtent = EmitScalarExpr(E: E->getArg(Arg: 1));
616 llvm::Value *RayOrigin = EmitScalarExpr(E: E->getArg(Arg: 2));
617 llvm::Value *RayDir = EmitScalarExpr(E: E->getArg(Arg: 3));
618 llvm::Value *RayInverseDir = EmitScalarExpr(E: E->getArg(Arg: 4));
619 llvm::Value *TextureDescr = EmitScalarExpr(E: E->getArg(Arg: 5));
620
621 // The builtins take these arguments as vec4 where the last element is
622 // ignored. The intrinsic takes them as vec3.
623 RayOrigin = Builder.CreateShuffleVector(V1: RayOrigin, V2: RayOrigin,
624 Mask: {0, 1, 2});
625 RayDir =
626 Builder.CreateShuffleVector(V1: RayDir, V2: RayDir, Mask: {0, 1, 2});
627 RayInverseDir = Builder.CreateShuffleVector(V1: RayInverseDir, V2: RayInverseDir,
628 Mask: {0, 1, 2});
629
630 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
631 {NodePtr->getType(), RayDir->getType()});
632 return Builder.CreateCall(Callee: F, Args: {NodePtr, RayExtent, RayOrigin, RayDir,
633 RayInverseDir, TextureDescr});
634 }
635 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
636 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
637 Intrinsic::ID IID;
638 switch (BuiltinID) {
639 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
640 IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
641 break;
642 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
643 IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
644 break;
645 }
646 llvm::Value *NodePtr = EmitScalarExpr(E: E->getArg(Arg: 0));
647 llvm::Value *RayExtent = EmitScalarExpr(E: E->getArg(Arg: 1));
648 llvm::Value *InstanceMask = EmitScalarExpr(E: E->getArg(Arg: 2));
649 llvm::Value *RayOrigin = EmitScalarExpr(E: E->getArg(Arg: 3));
650 llvm::Value *RayDir = EmitScalarExpr(E: E->getArg(Arg: 4));
651 llvm::Value *Offset = EmitScalarExpr(E: E->getArg(Arg: 5));
652 llvm::Value *TextureDescr = EmitScalarExpr(E: E->getArg(Arg: 6));
653
654 Address RetRayOriginPtr = EmitPointerWithAlignment(Addr: E->getArg(Arg: 7));
655 Address RetRayDirPtr = EmitPointerWithAlignment(Addr: E->getArg(Arg: 8));
656
657 llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);
658
659 llvm::CallInst *CI = Builder.CreateCall(
660 Callee: IntrinsicFunc, Args: {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
661 Offset, TextureDescr});
662
663 llvm::Value *RetVData = Builder.CreateExtractValue(Agg: CI, Idxs: 0);
664 llvm::Value *RetRayOrigin = Builder.CreateExtractValue(Agg: CI, Idxs: 1);
665 llvm::Value *RetRayDir = Builder.CreateExtractValue(Agg: CI, Idxs: 2);
666
667 Builder.CreateStore(Val: RetRayOrigin, Addr: RetRayOriginPtr);
668 Builder.CreateStore(Val: RetRayDir, Addr: RetRayDirPtr);
669
670 return RetVData;
671 }
672
673 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
674 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
675 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
676 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
677 Intrinsic::ID IID;
678 switch (BuiltinID) {
679 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
680 IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
681 break;
682 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
683 IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
684 break;
685 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
686 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
687 break;
688 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
689 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
690 break;
691 }
692
693 SmallVector<Value *, 4> Args;
694 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
695 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: i)));
696
697 Function *F = CGM.getIntrinsic(IID);
698 Value *Call = Builder.CreateCall(Callee: F, Args);
699 Value *Rtn = Builder.CreateExtractValue(Agg: Call, Idxs: 0);
700 Value *A = Builder.CreateExtractValue(Agg: Call, Idxs: 1);
701 llvm::Type *RetTy = ConvertType(E->getType());
702 Value *I0 = Builder.CreateInsertElement(Vec: PoisonValue::get(T: RetTy), NewElt: Rtn,
703 Idx: (uint64_t)0);
    // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
    // <2 x i64>, so zero-extend the second value.
706 if (A->getType()->getPrimitiveSizeInBits() <
707 RetTy->getScalarType()->getPrimitiveSizeInBits())
708 A = Builder.CreateZExt(V: A, DestTy: RetTy->getScalarType());
709
710 return Builder.CreateInsertElement(Vec: I0, NewElt: A, Idx: 1);
711 }
712 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
713 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
714 llvm::FixedVectorType *VT = FixedVectorType::get(ElementType: Builder.getInt32Ty(), NumElts: 8);
715 Function *F = CGM.getIntrinsic(
716 BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
717 ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
718 : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
719 {VT, VT});
720
721 SmallVector<Value *, 9> Args;
722 for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
723 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: I)));
724 return Builder.CreateCall(Callee: F, Args);
725 }
726 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
727 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
728 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
729 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
730 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
731 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
732 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
733 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
734 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
735 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
736 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
737 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
738 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
739 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
740 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
741 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
742 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
743 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
744 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
745 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
746 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
747 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
748 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
749 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
750 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
751 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
752 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
753 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
754 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
755 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
756 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
757 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
758 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
759 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
760 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
761 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
762 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
763 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
764 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
765 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
766 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
767 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
768 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
769 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
770 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
771 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
772 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
773 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
774 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
775 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
776 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
777 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
778 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
779 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
780 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
781 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
782 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
783 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
784 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
785 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
786
    // These operations perform a matrix multiplication and accumulation of
    // the form:
    //     D = A * B + C
    // We need to specify one type for the A/B matrices and one for the C/D
    // matrices. Sparse matrix operations can have different types for A and B
    // as well as an additional type for the sparsity index.
    // The destination type should be listed before the types used for the
    // source operands.
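    // For example (concrete types are illustrative): for
    // __builtin_amdgcn_wmma_f32_16x16x16_f16_w32 the index list {2, 0} chosen
    // below means the intrinsic is mangled first on the type of argument 2
    // (the C/D accumulator, e.g. <8 x float>) and then on the type of
    // argument 0 (the A/B operands, e.g. <16 x half>).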
794 SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
795 // On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
796 // There is no need for the variable opsel argument, so always set it to
797 // "false".
798 bool AppendFalseForOpselArg = false;
799 unsigned BuiltinWMMAOp;
800
801 switch (BuiltinID) {
802 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
803 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
804 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
805 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
806 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
807 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
808 break;
809 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
810 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
811 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
812 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
813 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
814 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
815 break;
816 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
817 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
818 AppendFalseForOpselArg = true;
819 [[fallthrough]];
820 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
821 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
822 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
823 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
824 break;
825 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
826 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
827 AppendFalseForOpselArg = true;
828 [[fallthrough]];
829 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
830 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
831 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
832 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
833 break;
834 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
835 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
836 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
837 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
838 break;
839 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
840 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
841 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
842 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
843 break;
844 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
845 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
846 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
847 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
848 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
849 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
850 break;
851 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
852 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
853 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
854 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
855 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
856 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
857 break;
858 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
859 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
860 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
861 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
862 break;
863 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
864 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
865 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
866 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
867 break;
868 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
869 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
870 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
871 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
872 break;
873 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
874 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
875 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
876 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
877 break;
878 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
879 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
880 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
881 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
882 break;
883 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
884 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
885 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
886 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
887 break;
888 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
889 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
890 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
891 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
892 break;
893 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
894 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
895 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
896 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
897 break;
898 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
899 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
900 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
901 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
902 break;
903 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
904 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
905 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
906 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
907 break;
908 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
909 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
910 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
911 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
912 break;
913 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
914 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
915 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
916 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
917 break;
918 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
919 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
920 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
921 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
922 break;
923 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
924 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
925 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
926 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
927 break;
928 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
929 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
930 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
931 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
932 break;
933 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
934 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
935 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
936 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
937 break;
938 }
939
940 SmallVector<Value *, 6> Args;
941 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
942 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: i)));
943 if (AppendFalseForOpselArg)
944 Args.push_back(Elt: Builder.getFalse());
945
946 SmallVector<llvm::Type *, 6> ArgTypes;
947 for (auto ArgIdx : ArgsForMatchingMatrixTypes)
948 ArgTypes.push_back(Elt: Args[ArgIdx]->getType());
949
950 Function *F = CGM.getIntrinsic(IID: BuiltinWMMAOp, Tys: ArgTypes);
951 return Builder.CreateCall(Callee: F, Args);
952 }
953 // amdgcn workgroup size
954 case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
955 return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 0);
956 case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
957 return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 1);
958 case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
959 return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 2);
960
961 // amdgcn grid size
962 case AMDGPU::BI__builtin_amdgcn_grid_size_x:
963 return EmitAMDGPUGridSize(CGF&: *this, Index: 0);
964 case AMDGPU::BI__builtin_amdgcn_grid_size_y:
965 return EmitAMDGPUGridSize(CGF&: *this, Index: 1);
966 case AMDGPU::BI__builtin_amdgcn_grid_size_z:
967 return EmitAMDGPUGridSize(CGF&: *this, Index: 2);
968
969 // r600 intrinsics
970 case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
971 case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
972 return emitBuiltinWithOneOverloadedType<1>(*this, E,
973 Intrinsic::r600_recipsqrt_ieee);
974 case AMDGPU::BI__builtin_amdgcn_alignbit: {
975 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
976 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
977 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
978 Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType());
979 return Builder.CreateCall(Callee: F, Args: { Src0, Src1, Src2 });
980 }
981 case AMDGPU::BI__builtin_amdgcn_fence: {
982 ProcessOrderScopeAMDGCN(Order: EmitScalarExpr(E: E->getArg(Arg: 0)),
983 Scope: EmitScalarExpr(E: E->getArg(Arg: 1)), AO, SSID);
984 FenceInst *Fence = Builder.CreateFence(Ordering: AO, SSID);
985 if (E->getNumArgs() > 2)
986 AddAMDGPUFenceAddressSpaceMMRA(Inst: Fence, E);
987 return Fence;
988 }
989 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
990 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
991 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
992 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
993 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
994 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
995 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
996 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
997 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
998 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
999 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1000 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1001 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1002 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1003 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1004 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1005 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1006 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1007 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1008 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1009 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1010 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1011 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
1012 llvm::AtomicRMWInst::BinOp BinOp;
1013 switch (BuiltinID) {
1014 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1015 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1016 BinOp = llvm::AtomicRMWInst::UIncWrap;
1017 break;
1018 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1019 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1020 BinOp = llvm::AtomicRMWInst::UDecWrap;
1021 break;
1022 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1023 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1024 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1025 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1026 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1027 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1028 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1029 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1030 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1031 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1032 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1033 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1034 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1035 BinOp = llvm::AtomicRMWInst::FAdd;
1036 break;
1037 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1038 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1039 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1040 BinOp = llvm::AtomicRMWInst::FMin;
1041 break;
1042 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1043 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
1044 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1045 BinOp = llvm::AtomicRMWInst::FMax;
1046 break;
1047 }
1048
1049 Address Ptr = CheckAtomicAlignment(CGF&: *this, E);
1050 Value *Val = EmitScalarExpr(E: E->getArg(Arg: 1));
1051 llvm::Type *OrigTy = Val->getType();
1052 QualType PtrTy = E->getArg(Arg: 0)->IgnoreImpCasts()->getType();
1053
1054 bool Volatile;
1055
1056 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
1057 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
1058 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
1059 // __builtin_amdgcn_ds_faddf/fminf/fmaxf has an explicit volatile argument
1060 Volatile =
1061 cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 4)))->getZExtValue();
1062 } else {
1063 // Infer volatile from the passed type.
1064 Volatile =
1065 PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
1066 }
1067
1068 if (E->getNumArgs() >= 4) {
1069 // Some of the builtins have explicit ordering and scope arguments.
1070 ProcessOrderScopeAMDGCN(Order: EmitScalarExpr(E: E->getArg(Arg: 2)),
1071 Scope: EmitScalarExpr(E: E->getArg(Arg: 3)), AO, SSID);
1072 } else {
1073 // Most of the builtins do not have syncscope/order arguments. For DS
1074 // atomics the scope doesn't really matter, as they implicitly operate at
1075 // workgroup scope.
1076 //
1077 // The global/flat cases need to use agent scope to consistently produce
1078 // the native instruction instead of a cmpxchg expansion.
1079 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "agent");
1080 AO = AtomicOrdering::Monotonic;
1081
      // The v2bf16 builtins use <2 x i16> instead of a natural bfloat vector
      // type.
1083 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
1084 BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
1085 BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
1086 llvm::Type *V2BF16Ty = FixedVectorType::get(
1087 ElementType: llvm::Type::getBFloatTy(C&: Builder.getContext()), NumElts: 2);
1088 Val = Builder.CreateBitCast(V: Val, DestTy: V2BF16Ty);
1089 }
1090 }
1091
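    // Putting the pieces together (illustrative, value names hypothetical),
    //   __builtin_amdgcn_global_atomic_fadd_f32(p, v)
    // is expected to become roughly
    //   %old = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent") monotonic
    // plus the !amdgpu.no.fine.grained.memory and (for float fadd)
    // !amdgpu.ignore.denormal.mode metadata attached below.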
1092 llvm::AtomicRMWInst *RMW =
1093 Builder.CreateAtomicRMW(Op: BinOp, Addr: Ptr, Val, Ordering: AO, SSID);
1094 if (Volatile)
1095 RMW->setVolatile(true);
1096
1097 unsigned AddrSpace = Ptr.getType()->getAddressSpace();
1098 if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
1099 // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
1100 // instruction for flat and global operations.
1101 llvm::MDTuple *EmptyMD = MDNode::get(Context&: getLLVMContext(), MDs: {});
1102 RMW->setMetadata(Kind: "amdgpu.no.fine.grained.memory", Node: EmptyMD);
1103
1104 // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
1105 // instruction, but this only matters for float fadd.
1106 if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
1107 RMW->setMetadata(Kind: "amdgpu.ignore.denormal.mode", Node: EmptyMD);
1108 }
1109
1110 return Builder.CreateBitCast(V: RMW, DestTy: OrigTy);
1111 }
1112 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
1113 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
1114 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
1115 llvm::Type *ResultType = ConvertType(E->getType());
1116 // s_sendmsg_rtn is mangled using return type only.
1117 Function *F =
1118 CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
1119 return Builder.CreateCall(Callee: F, Args: {Arg});
1120 }
1121 case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
1122 case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
    // Because builtin types are limited and the intrinsic returns a struct
    // (a pair of i32 values), marshal the result into a <2 x i32>.
1125 Value *VDstOld = EmitScalarExpr(E: E->getArg(Arg: 0));
1126 Value *VSrcOld = EmitScalarExpr(E: E->getArg(Arg: 1));
1127 Value *FI = EmitScalarExpr(E: E->getArg(Arg: 2));
1128 Value *BoundCtrl = EmitScalarExpr(E: E->getArg(Arg: 3));
1129 Function *F =
1130 CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
1131 ? Intrinsic::amdgcn_permlane16_swap
1132 : Intrinsic::amdgcn_permlane32_swap);
1133 llvm::CallInst *Call =
1134 Builder.CreateCall(Callee: F, Args: {VDstOld, VSrcOld, FI, BoundCtrl});
1135
1136 llvm::Value *Elt0 = Builder.CreateExtractValue(Agg: Call, Idxs: 0);
1137 llvm::Value *Elt1 = Builder.CreateExtractValue(Agg: Call, Idxs: 1);
1138
1139 llvm::Type *ResultType = ConvertType(E->getType());
1140
1141 llvm::Value *Insert0 = Builder.CreateInsertElement(
1142 Vec: llvm::PoisonValue::get(T: ResultType), NewElt: Elt0, UINT64_C(0));
1143 llvm::Value *AsVector =
1144 Builder.CreateInsertElement(Vec: Insert0, NewElt: Elt1, UINT64_C(1));
1145 return AsVector;
1146 }
1147 case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
1148 case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
1149 return emitBuiltinWithOneOverloadedType<4>(*this, E,
1150 Intrinsic::amdgcn_bitop3);
1151 case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
1152 // TODO: LLVM has this overloaded to allow for fat pointers, but since
1153 // those haven't been plumbed through to Clang yet, default to creating the
1154 // resource type.
1155 SmallVector<Value *, 4> Args;
1156 for (unsigned I = 0; I < 4; ++I)
1157 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: I)));
1158 llvm::PointerType *RetTy = llvm::PointerType::get(
1159 C&: Builder.getContext(), AddressSpace: llvm::AMDGPUAS::BUFFER_RESOURCE);
1160 Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_make_buffer_rsrc,
1161 {RetTy, Args[0]->getType()});
1162 return Builder.CreateCall(Callee: F, Args);
1163 }
1164 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
1165 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
1166 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
1167 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64:
1168 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96:
1169 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
1170 return emitBuiltinWithOneOverloadedType<5>(
1171 *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store);
1172 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1173 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1174 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1175 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1176 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1177 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
1178 llvm::Type *RetTy = nullptr;
1179 switch (BuiltinID) {
1180 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1181 RetTy = Int8Ty;
1182 break;
1183 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1184 RetTy = Int16Ty;
1185 break;
1186 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1187 RetTy = Int32Ty;
1188 break;
1189 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1190 RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 2);
1191 break;
1192 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1193 RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 3);
1194 break;
1195 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
1196 RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 4);
1197 break;
1198 }
1199 Function *F =
1200 CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load, RetTy);
1201 return Builder.CreateCall(
1202 Callee: F, Args: {EmitScalarExpr(E: E->getArg(Arg: 0)), EmitScalarExpr(E: E->getArg(Arg: 1)),
1203 EmitScalarExpr(E: E->getArg(Arg: 2)), EmitScalarExpr(E: E->getArg(Arg: 3))});
1204 }
1205 case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
1206 return emitBuiltinWithOneOverloadedType<2>(
1207 *this, E, Intrinsic::amdgcn_s_prefetch_data);
1208 case Builtin::BIlogbf:
1209 case Builtin::BI__builtin_logbf: {
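    // Math sketch for the expansion below (and its double counterpart in the
    // next case): for finite, nonzero x, frexp decomposes x as m * 2^ex with
    // 0.5 <= |m| < 1, so logb(x) == ex - 1. The selects then handle the
    // special cases: |x| == +inf yields +inf and x == 0 yields -inf, matching
    // the C library's logb semantics.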
1210 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
1211 Function *FrExpFunc = CGM.getIntrinsic(
1212 Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
1213 CallInst *FrExp = Builder.CreateCall(Callee: FrExpFunc, Args: Src0);
1214 Value *Exp = Builder.CreateExtractValue(Agg: FrExp, Idxs: 1);
1215 Value *Add = Builder.CreateAdd(
1216 LHS: Exp, RHS: ConstantInt::getSigned(Ty: Exp->getType(), V: -1), Name: "", HasNUW: false, HasNSW: true);
1217 Value *SIToFP = Builder.CreateSIToFP(V: Add, DestTy: Builder.getFloatTy());
1218 Value *Fabs =
1219 emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
1220 Value *FCmpONE = Builder.CreateFCmpONE(
1221 LHS: Fabs, RHS: ConstantFP::getInfinity(Ty: Builder.getFloatTy()));
1222 Value *Sel1 = Builder.CreateSelect(C: FCmpONE, True: SIToFP, False: Fabs);
1223 Value *FCmpOEQ =
1224 Builder.CreateFCmpOEQ(LHS: Src0, RHS: ConstantFP::getZero(Ty: Builder.getFloatTy()));
1225 Value *Sel2 = Builder.CreateSelect(
1226 C: FCmpOEQ,
1227 True: ConstantFP::getInfinity(Ty: Builder.getFloatTy(), /*Negative=*/true), False: Sel1);
1228 return Sel2;
1229 }
1230 case Builtin::BIlogb:
1231 case Builtin::BI__builtin_logb: {
1232 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
1233 Function *FrExpFunc = CGM.getIntrinsic(
1234 Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
1235 CallInst *FrExp = Builder.CreateCall(Callee: FrExpFunc, Args: Src0);
1236 Value *Exp = Builder.CreateExtractValue(Agg: FrExp, Idxs: 1);
1237 Value *Add = Builder.CreateAdd(
1238 LHS: Exp, RHS: ConstantInt::getSigned(Ty: Exp->getType(), V: -1), Name: "", HasNUW: false, HasNSW: true);
1239 Value *SIToFP = Builder.CreateSIToFP(V: Add, DestTy: Builder.getDoubleTy());
1240 Value *Fabs =
1241 emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
1242 Value *FCmpONE = Builder.CreateFCmpONE(
1243 LHS: Fabs, RHS: ConstantFP::getInfinity(Ty: Builder.getDoubleTy()));
1244 Value *Sel1 = Builder.CreateSelect(C: FCmpONE, True: SIToFP, False: Fabs);
1245 Value *FCmpOEQ =
1246 Builder.CreateFCmpOEQ(LHS: Src0, RHS: ConstantFP::getZero(Ty: Builder.getDoubleTy()));
1247 Value *Sel2 = Builder.CreateSelect(
1248 C: FCmpOEQ,
1249 True: ConstantFP::getInfinity(Ty: Builder.getDoubleTy(), /*Negative=*/true),
1250 False: Sel1);
1251 return Sel2;
1252 }
1253 case Builtin::BIscalbnf:
1254 case Builtin::BI__builtin_scalbnf:
1255 case Builtin::BIscalbn:
1256 case Builtin::BI__builtin_scalbn:
1257 return emitBinaryExpMaybeConstrainedFPBuiltin(
1258 *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp);
1259 default:
1260 return nullptr;
1261 }
1262}
1263

source code of clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp