//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit Builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//

#include "CGBuiltin.h"
#include "clang/Basic/TargetBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/Support/AMDGPUAddrSpace.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm;

namespace {

// Has a second mangled type argument: the intrinsic is overloaded on both
// operand types.
static Value *
emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E,
                                       Intrinsic::ID IntrinsicID,
                                       Intrinsic::ID ConstrainedIntrinsicID) {
  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));

  CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
  if (CGF.Builder.getIsFPConstrained()) {
    Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
                                       {Src0->getType(), Src1->getType()});
    return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
  }

  Function *F =
      CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), Src1->getType()});
  return CGF.Builder.CreateCall(F, {Src0, Src1});
}

// If \p E is not a null pointer, insert an address space cast to match the
// return type of \p E if necessary.
Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
                             const CallExpr *E = nullptr) {
  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
  auto *Call = CGF.Builder.CreateCall(F);
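  // The dispatch pointer addresses the HSA kernel dispatch packet, a 64-byte
  // AQL packet, so advertise that size and a conservative 4-byte alignment on
  // the returned pointer.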
  Call->addRetAttr(
      Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
  if (!E)
    return Call;
  QualType BuiltinRetType = E->getType();
  auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
  if (RetTy == Call->getType())
    return Call;
  return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
}

Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
  auto *Call = CGF.Builder.CreateCall(F);
  Call->addRetAttr(
      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
  return Call;
}

/// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
/// Emit code based on Code Object ABI version.
/// COV_4    : Emit code to use dispatch ptr
/// COV_5+   : Emit code to use implicitarg ptr
/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
///            and use its value for COV_4 or COV_5+ approach. It is used for
///            compiling device libraries in an ABI-agnostic way.
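///
/// The work-group sizes are stored as three consecutive 16-bit fields: at
/// byte offset 12 of the implicit kernarg segment (COV_5+) and at byte
/// offset 4 of the HSA kernel_dispatch_packet (COV_4), which is where the
/// 12 + Index * 2 and 4 + Index * 2 offsets below come from.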
Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
  llvm::LoadInst *LD;

  auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;

  if (Cov == CodeObjectVersionKind::COV_None) {
    StringRef Name = "__oclc_ABI_version";
    auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
    if (!ABIVersionC)
      ABIVersionC = new llvm::GlobalVariable(
          CGF.CGM.getModule(), CGF.Int32Ty, false,
          llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGF.CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));

    // This load will be eliminated by the IPSCCP because it is constant
    // weak_odr without externally_initialized. Either changing it to weak or
    // adding externally_initialized will keep the load.
    Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
                                                      CGF.CGM.getIntAlign());

    Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
        ABIVersion,
        llvm::ConstantInt::get(CGF.Int32Ty, CodeObjectVersionKind::COV_5));

    // Indexing the implicit kernarg segment.
    Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
        CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);

    // Indexing the HSA kernel_dispatch_packet struct.
    Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
        CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);

    auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
    LD = CGF.Builder.CreateLoad(
        Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
  } else {
    Value *GEP = nullptr;
    if (Cov >= CodeObjectVersionKind::COV_5) {
      // Indexing the implicit kernarg segment.
      GEP = CGF.Builder.CreateConstGEP1_32(
          CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
    } else {
      // Indexing the HSA kernel_dispatch_packet struct.
      GEP = CGF.Builder.CreateConstGEP1_32(
          CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
    }
    LD = CGF.Builder.CreateLoad(
        Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
  }

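  // The work-group size is a non-zero 16-bit value bounded by the target's
  // maximum work-group size and fixed for the lifetime of the dispatch, so
  // annotate the load with range, noundef, and invariant.load metadata.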
  llvm::MDBuilder MDHelper(CGF.getLLVMContext());
  llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
      APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
  LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
  LD->setMetadata(llvm::LLVMContext::MD_noundef,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  return LD;
}

// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
  const unsigned XOffset = 12;
  auto *DP = EmitAMDGPUDispatchPtr(CGF);
  // Indexing the HSA kernel_dispatch_packet struct.
  auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
  auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
  auto *LD = CGF.Builder.CreateLoad(
      Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(4)));

  llvm::MDBuilder MDB(CGF.getLLVMContext());

  // Known non-zero.
  LD->setMetadata(llvm::LLVMContext::MD_range,
                  MDB.createRange(APInt(32, 1), APInt::getZero(32)));
  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  return LD;
}
} // namespace

// Generates the IR for __builtin_amdgcn_read_exec_*.
// Lowers the builtin to the amdgcn_ballot intrinsic.
static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
                                      llvm::Type *RegisterType,
                                      llvm::Type *ValueType, bool isExecHi) {
  CodeGen::CGBuilderTy &Builder = CGF.Builder;
  CodeGen::CodeGenModule &CGM = CGF.CGM;

  Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
  llvm::Value *Call = Builder.CreateCall(F, {Builder.getInt1(true)});

  if (isExecHi) {
    Value *Rt2 = Builder.CreateLShr(Call, 32);
    Rt2 = Builder.CreateTrunc(Rt2, CGF.Int32Ty);
    return Rt2;
  }

  return Call;
}

// Emit an intrinsic that has 1 float or double operand, and 1 integer.
static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
                               const CallExpr *E,
                               unsigned IntrinsicID) {
  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));

  Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
  return CGF.Builder.CreateCall(F, {Src0, Src1});
}

// For processing memory ordering and memory scope arguments of various
// amdgcn builtins.
// \p Order takes a C++11-compatible memory-ordering specifier and converts
// it into LLVM's memory ordering specifier using the atomic C ABI, and
// writes it to \p AO. \p Scope takes a const char * and converts it into an
// AMDGCN-specific SyncScopeID and writes it to \p SSID.
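// For example, __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup") reaches
// this with Order == __ATOMIC_ACQUIRE and Scope == "workgroup", producing
// AO == AtomicOrdering::Acquire and the "workgroup" syncscope.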
void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
                                              llvm::AtomicOrdering &AO,
                                              llvm::SyncScope::ID &SSID) {
  int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();

  // Map C11/C++11 memory ordering to LLVM memory ordering
  assert(llvm::isValidAtomicOrderingCABI(ord));
  switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
  case llvm::AtomicOrderingCABI::acquire:
  case llvm::AtomicOrderingCABI::consume:
    AO = llvm::AtomicOrdering::Acquire;
    break;
  case llvm::AtomicOrderingCABI::release:
    AO = llvm::AtomicOrdering::Release;
    break;
  case llvm::AtomicOrderingCABI::acq_rel:
    AO = llvm::AtomicOrdering::AcquireRelease;
    break;
  case llvm::AtomicOrderingCABI::seq_cst:
    AO = llvm::AtomicOrdering::SequentiallyConsistent;
    break;
  case llvm::AtomicOrderingCABI::relaxed:
    AO = llvm::AtomicOrdering::Monotonic;
    break;
  }

  // Some of the atomic builtins take the scope as a string name.
  StringRef scp;
  if (llvm::getConstantStringInfo(Scope, scp)) {
    SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
    return;
  }

  // Older builtins had an enum argument for the memory scope.
  int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
  switch (scope) {
  case 0: // __MEMORY_SCOPE_SYSTEM
    SSID = llvm::SyncScope::System;
    break;
  case 1: // __MEMORY_SCOPE_DEVICE
    SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
    break;
  case 2: // __MEMORY_SCOPE_WRKGRP
    SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
    break;
  case 3: // __MEMORY_SCOPE_WVFRNT
    SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
    break;
  case 4: // __MEMORY_SCOPE_SINGLE
    SSID = llvm::SyncScope::SingleThread;
    break;
  default:
    SSID = llvm::SyncScope::System;
    break;
  }
}

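// \p ICEArguments is a bitmask (produced by ASTContext::GetBuiltinType) in
// which bit \p Idx is set when argument \p Idx of the builtin is required to
// be an integer constant expression.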
llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
                                                          unsigned Idx,
                                                          const CallExpr *E) {
  llvm::Value *Arg = nullptr;
  if ((ICEArguments & (1 << Idx)) == 0) {
    Arg = EmitScalarExpr(E->getArg(Idx));
  } else {
    // If this is required to be a constant, constant fold it so that we
    // know that the generated intrinsic gets a ConstantInt.
    std::optional<llvm::APSInt> Result =
        E->getArg(Idx)->getIntegerConstantExpr(getContext());
    assert(Result && "Expected argument to be a constant");
    Arg = llvm::ConstantInt::get(getLLVMContext(), *Result);
  }
  return Arg;
}

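// Attaches "amdgpu-as" MMRA metadata restricting a fence to specific address
// spaces. The address-space names (e.g. "local") are taken from the trailing
// string-literal arguments of the builtin call (arguments 2 and up).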
void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
                                                     const CallExpr *E) {
  constexpr const char *Tag = "amdgpu-as";

  LLVMContext &Ctx = Inst->getContext();
  SmallVector<MMRAMetadata::TagT, 3> MMRAs;
  for (unsigned K = 2; K < E->getNumArgs(); ++K) {
    llvm::Value *V = EmitScalarExpr(E->getArg(K));
    StringRef AS;
    if (llvm::getConstantStringInfo(V, AS)) {
      MMRAs.push_back({Tag, AS});
      // TODO: Delete the resulting unused constant?
      continue;
    }
    CGM.Error(E->getExprLoc(),
              "expected an address space name as a string literal");
  }

  llvm::sort(MMRAs);
  MMRAs.erase(llvm::unique(MMRAs), MMRAs.end());
  Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
}

Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E) {
  llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
  llvm::SyncScope::ID SSID;
  switch (BuiltinID) {
  case AMDGPU::BI__builtin_amdgcn_div_scale:
  case AMDGPU::BI__builtin_amdgcn_div_scalef: {
    // Translate from the intrinsic's struct return to the builtin's out
    // argument.

    Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));

    llvm::Value *X = EmitScalarExpr(E->getArg(0));
    llvm::Value *Y = EmitScalarExpr(E->getArg(1));
    llvm::Value *Z = EmitScalarExpr(E->getArg(2));

    llvm::Function *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
                                              X->getType());

    llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});

    llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
    llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);

    llvm::Type *RealFlagType = FlagOutPtr.getElementType();

    llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
    Builder.CreateStore(FlagExt, FlagOutPtr);
    return Result;
  }
  case AMDGPU::BI__builtin_amdgcn_div_fmas:
  case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
    llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));

    llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
                                         Src0->getType());
    llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
    return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
  }

  case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
    return emitBuiltinWithOneOverloadedType<2>(*this, E,
                                               Intrinsic::amdgcn_ds_swizzle);
  case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
  case AMDGPU::BI__builtin_amdgcn_mov_dpp:
  case AMDGPU::BI__builtin_amdgcn_update_dpp: {
    llvm::SmallVector<llvm::Value *, 6> Args;
    // Find out if any arguments are required to be integer constant
    // expressions.
    unsigned ICEArguments = 0;
    ASTContext::GetBuiltinTypeError Error;
    getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
    assert(Error == ASTContext::GE_None && "Should not codegen an error");
    llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
    unsigned Size = DataTy->getPrimitiveSizeInBits();
    llvm::Type *IntTy =
        llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
    Function *F =
        CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
                             ? Intrinsic::amdgcn_mov_dpp8
                             : Intrinsic::amdgcn_update_dpp,
                         IntTy);
    assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
           E->getNumArgs() == 2);
    bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
    if (InsertOld)
      Args.push_back(llvm::PoisonValue::get(IntTy));
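    // The intrinsic is overloaded only on 32-bit-or-wider integers, so promote
    // sub-dword data operands: bitcast non-integer payloads to an integer of
    // the same width, then zero-extend to 32 bits. The result is narrowed back
    // to DataTy after the call.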
    for (unsigned I = 0; I != E->getNumArgs(); ++I) {
      llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
      if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
          Size < 32) {
        if (!DataTy->isIntegerTy())
          V = Builder.CreateBitCast(
              V, llvm::IntegerType::get(Builder.getContext(), Size));
        V = Builder.CreateZExtOrBitCast(V, IntTy);
      }
      llvm::Type *ExpTy =
          F->getFunctionType()->getFunctionParamType(I + InsertOld);
      Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
    }
    Value *V = Builder.CreateCall(F, Args);
    if (Size < 32 && !DataTy->isIntegerTy())
      V = Builder.CreateTrunc(
          V, llvm::IntegerType::get(Builder.getContext(), Size));
    return Builder.CreateTruncOrBitCast(V, DataTy);
  }
  case AMDGPU::BI__builtin_amdgcn_permlane16:
  case AMDGPU::BI__builtin_amdgcn_permlanex16:
    return emitBuiltinWithOneOverloadedType<6>(
        *this, E,
        BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
            ? Intrinsic::amdgcn_permlane16
            : Intrinsic::amdgcn_permlanex16);
  case AMDGPU::BI__builtin_amdgcn_permlane64:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_permlane64);
  case AMDGPU::BI__builtin_amdgcn_readlane:
    return emitBuiltinWithOneOverloadedType<2>(*this, E,
                                               Intrinsic::amdgcn_readlane);
  case AMDGPU::BI__builtin_amdgcn_readfirstlane:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_readfirstlane);
  case AMDGPU::BI__builtin_amdgcn_div_fixup:
  case AMDGPU::BI__builtin_amdgcn_div_fixupf:
  case AMDGPU::BI__builtin_amdgcn_div_fixuph:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_div_fixup);
  case AMDGPU::BI__builtin_amdgcn_trig_preop:
  case AMDGPU::BI__builtin_amdgcn_trig_preopf:
    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
  case AMDGPU::BI__builtin_amdgcn_rcp:
  case AMDGPU::BI__builtin_amdgcn_rcpf:
  case AMDGPU::BI__builtin_amdgcn_rcph:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rcp);
  case AMDGPU::BI__builtin_amdgcn_sqrt:
  case AMDGPU::BI__builtin_amdgcn_sqrtf:
  case AMDGPU::BI__builtin_amdgcn_sqrth:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_sqrt);
  case AMDGPU::BI__builtin_amdgcn_rsq:
  case AMDGPU::BI__builtin_amdgcn_rsqf:
  case AMDGPU::BI__builtin_amdgcn_rsqh:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq);
  case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
  case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_rsq_clamp);
  case AMDGPU::BI__builtin_amdgcn_sinf:
  case AMDGPU::BI__builtin_amdgcn_sinh:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin);
  case AMDGPU::BI__builtin_amdgcn_cosf:
  case AMDGPU::BI__builtin_amdgcn_cosh:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos);
  case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
    return EmitAMDGPUDispatchPtr(*this, E);
  case AMDGPU::BI__builtin_amdgcn_logf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
  case AMDGPU::BI__builtin_amdgcn_exp2f:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_exp2);
  case AMDGPU::BI__builtin_amdgcn_log_clampf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_log_clamp);
  case AMDGPU::BI__builtin_amdgcn_ldexp:
  case AMDGPU::BI__builtin_amdgcn_ldexpf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Function *F =
        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
    return Builder.CreateCall(F, {Src0, Src1});
  }
  case AMDGPU::BI__builtin_amdgcn_ldexph: {
    // The raw instruction has a different behavior for out of bounds exponent
    // values (implicit truncation instead of saturate to short_min/short_max).
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Function *F =
        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
    return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)});
  }
  case AMDGPU::BI__builtin_amdgcn_frexp_mant:
  case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
  case AMDGPU::BI__builtin_amdgcn_frexp_manth:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_frexp_mant);
  case AMDGPU::BI__builtin_amdgcn_frexp_exp:
  case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                   {Builder.getInt32Ty(), Src0->getType()});
    return Builder.CreateCall(F, Src0);
  }
  case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                   {Builder.getInt16Ty(), Src0->getType()});
    return Builder.CreateCall(F, Src0);
  }
  case AMDGPU::BI__builtin_amdgcn_fract:
  case AMDGPU::BI__builtin_amdgcn_fractf:
  case AMDGPU::BI__builtin_amdgcn_fracth:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_fract);
  case AMDGPU::BI__builtin_amdgcn_lerp:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_lerp);
  case AMDGPU::BI__builtin_amdgcn_ubfe:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_ubfe);
  case AMDGPU::BI__builtin_amdgcn_sbfe:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_sbfe);
  case AMDGPU::BI__builtin_amdgcn_ballot_w32:
  case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
    llvm::Type *ResultType = ConvertType(E->getType());
    llvm::Value *Src = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {ResultType});
    return Builder.CreateCall(F, {Src});
  }
  case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_tanh);
  case AMDGPU::BI__builtin_amdgcn_uicmp:
  case AMDGPU::BI__builtin_amdgcn_uicmpl:
  case AMDGPU::BI__builtin_amdgcn_sicmp:
  case AMDGPU::BI__builtin_amdgcn_sicmpl: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));

    // FIXME-GFX10: How should 32 bit mask be handled?
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
                                   {Builder.getInt64Ty(), Src0->getType()});
    return Builder.CreateCall(F, {Src0, Src1, Src2});
  }
  case AMDGPU::BI__builtin_amdgcn_fcmp:
  case AMDGPU::BI__builtin_amdgcn_fcmpf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));

    // FIXME-GFX10: How should 32 bit mask be handled?
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
                                   {Builder.getInt64Ty(), Src0->getType()});
    return Builder.CreateCall(F, {Src0, Src1, Src2});
  }
  case AMDGPU::BI__builtin_amdgcn_class:
  case AMDGPU::BI__builtin_amdgcn_classf:
  case AMDGPU::BI__builtin_amdgcn_classh:
    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
  case AMDGPU::BI__builtin_amdgcn_fmed3f:
  case AMDGPU::BI__builtin_amdgcn_fmed3h:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_fmed3);
  case AMDGPU::BI__builtin_amdgcn_ds_append:
  case AMDGPU::BI__builtin_amdgcn_ds_consume: {
    Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
      Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrin, {Src0->getType()});
    return Builder.CreateCall(F, {Src0, Builder.getFalse()});
  }
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_global_load_tr_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
      IID = Intrinsic::amdgcn_global_load_tr_b128;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_global_load_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_global_load_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_load_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_ds_load_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_load_tr8_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
      IID = Intrinsic::amdgcn_ds_load_tr16_b128;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_read_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_read_tr8_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_ds_read_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
      IID = Intrinsic::amdgcn_ds_read_tr16_b64;
      break;
    }
    llvm::Type *LoadTy = ConvertType(E->getType());
    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
    llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
    return Builder.CreateCall(F, {Addr});
  }
  case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
    // TODO: Should this have ASan instrumentation?
    return emitBuiltinWithOneOverloadedType<5>(*this, E,
                                               Intrinsic::amdgcn_load_to_lds);
  }
  case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
    Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
                                   {llvm::Type::getInt64Ty(getLLVMContext())});
    return Builder.CreateCall(F);
  }
  case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
    Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv,
                                   {llvm::Type::getInt64Ty(getLLVMContext())});
    llvm::Value *Env = EmitScalarExpr(E->getArg(0));
    return Builder.CreateCall(F, {Env});
  }
  case AMDGPU::BI__builtin_amdgcn_read_exec:
    return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
  case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
    return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
  case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
    return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
    llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
    llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
    llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2));
    llvm::Value *RayDir = EmitScalarExpr(E->getArg(3));
    llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
    llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));

    // The builtins take these arguments as vec4 where the last element is
    // ignored. The intrinsic takes them as vec3.
    RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin, {0, 1, 2});
    RayDir = Builder.CreateShuffleVector(RayDir, RayDir, {0, 1, 2});
    RayInverseDir =
        Builder.CreateShuffleVector(RayInverseDir, RayInverseDir, {0, 1, 2});

    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
                                   {NodePtr->getType(), RayDir->getType()});
    return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
                                  RayInverseDir, TextureDescr});
  }
  case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
      IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
      break;
    case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
      IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
      break;
    }
    llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
    llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
    llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
    llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
    llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
    llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
    llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));

    Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
    Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));

    llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);

    llvm::CallInst *CI = Builder.CreateCall(
        IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
                        Offset, TextureDescr});

    llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
    llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
    llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);

    Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
    Builder.CreateStore(RetRayDir, RetRayDirPtr);

    return RetVData;
  }

  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
      break;
    }

    SmallVector<Value *, 4> Args;
    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
      Args.push_back(EmitScalarExpr(E->getArg(i)));

    Function *F = CGM.getIntrinsic(IID);
    Value *Call = Builder.CreateCall(F, Args);
    Value *Rtn = Builder.CreateExtractValue(Call, 0);
    Value *A = Builder.CreateExtractValue(Call, 1);
    llvm::Type *RetTy = ConvertType(E->getType());
    Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
                                            (uint64_t)0);
    // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
    // <2 x i64>, so zero-extend the second value.
    if (A->getType()->getPrimitiveSizeInBits() <
        RetTy->getScalarType()->getPrimitiveSizeInBits())
      A = Builder.CreateZExt(A, RetTy->getScalarType());

    return Builder.CreateInsertElement(I0, A, 1);
  }
  case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
  case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
    llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8);
    Function *F = CGM.getIntrinsic(
        BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
            ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
            : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
        {VT, VT});

    SmallVector<Value *, 9> Args;
    for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
      Args.push_back(EmitScalarExpr(E->getArg(I)));
    return Builder.CreateCall(F, Args);
  }
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {

    // These operations perform a matrix multiplication and accumulation of
    // the form:
    //             D = A * B + C
    // We need to specify one type for matrices AB and one for matrices CD.
    // Sparse matrix operations can have different types for A and B as well as
    // an additional type for the sparsity index.
    // The destination type should be put before the types used for the source
    // operands.
    SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
    // On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
    // There is no need for the variable opsel argument, so always set it to
    // "false".
    bool AppendFalseForOpselArg = false;
    unsigned BuiltinWMMAOp;

    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
      AppendFalseForOpselArg = true;
      [[fallthrough]];
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
      AppendFalseForOpselArg = true;
      [[fallthrough]];
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
      break;
    }

    SmallVector<Value *, 6> Args;
    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
      Args.push_back(EmitScalarExpr(E->getArg(i)));
    if (AppendFalseForOpselArg)
      Args.push_back(Builder.getFalse());

    SmallVector<llvm::Type *, 6> ArgTypes;
    for (auto ArgIdx : ArgsForMatchingMatrixTypes)
      ArgTypes.push_back(Args[ArgIdx]->getType());

    Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
    return Builder.CreateCall(F, Args);
  }
  // amdgcn workgroup size
  case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
    return EmitAMDGPUWorkGroupSize(*this, 0);
  case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
    return EmitAMDGPUWorkGroupSize(*this, 1);
  case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
    return EmitAMDGPUWorkGroupSize(*this, 2);

  // amdgcn grid size
  case AMDGPU::BI__builtin_amdgcn_grid_size_x:
    return EmitAMDGPUGridSize(*this, 0);
  case AMDGPU::BI__builtin_amdgcn_grid_size_y:
    return EmitAMDGPUGridSize(*this, 1);
  case AMDGPU::BI__builtin_amdgcn_grid_size_z:
    return EmitAMDGPUGridSize(*this, 2);

  // r600 intrinsics
  case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
  case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::r600_recipsqrt_ieee);
  case AMDGPU::BI__builtin_amdgcn_alignbit: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
    Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType());
    return Builder.CreateCall(F, {Src0, Src1, Src2});
  }
  case AMDGPU::BI__builtin_amdgcn_fence: {
    ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)),
                            EmitScalarExpr(E->getArg(1)), AO, SSID);
    FenceInst *Fence = Builder.CreateFence(AO, SSID);
    if (E->getNumArgs() > 2)
      AddAMDGPUFenceAddressSpaceMMRA(Fence, E);
    return Fence;
  }
  case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
  case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
  case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
  case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
  case AMDGPU::BI__builtin_amdgcn_ds_fminf:
  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
    llvm::AtomicRMWInst::BinOp BinOp;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
    case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
      BinOp = llvm::AtomicRMWInst::UIncWrap;
      break;
    case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
    case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
      BinOp = llvm::AtomicRMWInst::UDecWrap;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_faddf:
    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
      BinOp = llvm::AtomicRMWInst::FAdd;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_fminf:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
      BinOp = llvm::AtomicRMWInst::FMin;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
    case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
      BinOp = llvm::AtomicRMWInst::FMax;
      break;
    }

    Address Ptr = CheckAtomicAlignment(*this, E);
    Value *Val = EmitScalarExpr(E->getArg(1));
    llvm::Type *OrigTy = Val->getType();
    QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();

    bool Volatile;

    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
        BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
        BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
      // __builtin_amdgcn_ds_faddf/fminf/fmaxf have an explicit volatile
      // argument.
      Volatile =
          cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
    } else {
      // Infer volatile from the passed type.
      Volatile =
          PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
    }

    if (E->getNumArgs() >= 4) {
      // Some of the builtins have explicit ordering and scope arguments.
      ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
                              EmitScalarExpr(E->getArg(3)), AO, SSID);
    } else {
      // Most of the builtins do not have syncscope/order arguments. For DS
      // atomics the scope doesn't really matter, as they implicitly operate at
      // workgroup scope.
      //
      // The global/flat cases need to use agent scope to consistently produce
      // the native instruction instead of a cmpxchg expansion.
      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
      AO = AtomicOrdering::Monotonic;

      // The v2bf16 builtin uses i16 instead of a natural bfloat type.
      if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
          BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
          BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
        llvm::Type *V2BF16Ty = FixedVectorType::get(
            llvm::Type::getBFloatTy(Builder.getContext()), 2);
        Val = Builder.CreateBitCast(Val, V2BF16Ty);
      }
    }

    llvm::AtomicRMWInst *RMW =
        Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
    if (Volatile)
      RMW->setVolatile(true);

    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
      // Most targets require "amdgpu.no.fine.grained.memory" to emit the
      // native instruction for flat and global operations.
      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);

      // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
      // instruction, but this only matters for float fadd.
      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
    }

    return Builder.CreateBitCast(RMW, OrigTy);
  }
  case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
  case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
    llvm::Type *ResultType = ConvertType(E->getType());
    // s_sendmsg_rtn is mangled using return type only.
    Function *F =
        CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
    return Builder.CreateCall(F, {Arg});
  }
  case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
  case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
    // Because builtin types are limited, and the intrinsic uses a struct/pair
    // output, marshal the pair-of-i32 to <2 x i32>.
    Value *VDstOld = EmitScalarExpr(E->getArg(0));
    Value *VSrcOld = EmitScalarExpr(E->getArg(1));
    Value *FI = EmitScalarExpr(E->getArg(2));
    Value *BoundCtrl = EmitScalarExpr(E->getArg(3));
    Function *F =
        CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
                             ? Intrinsic::amdgcn_permlane16_swap
                             : Intrinsic::amdgcn_permlane32_swap);
    llvm::CallInst *Call =
        Builder.CreateCall(F, {VDstOld, VSrcOld, FI, BoundCtrl});

    llvm::Value *Elt0 = Builder.CreateExtractValue(Call, 0);
    llvm::Value *Elt1 = Builder.CreateExtractValue(Call, 1);

    llvm::Type *ResultType = ConvertType(E->getType());

    llvm::Value *Insert0 = Builder.CreateInsertElement(
        llvm::PoisonValue::get(ResultType), Elt0, UINT64_C(0));
    llvm::Value *AsVector =
        Builder.CreateInsertElement(Insert0, Elt1, UINT64_C(1));
    return AsVector;
  }
  case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
  case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
    return emitBuiltinWithOneOverloadedType<4>(*this, E,
                                               Intrinsic::amdgcn_bitop3);
  case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
    // TODO: LLVM has this overloaded to allow for fat pointers, but since
    // those haven't been plumbed through to Clang yet, default to creating the
    // resource type.
    SmallVector<Value *, 4> Args;
    for (unsigned I = 0; I < 4; ++I)
      Args.push_back(EmitScalarExpr(E->getArg(I)));
    llvm::PointerType *RetTy = llvm::PointerType::get(
        Builder.getContext(), llvm::AMDGPUAS::BUFFER_RESOURCE);
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_make_buffer_rsrc,
                                   {RetTy, Args[0]->getType()});
    return Builder.CreateCall(F, Args);
  }
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
    return emitBuiltinWithOneOverloadedType<5>(
        *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store);
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
    llvm::Type *RetTy = nullptr;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
      RetTy = Int8Ty;
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
      RetTy = Int16Ty;
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
      RetTy = Int32Ty;
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/2);
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/3);
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/4);
      break;
    }
    Function *F =
        CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load, RetTy);
    return Builder.CreateCall(
        F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
            EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
  }
  case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
    return emitBuiltinWithOneOverloadedType<2>(
        *this, E, Intrinsic::amdgcn_s_prefetch_data);
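  // logb is emulated via llvm.frexp: for finite non-zero x,
  // logb(x) == frexp(x).exp - 1. The selects below handle the special cases:
  // |x| == +inf yields +inf, x == 0 yields -inf, and NaN propagates through
  // fabs.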
  case Builtin::BIlogbf:
  case Builtin::BI__builtin_logbf: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *FrExpFunc = CGM.getIntrinsic(
        Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
    CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
    Value *Exp = Builder.CreateExtractValue(FrExp, 1);
    Value *Add = Builder.CreateAdd(
        Exp, ConstantInt::getSigned(Exp->getType(), -1), "", /*HasNUW=*/false,
        /*HasNSW=*/true);
    Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getFloatTy());
    Value *Fabs =
        emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
    Value *FCmpONE = Builder.CreateFCmpONE(
        Fabs, ConstantFP::getInfinity(Builder.getFloatTy()));
    Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
    Value *FCmpOEQ =
        Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getFloatTy()));
    Value *Sel2 = Builder.CreateSelect(
        FCmpOEQ,
        ConstantFP::getInfinity(Builder.getFloatTy(), /*Negative=*/true), Sel1);
    return Sel2;
  }
  case Builtin::BIlogb:
  case Builtin::BI__builtin_logb: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *FrExpFunc = CGM.getIntrinsic(
        Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
    CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
    Value *Exp = Builder.CreateExtractValue(FrExp, 1);
    Value *Add = Builder.CreateAdd(
        Exp, ConstantInt::getSigned(Exp->getType(), -1), "", /*HasNUW=*/false,
        /*HasNSW=*/true);
    Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getDoubleTy());
    Value *Fabs =
        emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
    Value *FCmpONE = Builder.CreateFCmpONE(
        Fabs, ConstantFP::getInfinity(Builder.getDoubleTy()));
    Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
    Value *FCmpOEQ =
        Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getDoubleTy()));
    Value *Sel2 = Builder.CreateSelect(
        FCmpOEQ,
        ConstantFP::getInfinity(Builder.getDoubleTy(), /*Negative=*/true),
        Sel1);
    return Sel2;
  }
  case Builtin::BIscalbnf:
  case Builtin::BI__builtin_scalbnf:
  case Builtin::BIscalbn:
  case Builtin::BI__builtin_scalbn:
    return emitBinaryExpMaybeConstrainedFPBuiltin(
        *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp);
  default:
    return nullptr;
  }
}