//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
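  // For example, a HIP kernel parameter declared as `float *p` arrives as a
  // generic pointer and is rewritten to a global one, i.e. `ptr` becomes
  // `ptr addrspace(1)` (assuming the usual amdgcn mapping of generic = 0 and
  // global = 1).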
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                    QualType Ty) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
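/// For example, <4 x half> packs into 2 registers, <3 x float> takes 3 (not
/// the 4 its padded in-memory size would suggest), and a struct of an int and
/// a float takes 2.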
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
    }
  }
}

Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
  llvm_unreachable("AMDGPU does not support varargs");
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}

/// For kernels all parameters are really passed in a special buffer. It doesn't
/// make sense to pass anything byval, so everything must be direct.
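/// For non-OpenCL languages (HIP/CUDA), aggregate kernel arguments are instead
/// passed indirectly by reference in the constant address space, which in
/// practice shows up as a byref parameter in address space 4 on amdgcn.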
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing
  // a new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

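/// Classify a non-kernel argument against the shared VGPR budget tracked in
/// \p NumRegsLeft: small aggregates (<= 64 bits) are packed into i16, i32 or
/// [2 x i32], larger aggregates are passed directly while registers remain,
/// and anything that no longer fits is passed by reference in the private
/// address space.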
ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T,
                                 QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
}

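// Kernels and device-side variables must stay visible to the HIP/OpenCL
// runtime, which locates them by name in the loaded code object, so hidden
// visibility on such symbols is upgraded to protected instead.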
static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;

    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;

    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
  }
}

/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
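/// For example, this defines __oclc_ABI_version, roughly:
///   @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4)
///                         constant i32 <code object version>
/// so the device libraries can branch on the ABI version and IPSCCP can fold
/// those branches away once the value is known.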
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "__oclc_ABI_version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(
                        OriginalGV->getLinkage()))
    return;

  if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
      llvm::CodeObjectVersionKind::COV_None)
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0,
// which results in incorrectly transformed IR. Therefore, instead of
// emitting null pointers in private and local address spaces, a null
// pointer in generic address space is emitted which is casted to a
// pointer in local or private address space.
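// For example, a null pointer in the private address space is emitted roughly
// as `addrspacecast (ptr null to ptr addrspace(5))`, which the backend folds
// to the target's real null value for that address space (all-ones rather
// than 0).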
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

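// Map a Clang SyncScope onto the corresponding AMDGPU sync scope name
// ("singlethread", "wavefront", "workgroup", "agent", or "" for system); for
// non-seq_cst orderings the "one-as" (single address space) variant is used,
// e.g. "workgroup-one-as".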
llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on stack and stores the block literal
/// to it and passes its pointer to the block invoke function. The kernel
/// has "enqueued-block" function attribute and kernel argument metadata.
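///
/// A rough sketch of the generated wrapper (names illustrative, casts and
/// alignments elided):
///   define internal amdgpu_kernel void @<invoke>_kernel(%block.ty %literal, ...) {
///   entry:
///     %block = alloca %block.ty
///     store %block.ty %literal, ptr addrspace(5) %block
///     call void @<invoke>(ptr %block, ...)  ; remaining args forwarded as-is
///     ret void
///   }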
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}

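/// Emit the "amdgpu-flat-work-group-size" function attribute from an explicit
/// amdgpu_flat_work_group_size attribute, or derive a single value from
/// reqd_work_group_size (X * Y * Z). The chosen bounds are optionally reported
/// back through \p MinThreadsVal and \p MaxThreadsVal.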
void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

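/// Emit the "amdgpu-waves-per-eu" function attribute as "min" or "min,max",
/// depending on whether the attribute specifies an upper bound.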
void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}