CGCUDANV.cpp source code [clang/lib/CodeGen/CGCUDANV.cpp]

1	//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This provides a class for CUDA code generation targeting the NVIDIA CUDA
10	// runtime library.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "CGCUDARuntime.h"
15	#include "CGCXXABI.h"
16	#include "CodeGenFunction.h"
17	#include "CodeGenModule.h"
18	#include "clang/AST/CharUnits.h"
19	#include "clang/AST/Decl.h"
20	#include "clang/Basic/Cuda.h"
21	#include "clang/CodeGen/CodeGenABITypes.h"
22	#include "clang/CodeGen/ConstantInitBuilder.h"
23	#include "llvm/ADT/StringRef.h"
24	#include "llvm/Frontend/Offloading/Utility.h"
25	#include "llvm/IR/BasicBlock.h"
26	#include "llvm/IR/Constants.h"
27	#include "llvm/IR/DerivedTypes.h"
28	#include "llvm/IR/ReplaceConstant.h"
29	#include "llvm/Support/Format.h"
30	#include "llvm/Support/VirtualFileSystem.h"
31
32	using namespace clang;
33	using namespace CodeGen;
34
35	namespace {
36	constexpr unsigned CudaFatMagic = `0x466243b1`;
37	constexpr unsigned HIPFatMagic = `0x48495046`; // "HIPF"
38
39	class CGNVCUDARuntime : public CGCUDARuntime {
40
41	/// The prefix used for function calls and section names (CUDA, HIP, LLVM)
42	StringRef Prefix;
43
44	private:
45	llvm::IntegerType IntTy, SizeTy;
46	llvm::Type *VoidTy;
47	llvm::PointerType *PtrTy;
48
49	/// Convenience reference to LLVM Context
50	llvm::LLVMContext &Context;
51	/// Convenience reference to the current module
52	llvm::Module &TheModule;
53	/// Keeps track of kernel launch stubs and handles emitted in this module
54	struct KernelInfo {
55	llvm::Function Kernel; // stub function to help launch kernel*
56	const Decl *D;
57	};
58	llvm::SmallVector<KernelInfo, `16`> EmittedKernels;
59	// Map a kernel mangled name to a symbol for identifying kernel in host code
60	// For CUDA, the symbol for identifying the kernel is the same as the device
61	// stub function. For HIP, they are different.
62	llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
63	// Map a kernel handle to the kernel stub.
64	llvm::DenseMap<llvm::GlobalValue , llvm::Function > KernelStubs;
65	struct VarInfo {
66	llvm::GlobalVariable *Var;
67	const VarDecl *D;
68	DeviceVarFlags Flags;
69	};
70	llvm::SmallVector<VarInfo, `16`> DeviceVars;
71	/// Keeps track of variable containing handle of GPU binary. Populated by
72	/// ModuleCtorFunction() and used to create corresponding cleanup calls in
73	/// ModuleDtorFunction()
74	llvm::GlobalVariable GpuBinaryHandle = nullptr*;
75	/// Whether we generate relocatable device code.
76	bool RelocatableDeviceCode;
77	/// Mangle context for device.
78	std::unique_ptr<MangleContext> DeviceMC;
79
80	llvm::FunctionCallee getSetupArgumentFn() const;
81	llvm::FunctionCallee getLaunchFn() const;
82
83	llvm::FunctionType getRegisterGlobalsFnTy() const*;
84	llvm::FunctionType getCallbackFnTy() const*;
85	llvm::FunctionType getRegisterLinkedBinaryFnTy() const*;
86	std::string addPrefixToName(StringRef FuncName) const;
87	std::string addUnderscoredPrefixToName(StringRef FuncName) const;
88
89	/// Creates a function to register all kernel stubs generated in this module.
90	llvm::Function *makeRegisterGlobalsFn();
91
92	/// Helper function that generates a constant string and returns a pointer to
93	/// the start of the string. The result of this function can be used anywhere
94	/// where the C code specifies const char.*
95	llvm::Constant makeConstantString(const* std::string &Str,
96	const std::string &Name = "") {
97	return CGM.GetAddrOfConstantCString(Str, GlobalName: Name.c_str()).getPointer();
98	}
99
100	/// Helper function which generates an initialized constant array from Str,
101	/// and optionally sets section name and alignment. AddNull specifies whether
102	/// the array should nave NUL termination.
103	llvm::Constant *makeConstantArray(StringRef Str,
104	StringRef Name = "",
105	StringRef SectionName = "",
106	unsigned Alignment = `0`,
107	bool AddNull = false) {
108	llvm::Constant *Value =
109	llvm::ConstantDataArray::getString(Context, Initializer: Str, AddNull);
110	auto GV = new* llvm::GlobalVariable (
111	TheModule, Value->getType(), /isConstant=/true,
112	llvm::GlobalValue::PrivateLinkage, Value, Name);
113	if (!SectionName.empty()) {
114	GV->setSection(SectionName);
115	// Mark the address as used which make sure that this section isn't
116	// merged and we will really have it in the object file.
117	GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
118	}
119	if (Alignment)
120	GV->setAlignment(llvm::Align (Alignment));
121	return GV;
122	}
123
124	/// Helper function that generates an empty dummy function returning void.
125	llvm::Function makeDummyFunction(llvm::FunctionType FnTy) {
126	assert(FnTy->getReturnType()->isVoidTy() &&
127	"Can only generate dummy functions returning void!");
128	llvm::Function *DummyFunc = llvm::Function::Create(
129	Ty: FnTy, Linkage: llvm::GlobalValue::InternalLinkage, N: "dummy", M: &TheModule);
130
131	llvm::BasicBlock *DummyBlock =
132	llvm::BasicBlock::Create(Context, Name: "", Parent: DummyFunc);
133	CGBuilderTy FuncBuilder(CGM, Context);
134	FuncBuilder.SetInsertPoint(DummyBlock);
135	FuncBuilder.CreateRetVoid();
136
137	return DummyFunc;
138	}
139
140	Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args);
141	Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
142	FunctionArgList &Args);
143	void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
144	void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
145	std::string getDeviceSideName(const NamedDecl *ND) override;
146
147	void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
148	bool Extern, bool Constant) {
149	DeviceVars.push_back({&Var,
150	VD,
151	{DeviceVarFlags::Variable, Extern, Constant,
152	VD->hasAttr<HIPManagedAttr>(),
153	/Normalized/ false, `0`}});
154	}
155	void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
156	bool Extern, int Type) {
157	DeviceVars.push_back(Elt: {.Var: &Var,
158	.D: VD,
159	.Flags: {DeviceVarFlags::Surface, Extern, /Constant/ false,
160	/Managed/ false,
161	/Normalized/ false, Type}});
162	}
163	void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
164	bool Extern, int Type, bool Normalized) {
165	DeviceVars.push_back(Elt: {.Var: &Var,
166	.D: VD,
167	.Flags: {DeviceVarFlags::Texture, Extern, /Constant/ false,
168	/Managed/ false, Normalized, Type}});
169	}
170
171	/// Creates module constructor function
172	llvm::Function *makeModuleCtorFunction();
173	/// Creates module destructor function
174	llvm::Function *makeModuleDtorFunction();
175	/// Transform managed variables for device compilation.
176	void transformManagedVars();
177	/// Create offloading entries to register globals in RDC mode.
178	void createOffloadingEntries();
179
180	public:
181	CGNVCUDARuntime(CodeGenModule &CGM);
182
183	llvm::GlobalValue getKernelHandle(llvm::Function F, GlobalDecl GD) override;
184	llvm::Function getKernelStub(llvm::GlobalValue Handle) override {
185	auto Loc = KernelStubs.find(Val: Handle);
186	assert(Loc != KernelStubs.end());
187	return Loc ->second;
188	}
189	void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
190	void handleVarRegistration(const VarDecl *VD,
191	llvm::GlobalVariable &Var) override;
192	void
193	internalizeDeviceSideVar(const VarDecl *D,
194	llvm::GlobalValue::LinkageTypes &Linkage) override;
195
196	llvm::Function *finalizeModule() override;
197	};
198
199	} // end anonymous namespace
200
201	std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
202	return (Prefix + FuncName).str();
203	}
204	std::string
205	CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
206	return ("__" + Prefix + FuncName).str();
207	}
208
209	static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
210	// If the host and device have different C++ ABIs, mark it as the device
211	// mangle context so that the mangling needs to retrieve the additional
212	// device lambda mangling number instead of the regular host one.
213	if (CGM.getContext().getAuxTargetInfo() &&
214	CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
215	CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
216	return std::unique_ptr<MangleContext>(
217	CGM.getContext().createDeviceMangleContext(
218	T: *CGM.getContext().getAuxTargetInfo()));
219	}
220
221	return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
222	T: CGM.getContext().getAuxTargetInfo()));
223	}
224
225	CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
226	: CGCUDARuntime (CGM), Context(CGM.getLLVMContext()),
227	TheModule(CGM.getModule()),
228	RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
229	DeviceMC(InitDeviceMC(CGM)) {
230	IntTy = CGM.IntTy;
231	SizeTy = CGM.SizeTy;
232	VoidTy = CGM.VoidTy;
233	PtrTy = CGM.UnqualPtrTy;
234
235	if (CGM.getLangOpts().OffloadViaLLVM)
236	Prefix = "llvm";
237	else if (CGM.getLangOpts().HIP)
238	Prefix = "hip";
239	else
240	Prefix = "cuda";
241	}
242
243	llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
244	// cudaError_t cudaSetupArgument(void , size_t, size_t)*
245	llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
246	return CGM.CreateRuntimeFunction(
247	Ty: llvm::FunctionType::get(Result: IntTy, Params, isVarArg: false),
248	Name: addPrefixToName(FuncName: "SetupArgument"));
249	}
250
251	llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
252	if (CGM.getLangOpts().HIP) {
253	// hipError_t hipLaunchByPtr(char );*
254	return CGM.CreateRuntimeFunction(
255	Ty: llvm::FunctionType::get(Result: IntTy, Params: PtrTy, isVarArg: false), Name: "hipLaunchByPtr");
256	}
257	// cudaError_t cudaLaunch(char );*
258	return CGM.CreateRuntimeFunction(Ty: llvm::FunctionType::get(Result: IntTy, Params: PtrTy, isVarArg: false),
259	Name: "cudaLaunch");
260	}
261
262	llvm::FunctionType CGNVCUDARuntime::getRegisterGlobalsFnTy() const* {
263	return llvm::FunctionType::get(Result: VoidTy, Params: PtrTy, isVarArg: false);
264	}
265
266	llvm::FunctionType CGNVCUDARuntime::getCallbackFnTy() const* {
267	return llvm::FunctionType::get(Result: VoidTy, Params: PtrTy, isVarArg: false);
268	}
269
270	llvm::FunctionType CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const* {
271	llvm::Type *Params[] = {llvm::PointerType::getUnqual(C&: Context), PtrTy, PtrTy,
272	llvm::PointerType::getUnqual(C&: Context)};
273	return llvm::FunctionType::get(Result: VoidTy, Params, isVarArg: false);
274	}
275
276	std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
277	GlobalDecl GD;
278	// D could be either a kernel or a variable.
279	if (auto *FD = dyn_cast<FunctionDecl>(Val: ND))
280	GD = GlobalDecl (FD, KernelReferenceKind::Kernel);
281	else
282	GD = GlobalDecl (ND);
283	std::string DeviceSideName;
284	MangleContext *MC;
285	if (CGM.getLangOpts().CUDAIsDevice)
286	MC = &CGM.getCXXABI().getMangleContext();
287	else
288	MC = DeviceMC.get();
289	if (MC->shouldMangleDeclName(D: ND)) {
290	SmallString<`256`> Buffer;
291	llvm::raw_svector_ostream Out(Buffer);
292	MC->mangleName(GD, Out);
293	DeviceSideName = std::string (Out.str());
294	} else
295	DeviceSideName = std::string (ND->getIdentifier()->getName());
296
297	// Make unique name for device side static file-scope variable for HIP.
298	if (CGM.getContext().shouldExternalize(ND) &&
299	CGM.getLangOpts().GPURelocatableDeviceCode) {
300	SmallString<`256`> Buffer;
301	llvm::raw_svector_ostream Out(Buffer);
302	Out << DeviceSideName;
303	CGM.printPostfixForExternalizedDecl(Out, ND);
304	DeviceSideName = std::string (Out.str());
305	}
306	return DeviceSideName;
307	}
308
309	void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
310	FunctionArgList &Args) {
311	EmittedKernels.push_back(Elt: {.Kernel: CGF.CurFn, .D: CGF.CurFuncDecl});
312	if (auto *GV =
313	dyn_cast<llvm::GlobalVariable>(Val: KernelHandles [CGF.CurFn->getName()])) {
314	GV->setLinkage(CGF.CurFn->getLinkage());
315	GV->setInitializer(CGF.CurFn);
316	}
317	if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
318	CudaFeature::CUDA_USES_NEW_LAUNCH) \|\|
319	(CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) \|\|
320	(CGF.getLangOpts().OffloadViaLLVM))
321	emitDeviceStubBodyNew(CGF, Args);
322	else
323	emitDeviceStubBodyLegacy(CGF, Args);
324	}
325
326	/// CUDA passes the arguments with a level of indirection. For example, a
327	/// (void, short, void) is passed as {void , short , void *} to the launch
328	/// function. For the LLVM/offload launch we flatten the arguments into the
329	/// struct directly. In addition, we include the size of the arguments, thus
330	/// pass {sizeof({void , short, void }), ptr to {void , short, void },
331	/// nullptr}. The last nullptr needs to be initialized to an array of pointers
332	/// pointing to the arguments if we want to offload to the host.
333	Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
334	FunctionArgList &Args) {
335	SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
336	for (auto &Arg : Args)
337	ArgTypes.push_back(Elt: CGF.ConvertTypeForMem(T: Arg->getType()));
338	llvm::StructType *KernelArgsTy = llvm::StructType::create(Elements: ArgTypes);
339
340	auto *Int64Ty = CGF.Builder.getInt64Ty();
341	KernelLaunchParamsTypes.push_back(Elt: Int64Ty);
342	KernelLaunchParamsTypes.push_back(Elt: PtrTy);
343	KernelLaunchParamsTypes.push_back(Elt: PtrTy);
344
345	llvm::StructType *KernelLaunchParamsTy =
346	llvm::StructType::create(Elements: KernelLaunchParamsTypes);
347	Address KernelArgs = CGF.CreateTempAllocaWithoutCast(
348	Ty: KernelArgsTy, align: CharUnits::fromQuantity(Quantity: `16`), Name: "kernel_args");
349	Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast(
350	Ty: KernelLaunchParamsTy, align: CharUnits::fromQuantity(Quantity: `16`),
351	Name: "kernel_launch_params");
352
353	auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(Ty: KernelArgsTy);
354	CGF.Builder.CreateStore(Val: llvm::ConstantInt::get(Ty: Int64Ty, V: KernelArgsSize),
355	Addr: CGF.Builder.CreateStructGEP(Addr: KernelLaunchParams, Index: `0`));
356	CGF.Builder.CreateStore(Val: KernelArgs.emitRawPointer(CGF),
357	Addr: CGF.Builder.CreateStructGEP(Addr: KernelLaunchParams, Index: `1`));
358	CGF.Builder.CreateStore(Val: llvm::Constant::getNullValue(Ty: PtrTy),
359	Addr: CGF.Builder.CreateStructGEP(Addr: KernelLaunchParams, Index: `2`));
360
361	for (unsigned i = `0`; i < Args.size(); ++i) {
362	auto *ArgVal = CGF.Builder.CreateLoad(Addr: CGF.GetAddrOfLocalVar(VD: Args [i]));
363	CGF.Builder.CreateStore(Val: ArgVal, Addr: CGF.Builder.CreateStructGEP(Addr: KernelArgs, Index: i));
364	}
365
366	return KernelLaunchParams;
367	}
368
369	Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF,
370	FunctionArgList &Args) {
371	// Calculate amount of space we will need for all arguments. If we have no
372	// args, allocate a single pointer so we still have a valid pointer to the
373	// argument array that we can pass to runtime, even if it will be unused.
374	Address KernelArgs = CGF.CreateTempAlloca(
375	Ty: PtrTy, align: CharUnits::fromQuantity(Quantity: `16`), Name: "kernel_args",
376	ArraySize: llvm::ConstantInt::get(Ty: SizeTy, V: std::max<size_t>(a: `1`, b: Args.size())));
377	// Store pointers to the arguments in a locally allocated launch_args.
378	for (unsigned i = `0`; i < Args.size(); ++i) {
379	llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(VD: Args [i]).emitRawPointer(CGF);
380	llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(V: VarPtr, DestTy: PtrTy);
381	CGF.Builder.CreateDefaultAlignedStore(
382	Val: VoidVarPtr, Addr: CGF.Builder.CreateConstGEP1_32(
383	Ty: PtrTy, Ptr: KernelArgs.emitRawPointer(CGF), Idx0: i));
384	}
385	return KernelArgs;
386	}
387
388	// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
389	// array and kernels are launched using cudaLaunchKernel().
390	void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
391	FunctionArgList &Args) {
392	// Build the shadow stack entry at the very start of the function.
393	Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM
394	? prepareKernelArgsLLVMOffload(CGF, Args)
395	: prepareKernelArgs(CGF, Args);
396
397	llvm::BasicBlock *EndBlock = CGF.createBasicBlock(name: "setup.end");
398
399	// Lookup cudaLaunchKernel/hipLaunchKernel function.
400	// HIP kernel launching API name depends on -fgpu-default-stream option. For
401	// the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
402	// it is hipLaunchKernel_spt.
403	// cudaError_t cudaLaunchKernel(const void func, dim3 gridDim, dim3 blockDim,*
404	// void args, size_t sharedMem,
405	// cudaStream_t stream);
406	// hipError_t hipLaunchKernel[_spt](const void func, dim3 gridDim,*
407	// dim3 blockDim, void args,
408	// size_t sharedMem, hipStream_t stream);
409	TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
410	DeclContext *DC = TranslationUnitDecl::castToDeclContext(D: TUDecl);
411	std::string KernelLaunchAPI = "LaunchKernel";
412	if (CGF.getLangOpts().GPUDefaultStream ==
413	LangOptions::GPUDefaultStreamKind::PerThread) {
414	if (CGF.getLangOpts().HIP)
415	KernelLaunchAPI = KernelLaunchAPI + "_spt";
416	else if (CGF.getLangOpts().CUDA)
417	KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
418	}
419	auto LaunchKernelName = addPrefixToName(FuncName: KernelLaunchAPI);
420	const IdentifierInfo &cudaLaunchKernelII =
421	CGM.getContext().Idents.get(Name: LaunchKernelName);
422	FunctionDecl cudaLaunchKernelFD = nullptr*;
423	for (auto *Result : DC->lookup(Name: &cudaLaunchKernelII)) {
424	if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Val: Result))
425	cudaLaunchKernelFD = FD;
426	}
427
428	if (cudaLaunchKernelFD == nullptr) {
429	CGM.Error(loc: CGF.CurFuncDecl->getLocation(),
430	error: "Can't find declaration for " + LaunchKernelName);
431	return;
432	}
433	// Create temporary dim3 grid_dim, block_dim.
434	ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(i: `1`);
435	QualType Dim3Ty = GridDimParam->getType();
436	Address GridDim =
437	CGF.CreateMemTemp(T: Dim3Ty, Align: CharUnits::fromQuantity(Quantity: `8`), Name: "grid_dim");
438	Address BlockDim =
439	CGF.CreateMemTemp(T: Dim3Ty, Align: CharUnits::fromQuantity(Quantity: `8`), Name: "block_dim");
440	Address ShmemSize =
441	CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
442	Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
443	llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
444	Ty: llvm::FunctionType::get(Result: IntTy,
445	Params: {/gridDim=/GridDim.getType(),
446	/blockDim=/BlockDim.getType(),
447	/ShmemSize=/ShmemSize.getType(),
448	/Stream=/Stream.getType()},
449	/isVarArg=/false),
450	Name: addUnderscoredPrefixToName(FuncName: "PopCallConfiguration"));
451
452	CGF.EmitRuntimeCallOrInvoke(callee: cudaPopConfigFn, args: {GridDim.emitRawPointer(CGF),
453	BlockDim.emitRawPointer(CGF),
454	ShmemSize.emitRawPointer(CGF),
455	Stream.emitRawPointer(CGF)});
456
457	// Emit the call to cudaLaunch
458	llvm::Value *Kernel =
459	CGF.Builder.CreatePointerCast(V: KernelHandles [CGF.CurFn->getName()], DestTy: PtrTy);
460	CallArgList LaunchKernelArgs;
461	LaunchKernelArgs.add(rvalue: RValue::get(V: Kernel),
462	type: cudaLaunchKernelFD->getParamDecl(i: `0`)->getType());
463	LaunchKernelArgs.add(rvalue: RValue::getAggregate(addr: GridDim), type: Dim3Ty);
464	LaunchKernelArgs.add(rvalue: RValue::getAggregate(addr: BlockDim), type: Dim3Ty);
465	LaunchKernelArgs.add(rvalue: RValue::get(Addr: KernelArgs, CGF),
466	type: cudaLaunchKernelFD->getParamDecl(i: `3`)->getType());
467	LaunchKernelArgs.add(rvalue: RValue::get(V: CGF.Builder.CreateLoad(Addr: ShmemSize)),
468	type: cudaLaunchKernelFD->getParamDecl(i: `4`)->getType());
469	LaunchKernelArgs.add(rvalue: RValue::get(V: CGF.Builder.CreateLoad(Addr: Stream)),
470	type: cudaLaunchKernelFD->getParamDecl(i: `5`)->getType());
471
472	QualType QT = cudaLaunchKernelFD->getType();
473	QualType CQT = QT.getCanonicalType();
474	llvm::Type *Ty = CGM.getTypes().ConvertType(T: CQT);
475	llvm::FunctionType *FTy = cast<llvm::FunctionType>(Val: Ty);
476
477	const CGFunctionInfo &FI =
478	CGM.getTypes().arrangeFunctionDeclaration(GD: cudaLaunchKernelFD);
479	llvm::FunctionCallee cudaLaunchKernelFn =
480	CGM.CreateRuntimeFunction(Ty: FTy, Name: LaunchKernelName);
481	CGF.EmitCall(CallInfo: FI, Callee: CGCallee::forDirect(functionPtr: cudaLaunchKernelFn), ReturnValue: ReturnValueSlot (),
482	Args: LaunchKernelArgs);
483
484	// To prevent CUDA device stub functions from being merged by ICF in MSVC
485	// environment, create an unique global variable for each kernel and write to
486	// the variable in the device stub.
487	if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
488	!CGF.getLangOpts().HIP) {
489	llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Val: Kernel);
490	std::string GlobalVarName = (KernelFunction->getName() + ".id").str();
491
492	llvm::GlobalVariable *HandleVar =
493	CGM.getModule().getNamedGlobal(Name: GlobalVarName);
494	if (!HandleVar) {
495	HandleVar = new llvm::GlobalVariable (
496	CGM.getModule(), CGM.Int8Ty,
497	/Constant=/false, KernelFunction->getLinkage(),
498	llvm::ConstantInt::get(Ty: CGM.Int8Ty, V: `0`), GlobalVarName);
499	HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
500	HandleVar->setVisibility(KernelFunction->getVisibility());
501	if (KernelFunction->hasComdat())
502	HandleVar->setComdat(CGM.getModule().getOrInsertComdat(Name: GlobalVarName));
503	}
504
505	CGF.Builder.CreateAlignedStore(Val: llvm::ConstantInt::get(Ty: CGM.Int8Ty, V: `1`),
506	Addr: HandleVar, Align: CharUnits::One(),
507	/IsVolatile=/true);
508	}
509
510	CGF.EmitBranch(Block: EndBlock);
511
512	CGF.EmitBlock(BB: EndBlock);
513	}
514
515	void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
516	FunctionArgList &Args) {
517	// Emit a call to cudaSetupArgument for each arg in Args.
518	llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
519	llvm::BasicBlock *EndBlock = CGF.createBasicBlock(name: "setup.end");
520	CharUnits Offset = CharUnits::Zero();
521	for (const VarDecl *A : Args) {
522	auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
523	Offset = Offset.alignTo(Align: TInfo.Align);
524	llvm::Value *Args[] = {
525	CGF.Builder.CreatePointerCast(
526	V: CGF.GetAddrOfLocalVar(VD: A).emitRawPointer(CGF), DestTy: PtrTy),
527	llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
528	llvm::ConstantInt::get(Ty: SizeTy, V: Offset.getQuantity()),
529	};
530	llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
531	llvm::Constant *Zero = llvm::ConstantInt::get(Ty: IntTy, V: `0`);
532	llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(LHS: CB, RHS: Zero);
533	llvm::BasicBlock *NextBlock = CGF.createBasicBlock(name: "setup.next");
534	CGF.Builder.CreateCondBr(Cond: CBZero, True: NextBlock, False: EndBlock);
535	CGF.EmitBlock(BB: NextBlock);
536	Offset += TInfo.Width;
537	}
538
539	// Emit the call to cudaLaunch
540	llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
541	llvm::Value *Arg =
542	CGF.Builder.CreatePointerCast(V: KernelHandles [CGF.CurFn->getName()], DestTy: PtrTy);
543	CGF.EmitRuntimeCallOrInvoke(callee: cudaLaunchFn, args: Arg);
544	CGF.EmitBranch(Block: EndBlock);
545
546	CGF.EmitBlock(BB: EndBlock);
547	}
548
549	// Replace the original variable Var with the address loaded from variable
550	// ManagedVar populated by HIP runtime.
551	static void replaceManagedVar(llvm::GlobalVariable *Var,
552	llvm::GlobalVariable *ManagedVar) {
553	SmallVector<SmallVector<llvm::User *, `8`>, `8`> WorkList;
554	for (auto &&VarUse : Var->uses()) {
555	WorkList.push_back(Elt: {VarUse.getUser()});
556	}
557	while (!WorkList.empty()) {
558	auto &&WorkItem = WorkList.pop_back_val();
559	auto *U = WorkItem.back();
560	if (isa<llvm::ConstantExpr>(Val: U)) {
561	for (auto &&UU : U->uses()) {
562	WorkItem.push_back(Elt: UU.getUser());
563	WorkList.push_back(Elt: WorkItem);
564	WorkItem.pop_back();
565	}
566	continue;
567	}
568	if (auto *I = dyn_cast<llvm::Instruction>(Val: U)) {
569	llvm::Value *OldV = Var;
570	llvm::Instruction NewV = new* llvm::LoadInst (
571	Var->getType(), ManagedVar, "ld.managed", false,
572	llvm::Align (Var->getAlignment()), I->getIterator());
573	WorkItem.pop_back();
574	// Replace constant expressions directly or indirectly using the managed
575	// variable with instructions.
576	for (auto &&Op : WorkItem) {
577	auto *CE = cast<llvm::ConstantExpr>(Val: Op);
578	auto *NewInst = CE->getAsInstruction();
579	NewInst->insertBefore(BB&: *I->getParent(), InsertPos: I->getIterator());
580	NewInst->replaceUsesOfWith(From: OldV, To: NewV);
581	OldV = CE;
582	NewV = NewInst;
583	}
584	I->replaceUsesOfWith(From: OldV, To: NewV);
585	} else {
586	llvm_unreachable("Invalid use of managed variable");
587	}
588	}
589	}
590
591	/// Creates a function that sets up state on the host side for CUDA objects that
592	/// have a presence on both the host and device sides. Specifically, registers
593	/// the host side of kernel functions and device global variables with the CUDA
594	/// runtime.
595	/// \code
596	/// void __cuda_register_globals(void* GpuBinaryHandle) {*
597	/// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
598	/// ...
599	/// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
600	/// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
601	/// ...
602	/// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
603	/// }
604	/// \endcode
605	llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
606	// No need to register anything
607	if (EmittedKernels.empty() && DeviceVars.empty())
608	return nullptr;
609
610	llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
611	Ty: getRegisterGlobalsFnTy(), Linkage: llvm::GlobalValue::InternalLinkage,
612	N: addUnderscoredPrefixToName(FuncName: "_register_globals"), M: &TheModule);
613	llvm::BasicBlock *EntryBB =
614	llvm::BasicBlock::Create(Context, Name: "entry", Parent: RegisterKernelsFunc);
615	CGBuilderTy Builder(CGM, Context);
616	Builder.SetInsertPoint(EntryBB);
617
618	// void __cudaRegisterFunction(void , const char , char , const char ,*
619	// int, uint3, uint3, dim3, dim3, int)*
620	llvm::Type *RegisterFuncParams[] = {
621	PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
622	PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(C&: Context)};
623	llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
624	Ty: llvm::FunctionType::get(Result: IntTy, Params: RegisterFuncParams, isVarArg: false),
625	Name: addUnderscoredPrefixToName(FuncName: "RegisterFunction"));
626
627	// Extract GpuBinaryHandle passed as the first argument passed to
628	// __cuda_register_globals() and generate __cudaRegisterFunction() call for
629	// each emitted kernel.
630	llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
631	for (auto &&I : EmittedKernels) {
632	llvm::Constant *KernelName =
633	makeConstantString(Str: getDeviceSideName(ND: cast<NamedDecl>(Val: I.D)));
634	llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(T: PtrTy);
635	llvm::Value *Args[] = {
636	&GpuBinaryHandlePtr,
637	KernelHandles [I.Kernel->getName()],
638	KernelName,
639	KernelName,
640	llvm::ConstantInt::get(Ty: IntTy, V: -`1`),
641	NullPtr,
642	NullPtr,
643	NullPtr,
644	NullPtr,
645	llvm::ConstantPointerNull::get(T: llvm::PointerType::getUnqual(C&: Context))};
646	Builder.CreateCall(Callee: RegisterFunc, Args);
647	}
648
649	llvm::Type *VarSizeTy = IntTy;
650	// For HIP or CUDA 9.0+, device variable size is type of `size_t`.
651	if (CGM.getLangOpts().HIP \|\|
652	ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
653	VarSizeTy = SizeTy;
654
655	// void __cudaRegisterVar(void , char , char , const char ,*
656	// int, int, int, int)
657	llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
658	IntTy, VarSizeTy, IntTy, IntTy};
659	llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
660	Ty: llvm::FunctionType::get(Result: VoidTy, Params: RegisterVarParams, isVarArg: false),
661	Name: addUnderscoredPrefixToName(FuncName: "RegisterVar"));
662	// void __hipRegisterManagedVar(void , char , char , const char ,*
663	// size_t, unsigned)
664	llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
665	PtrTy, VarSizeTy, IntTy};
666	llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
667	Ty: llvm::FunctionType::get(Result: VoidTy, Params: RegisterManagedVarParams, isVarArg: false),
668	Name: addUnderscoredPrefixToName(FuncName: "RegisterManagedVar"));
669	// void __cudaRegisterSurface(void , const struct surfaceReference ,*
670	// const void , const char , int, int);*
671	llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
672	Ty: llvm::FunctionType::get(
673	Result: VoidTy, Params: {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, isVarArg: false),
674	Name: addUnderscoredPrefixToName(FuncName: "RegisterSurface"));
675	// void __cudaRegisterTexture(void , const struct textureReference ,*
676	// const void , const char , int, int, int)*
677	llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
678	Ty: llvm::FunctionType::get(
679	Result: VoidTy, Params: {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, isVarArg: false),
680	Name: addUnderscoredPrefixToName(FuncName: "RegisterTexture"));
681	for (auto &&Info : DeviceVars) {
682	llvm::GlobalVariable *Var = Info.Var;
683	assert((!Var->isDeclaration() \|\| Info.Flags.isManaged()) &&
684	"External variables should not show up here, except HIP managed "
685	"variables");
686	llvm::Constant *VarName = makeConstantString(Str: getDeviceSideName(Info.D));
687	switch (Info.Flags.getKind()) {
688	case DeviceVarFlags::Variable: {
689	uint64_t VarSize =
690	CGM.getDataLayout().getTypeAllocSize(Ty: Var->getValueType());
691	if (Info.Flags.isManaged()) {
692	assert(Var->getName().ends_with(".managed") &&
693	"HIP managed variables not transformed");
694	auto *ManagedVar = CGM.getModule().getNamedGlobal(
695	Name: Var->getName().drop_back(N: StringRef(".managed").size()));
696	llvm::Value *Args[] = {
697	&GpuBinaryHandlePtr,
698	ManagedVar,
699	Var,
700	VarName,
701	llvm::ConstantInt::get(Ty: VarSizeTy, V: VarSize),
702	llvm::ConstantInt::get(Ty: IntTy, V: Var->getAlignment())};
703	if (!Var->isDeclaration())
704	Builder.CreateCall(Callee: RegisterManagedVar, Args);
705	} else {
706	llvm::Value *Args[] = {
707	&GpuBinaryHandlePtr,
708	Var,
709	VarName,
710	VarName,
711	llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isExtern()),
712	llvm::ConstantInt::get(Ty: VarSizeTy, V: VarSize),
713	llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isConstant()),
714	llvm::ConstantInt::get(Ty: IntTy, V: `0`)};
715	Builder.CreateCall(Callee: RegisterVar, Args);
716	}
717	break;
718	}
719	case DeviceVarFlags::Surface:
720	Builder.CreateCall(
721	Callee: RegisterSurf,
722	Args: {&GpuBinaryHandlePtr, Var, VarName, VarName,
723	llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.getSurfTexType()),
724	llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isExtern())});
725	break;
726	case DeviceVarFlags::Texture:
727	Builder.CreateCall(
728	Callee: RegisterTex,
729	Args: {&GpuBinaryHandlePtr, Var, VarName, VarName,
730	llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.getSurfTexType()),
731	llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isNormalized()),
732	llvm::ConstantInt::get(Ty: IntTy, V: Info.Flags.isExtern())});
733	break;
734	}
735	}
736
737	Builder.CreateRetVoid();
738	return RegisterKernelsFunc;
739	}
740
741	/// Creates a global constructor function for the module:
742	///
743	/// For CUDA:
744	/// \code
745	/// void __cuda_module_ctor() {
746	/// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
747	/// __cuda_register_globals(Handle);
748	/// }
749	/// \endcode
750	///
751	/// For HIP:
752	/// \code
753	/// void __hip_module_ctor() {
754	/// if (__hip_gpubin_handle == 0) {
755	/// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
756	/// __hip_register_globals(__hip_gpubin_handle);
757	/// }
758	/// }
759	/// \endcode
760	llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
761	bool IsHIP = CGM.getLangOpts().HIP;
762	bool IsCUDA = CGM.getLangOpts().CUDA;
763	// No need to generate ctors/dtors if there is no GPU binary.
764	StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
765	if (CudaGpuBinaryFileName.empty() && !IsHIP)
766	return nullptr;
767	if ((IsHIP \|\| (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
768	DeviceVars.empty())
769	return nullptr;
770
771	// void __{cuda\|hip}_register_globals(void handle);*
772	llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
773	// We always need a function to pass in as callback. Create a dummy
774	// implementation if we don't need to register anything.
775	if (RelocatableDeviceCode && !RegisterGlobalsFunc)
776	RegisterGlobalsFunc = makeDummyFunction(FnTy: getRegisterGlobalsFnTy());
777
778	// void * __{cuda\|hip}RegisterFatBinary(void );
779	llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
780	Ty: llvm::FunctionType::get(Result: PtrTy, Params: PtrTy, isVarArg: false),
781	Name: addUnderscoredPrefixToName(FuncName: "RegisterFatBinary"));
782	// struct { int magic, int version, void gpu_binary, void * dont_care };*
783	llvm::StructType *FatbinWrapperTy =
784	llvm::StructType::get(elt1: IntTy, elts: IntTy, elts: PtrTy, elts: PtrTy);
785
786	// Register GPU binary with the CUDA runtime, store returned handle in a
787	// global variable and save a reference in GpuBinaryHandle to be cleaned up
788	// in destructor on exit. Then associate all known kernels with the GPU binary
789	// handle so CUDA runtime can figure out what to call on the GPU side.
790	std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
791	if (!CudaGpuBinaryFileName.empty()) {
792	auto VFS = CGM.getFileSystem();
793	auto CudaGpuBinaryOrErr =
794	VFS ->getBufferForFile(Name: CudaGpuBinaryFileName, FileSize: -`1`, RequiresNullTerminator: false);
795	if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
796	CGM.getDiags().Report(diag::err_cannot_open_file)
797	<< CudaGpuBinaryFileName << EC.message();
798	return nullptr;
799	}
800	CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
801	}
802
803	llvm::Function *ModuleCtorFunc = llvm::Function::Create(
804	Ty: llvm::FunctionType::get(Result: VoidTy, isVarArg: false),
805	Linkage: llvm::GlobalValue::InternalLinkage,
806	N: addUnderscoredPrefixToName(FuncName: "_module_ctor"), M: &TheModule);
807	llvm::BasicBlock *CtorEntryBB =
808	llvm::BasicBlock::Create(Context, Name: "entry", Parent: ModuleCtorFunc);
809	CGBuilderTy CtorBuilder(CGM, Context);
810
811	CtorBuilder.SetInsertPoint(CtorEntryBB);
812
813	const char *FatbinConstantName;
814	const char *FatbinSectionName;
815	const char *ModuleIDSectionName;
816	StringRef ModuleIDPrefix;
817	llvm::Constant *FatBinStr;
818	unsigned FatMagic;
819	if (IsHIP) {
820	FatbinConstantName = ".hip_fatbin";
821	FatbinSectionName = ".hipFatBinSegment";
822
823	ModuleIDSectionName = "__hip_module_id";
824	ModuleIDPrefix = "__hip_";
825
826	if (CudaGpuBinary) {
827	// If fatbin is available from early finalization, create a string
828	// literal containing the fat binary loaded from the given file.
829	const unsigned HIPCodeObjectAlign = `4096`;
830	FatBinStr = makeConstantArray(Str: std::string (CudaGpuBinary ->getBuffer()), Name: "",
831	SectionName: FatbinConstantName, Alignment: HIPCodeObjectAlign);
832	} else {
833	// If fatbin is not available, create an external symbol
834	// __hip_fatbin in section .hip_fatbin. The external symbol is supposed
835	// to contain the fat binary but will be populated somewhere else,
836	// e.g. by lld through link script.
837	FatBinStr = new llvm::GlobalVariable (
838	CGM.getModule(), CGM.Int8Ty,
839	/isConstant=/true, llvm::GlobalValue::ExternalLinkage, nullptr,
840	"__hip_fatbin" + (CGM.getLangOpts().CUID.empty()
841	? ""
842	: "_" + CGM.getContext().getCUIDHash()),
843	nullptr, llvm::GlobalVariable::NotThreadLocal);
844	cast<llvm::GlobalVariable>(Val: FatBinStr)->setSection(FatbinConstantName);
845	}
846
847	FatMagic = HIPFatMagic;
848	} else {
849	if (RelocatableDeviceCode)
850	FatbinConstantName = CGM.getTriple().isMacOSX()
851	? "__NV_CUDA,__nv_relfatbin"
852	: "__nv_relfatbin";
853	else
854	FatbinConstantName =
855	CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
856	// NVIDIA's cuobjdump looks for fatbins in this section.
857	FatbinSectionName =
858	CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
859
860	ModuleIDSectionName = CGM.getTriple().isMacOSX()
861	? "__NV_CUDA,__nv_module_id"
862	: "__nv_module_id";
863	ModuleIDPrefix = "__nv_";
864
865	// For CUDA, create a string literal containing the fat binary loaded from
866	// the given file.
867	FatBinStr = makeConstantArray(Str: std::string (CudaGpuBinary ->getBuffer()), Name: "",
868	SectionName: FatbinConstantName, Alignment: `8`);
869	FatMagic = CudaFatMagic;
870	}
871
872	// Create initialized wrapper structure that points to the loaded GPU binary
873	ConstantInitBuilder Builder(CGM);
874	auto Values = Builder.beginStruct(structTy: FatbinWrapperTy);
875	// Fatbin wrapper magic.
876	Values.addInt(intTy: IntTy, value: FatMagic);
877	// Fatbin version.
878	Values.addInt(intTy: IntTy, value: `1`);
879	// Data.
880	Values.add(value: FatBinStr);
881	// Unused in fatbin v1.
882	Values.add(value: llvm::ConstantPointerNull::get(T: PtrTy));
883	llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
884	addUnderscoredPrefixToName(FuncName: "_fatbin_wrapper"), CGM.getPointerAlign(),
885	/constant/ true);
886	FatbinWrapper->setSection(FatbinSectionName);
887
888	// There is only one HIP fat binary per linked module, however there are
889	// multiple constructor functions. Make sure the fat binary is registered
890	// only once. The constructor functions are executed by the dynamic loader
891	// before the program gains control. The dynamic loader cannot execute the
892	// constructor functions concurrently since doing that would not guarantee
893	// thread safety of the loaded program. Therefore we can assume sequential
894	// execution of constructor functions here.
895	if (IsHIP) {
896	auto Linkage = RelocatableDeviceCode ? llvm::GlobalValue::ExternalLinkage
897	: llvm::GlobalValue::InternalLinkage;
898	llvm::BasicBlock *IfBlock =
899	llvm::BasicBlock::Create(Context, Name: "if", Parent: ModuleCtorFunc);
900	llvm::BasicBlock *ExitBlock =
901	llvm::BasicBlock::Create(Context, Name: "exit", Parent: ModuleCtorFunc);
902	// The name, size, and initialization pattern of this variable is part
903	// of HIP ABI.
904	GpuBinaryHandle = new llvm::GlobalVariable (
905	TheModule, PtrTy, /isConstant=/false, Linkage,
906	/Initializer=/
907	!RelocatableDeviceCode ? llvm::ConstantPointerNull::get(T: PtrTy)
908	: nullptr,
909	"__hip_gpubin_handle" + (CGM.getLangOpts().CUID.empty()
910	? ""
911	: "_" + CGM.getContext().getCUIDHash()));
912	GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
913	// Prevent the weak symbol in different shared libraries being merged.
914	if (Linkage != llvm::GlobalValue::InternalLinkage)
915	GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
916	Address GpuBinaryAddr(
917	GpuBinaryHandle, PtrTy,
918	CharUnits::fromQuantity(Quantity: GpuBinaryHandle->getAlignment()));
919	{
920	auto *HandleValue = CtorBuilder.CreateLoad(Addr: GpuBinaryAddr);
921	llvm::Constant *Zero =
922	llvm::Constant::getNullValue(Ty: HandleValue->getType());
923	llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(LHS: HandleValue, RHS: Zero);
924	CtorBuilder.CreateCondBr(Cond: EQZero, True: IfBlock, False: ExitBlock);
925	}
926	{
927	CtorBuilder.SetInsertPoint(IfBlock);
928	// GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
929	llvm::CallInst *RegisterFatbinCall =
930	CtorBuilder.CreateCall(Callee: RegisterFatbinFunc, Args: FatbinWrapper);
931	CtorBuilder.CreateStore(Val: RegisterFatbinCall, Addr: GpuBinaryAddr);
932	CtorBuilder.CreateBr(Dest: ExitBlock);
933	}
934	{
935	CtorBuilder.SetInsertPoint(ExitBlock);
936	// Call __hip_register_globals(GpuBinaryHandle);
937	if (RegisterGlobalsFunc) {
938	auto *HandleValue = CtorBuilder.CreateLoad(Addr: GpuBinaryAddr);
939	CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
940	}
941	}
942	} else if (!RelocatableDeviceCode) {
943	// Register binary with CUDA runtime. This is substantially different in
944	// default mode vs. separate compilation!
945	// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
946	llvm::CallInst *RegisterFatbinCall =
947	CtorBuilder.CreateCall(Callee: RegisterFatbinFunc, Args: FatbinWrapper);
948	GpuBinaryHandle = new llvm::GlobalVariable (
949	TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
950	llvm::ConstantPointerNull::get(T: PtrTy), "__cuda_gpubin_handle");
951	GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
952	CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
953	CGM.getPointerAlign());
954
955	// Call __cuda_register_globals(GpuBinaryHandle);
956	if (RegisterGlobalsFunc)
957	CtorBuilder.CreateCall(Callee: RegisterGlobalsFunc, Args: RegisterFatbinCall);
958
959	// Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
960	if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
961	CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
962	// void __cudaRegisterFatBinaryEnd(void );
963	llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
964	Ty: llvm::FunctionType::get(Result: VoidTy, Params: PtrTy, isVarArg: false),
965	Name: "__cudaRegisterFatBinaryEnd");
966	CtorBuilder.CreateCall(Callee: RegisterFatbinEndFunc, Args: RegisterFatbinCall);
967	}
968	} else {
969	// Generate a unique module ID.
970	SmallString<`64`> ModuleID;
971	llvm::raw_svector_ostream OS(ModuleID);
972	OS << ModuleIDPrefix << llvm::format(Fmt: "%" PRIx64, Vals: FatbinWrapper->getGUID());
973	llvm::Constant *ModuleIDConstant = makeConstantArray(
974	Str: std::string(ModuleID), Name: "", SectionName: ModuleIDSectionName, Alignment: `32`, /AddNull=/true);
975
976	// Create an alias for the FatbinWrapper that nvcc will look for.
977	llvm::GlobalAlias::create(Linkage: llvm::GlobalValue::ExternalLinkage,
978	Name: Twine("__fatbinwrap") + ModuleID, Aliasee: FatbinWrapper);
979
980	// void __cudaRegisterLinkedBinary%ModuleID%(void ()(void ), void ,*
981	// void , void ()(void ))
982	SmallString<`128`> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
983	RegisterLinkedBinaryName += ModuleID;
984	llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
985	Ty: getRegisterLinkedBinaryFnTy(), Name: RegisterLinkedBinaryName);
986
987	assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
988	llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
989	makeDummyFunction(FnTy: getCallbackFnTy())};
990	CtorBuilder.CreateCall(Callee: RegisterLinkedBinaryFunc, Args);
991	}
992
993	// Create destructor and register it with atexit() the way NVCC does it. Doing
994	// it during regular destructor phase worked in CUDA before 9.2 but results in
995	// double-free in 9.2.
996	if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
997	// extern "C" int atexit(void (f)(void));*
998	llvm::FunctionType *AtExitTy =
999	llvm::FunctionType::get(Result: IntTy, Params: CleanupFn->getType(), isVarArg: false);
1000	llvm::FunctionCallee AtExitFunc =
1001	CGM.CreateRuntimeFunction(Ty: AtExitTy, Name: "atexit", ExtraAttrs: llvm::AttributeList (),
1002	/Local=/true);
1003	CtorBuilder.CreateCall(Callee: AtExitFunc, Args: CleanupFn);
1004	}
1005
1006	CtorBuilder.CreateRetVoid();
1007	return ModuleCtorFunc;
1008	}
1009
1010	/// Creates a global destructor function that unregisters the GPU code blob
1011	/// registered by constructor.
1012	///
1013	/// For CUDA:
1014	/// \code
1015	/// void __cuda_module_dtor() {
1016	/// __cudaUnregisterFatBinary(Handle);
1017	/// }
1018	/// \endcode
1019	///
1020	/// For HIP:
1021	/// \code
1022	/// void __hip_module_dtor() {
1023	/// if (__hip_gpubin_handle) {
1024	/// __hipUnregisterFatBinary(__hip_gpubin_handle);
1025	/// __hip_gpubin_handle = 0;
1026	/// }
1027	/// }
1028	/// \endcode
1029	llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
1030	// No need for destructor if we don't have a handle to unregister.
1031	if (!GpuBinaryHandle)
1032	return nullptr;
1033
1034	// void __cudaUnregisterFatBinary(void * handle);*
1035	llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
1036	Ty: llvm::FunctionType::get(Result: VoidTy, Params: PtrTy, isVarArg: false),
1037	Name: addUnderscoredPrefixToName(FuncName: "UnregisterFatBinary"));
1038
1039	llvm::Function *ModuleDtorFunc = llvm::Function::Create(
1040	Ty: llvm::FunctionType::get(Result: VoidTy, isVarArg: false),
1041	Linkage: llvm::GlobalValue::InternalLinkage,
1042	N: addUnderscoredPrefixToName(FuncName: "_module_dtor"), M: &TheModule);
1043
1044	llvm::BasicBlock *DtorEntryBB =
1045	llvm::BasicBlock::Create(Context, Name: "entry", Parent: ModuleDtorFunc);
1046	CGBuilderTy DtorBuilder(CGM, Context);
1047	DtorBuilder.SetInsertPoint(DtorEntryBB);
1048
1049	Address GpuBinaryAddr(
1050	GpuBinaryHandle, GpuBinaryHandle->getValueType(),
1051	CharUnits::fromQuantity(Quantity: GpuBinaryHandle->getAlignment()));
1052	auto *HandleValue = DtorBuilder.CreateLoad(Addr: GpuBinaryAddr);
1053	// There is only one HIP fat binary per linked module, however there are
1054	// multiple destructor functions. Make sure the fat binary is unregistered
1055	// only once.
1056	if (CGM.getLangOpts().HIP) {
1057	llvm::BasicBlock *IfBlock =
1058	llvm::BasicBlock::Create(Context, Name: "if", Parent: ModuleDtorFunc);
1059	llvm::BasicBlock *ExitBlock =
1060	llvm::BasicBlock::Create(Context, Name: "exit", Parent: ModuleDtorFunc);
1061	llvm::Constant *Zero = llvm::Constant::getNullValue(Ty: HandleValue->getType());
1062	llvm::Value *NEZero = DtorBuilder.CreateICmpNE(LHS: HandleValue, RHS: Zero);
1063	DtorBuilder.CreateCondBr(Cond: NEZero, True: IfBlock, False: ExitBlock);
1064
1065	DtorBuilder.SetInsertPoint(IfBlock);
1066	DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1067	DtorBuilder.CreateStore(Val: Zero, Addr: GpuBinaryAddr);
1068	DtorBuilder.CreateBr(Dest: ExitBlock);
1069
1070	DtorBuilder.SetInsertPoint(ExitBlock);
1071	} else {
1072	DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1073	}
1074	DtorBuilder.CreateRetVoid();
1075	return ModuleDtorFunc;
1076	}
1077
1078	CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
1079	return new CGNVCUDARuntime (CGM);
1080	}
1081
1082	void CGNVCUDARuntime::internalizeDeviceSideVar(
1083	const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
1084	// For -fno-gpu-rdc, host-side shadows of external declarations of device-side
1085	// global variables become internal definitions. These have to be internal in
1086	// order to prevent name conflicts with global host variables with the same
1087	// name in a different TUs.
1088	//
1089	// For -fgpu-rdc, the shadow variables should not be internalized because
1090	// they may be accessed by different TU.
1091	if (CGM.getLangOpts().GPURelocatableDeviceCode)
1092	return;
1093
1094	// __shared__ variables are odd. Shadows do get created, but
1095	// they are not registered with the CUDA runtime, so they
1096	// can't really be used to access their device-side
1097	// counterparts. It's not clear yet whether it's nvcc's bug or
1098	// a feature, but we've got to do the same for compatibility.
1099	if (D->hasAttr<CUDADeviceAttr>() \|\| D->hasAttr<CUDAConstantAttr>() \|\|
1100	D->hasAttr<CUDASharedAttr>() \|\|
1101	D->getType()->isCUDADeviceBuiltinSurfaceType() \|\|
1102	D->getType()->isCUDADeviceBuiltinTextureType()) {
1103	Linkage = llvm::GlobalValue::InternalLinkage;
1104	}
1105	}
1106
1107	void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
1108	llvm::GlobalVariable &GV) {
1109	if (D->hasAttr<CUDADeviceAttr>() \|\| D->hasAttr<CUDAConstantAttr>()) {
1110	// Shadow variables and their properties must be registered with CUDA
1111	// runtime. Skip Extern global variables, which will be registered in
1112	// the TU where they are defined.
1113	//
1114	// Don't register a C++17 inline variable. The local symbol can be
1115	// discarded and referencing a discarded local symbol from outside the
1116	// comdat (__cuda_register_globals) is disallowed by the ELF spec.
1117	//
1118	// HIP managed variables need to be always recorded in device and host
1119	// compilations for transformation.
1120	//
1121	// HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
1122	// added to llvm.compiler-used, therefore they are safe to be registered.
1123	if ((!D->hasExternalStorage() && !D->isInline()) \|\|
1124	CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) \|\|
1125	D->hasAttr<HIPManagedAttr>()) {
1126	registerDeviceVar(D, GV, !D->hasDefinition(),
1127	D->hasAttr<CUDAConstantAttr>());
1128	}
1129	} else if (D->getType()->isCUDADeviceBuiltinSurfaceType() \|\|
1130	D->getType()->isCUDADeviceBuiltinTextureType()) {
1131	// Builtin surfaces and textures and their template arguments are
1132	// also registered with CUDA runtime.
1133	const auto *TD = cast<ClassTemplateSpecializationDecl>(
1134	D->getType()->castAs<RecordType>()->getDecl());
1135	const TemplateArgumentList &Args = TD->getTemplateArgs();
1136	if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1137	assert(Args.size() == `2` &&
1138	"Unexpected number of template arguments of CUDA device "
1139	"builtin surface type.");
1140	auto SurfType = Args [`1`].getAsIntegral();
1141	if (!D->hasExternalStorage())
1142	registerDeviceSurf(VD: D, Var&: GV, Extern: !D->hasDefinition(), Type: SurfType.getSExtValue());
1143	} else {
1144	assert(Args.size() == `3` &&
1145	"Unexpected number of template arguments of CUDA device "
1146	"builtin texture type.");
1147	auto TexType = Args [`1`].getAsIntegral();
1148	auto Normalized = Args [`2`].getAsIntegral();
1149	if (!D->hasExternalStorage())
1150	registerDeviceTex(VD: D, Var&: GV, Extern: !D->hasDefinition(), Type: TexType.getSExtValue(),
1151	Normalized: Normalized.getZExtValue());
1152	}
1153	}
1154	}
1155
1156	// Transform managed variables to pointers to managed variables in device code.
1157	// Each use of the original managed variable is replaced by a load from the
1158	// transformed managed variable. The transformed managed variable contains
1159	// the address of managed memory which will be allocated by the runtime.
1160	void CGNVCUDARuntime::transformManagedVars() {
1161	for (auto &&Info : DeviceVars) {
1162	llvm::GlobalVariable *Var = Info.Var;
1163	if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1164	Info.Flags.isManaged()) {
1165	auto ManagedVar = new* llvm::GlobalVariable (
1166	CGM.getModule(), Var->getType(),
1167	/isConstant=/false, Var->getLinkage(),
1168	/Init=/Var->isDeclaration()
1169	? nullptr
1170	: llvm::ConstantPointerNull::get(T: Var->getType()),
1171	/Name=/"", /InsertBefore=/nullptr,
1172	llvm::GlobalVariable::NotThreadLocal,
1173	CGM.getContext().getTargetAddressSpace(AS: CGM.getLangOpts().CUDAIsDevice
1174	? LangAS::cuda_device
1175	: LangAS::Default));
1176	ManagedVar->setDSOLocal(Var->isDSOLocal());
1177	ManagedVar->setVisibility(Var->getVisibility());
1178	ManagedVar->setExternallyInitialized(true);
1179	replaceManagedVar(Var, ManagedVar);
1180	ManagedVar->takeName(V: Var);
1181	Var->setName(Twine(ManagedVar->getName()) + ".managed");
1182	// Keep managed variables even if they are not used in device code since
1183	// they need to be allocated by the runtime.
1184	if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
1185	assert(!ManagedVar->isDeclaration());
1186	CGM.addCompilerUsedGlobal(GV: Var);
1187	CGM.addCompilerUsedGlobal(GV: ManagedVar);
1188	}
1189	}
1190	}
1191	}
1192
1193	// Creates offloading entries for all the kernels and globals that must be
1194	// registered. The linker will provide a pointer to this section so we can
1195	// register the symbols with the linked device image.
1196	void CGNVCUDARuntime::createOffloadingEntries() {
1197	llvm::object::OffloadKind Kind = CGM.getLangOpts().HIP
1198	? llvm::object::OffloadKind::OFK_HIP
1199	: llvm::object::OffloadKind::OFK_Cuda;
1200	// For now, just spoof this as OpenMP because that's the runtime it uses.
1201	if (CGM.getLangOpts().OffloadViaLLVM)
1202	Kind = llvm::object::OffloadKind::OFK_OpenMP;
1203
1204	llvm::Module &M = CGM.getModule();
1205	for (KernelInfo &I : EmittedKernels)
1206	llvm::offloading::emitOffloadingEntry(
1207	M, Kind, Addr: KernelHandles [I.Kernel->getName()],
1208	Name: getDeviceSideName(ND: cast<NamedDecl>(Val: I.D)), /Flags=/Size: `0`, /Data=/Flags: `0`,
1209	Data: llvm::offloading::OffloadGlobalEntry);
1210
1211	for (VarInfo &I : DeviceVars) {
1212	uint64_t VarSize =
1213	CGM.getDataLayout().getTypeAllocSize(Ty: I.Var->getValueType());
1214	int32_t Flags =
1215	(I.Flags.isExtern()
1216	? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
1217	: `0`) \|
1218	(I.Flags.isConstant()
1219	? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
1220	: `0`) \|
1221	(I.Flags.isNormalized()
1222	? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
1223	: `0`);
1224	if (I.Flags.getKind() == DeviceVarFlags::Variable) {
1225	if (I.Flags.isManaged()) {
1226	assert(I.Var->getName().ends_with(".managed") &&
1227	"HIP managed variables not transformed");
1228
1229	auto *ManagedVar = M.getNamedGlobal(
1230	Name: I.Var->getName().drop_back(N: StringRef(".managed").size()));
1231	llvm::offloading::emitOffloadingEntry(
1232	M, Kind, Addr: I.Var, Name: getDeviceSideName(I.D), Size: VarSize,
1233	Flags: llvm::offloading::OffloadGlobalManagedEntry \| Flags,
1234	/Data=/I.Var->getAlignment(), AuxAddr: ManagedVar);
1235	} else {
1236	llvm::offloading::emitOffloadingEntry(
1237	M, Kind, Addr: I.Var, Name: getDeviceSideName(I.D), Size: VarSize,
1238	Flags: llvm::offloading::OffloadGlobalEntry \| Flags,
1239	/Data=/`0`);
1240	}
1241	} else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
1242	llvm::offloading::emitOffloadingEntry(
1243	M, Kind, Addr: I.Var, Name: getDeviceSideName(I.D), Size: VarSize,
1244	Flags: llvm::offloading::OffloadGlobalSurfaceEntry \| Flags,
1245	Data: I.Flags.getSurfTexType());
1246	} else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
1247	llvm::offloading::emitOffloadingEntry(
1248	M, Kind, Addr: I.Var, Name: getDeviceSideName(I.D), Size: VarSize,
1249	Flags: llvm::offloading::OffloadGlobalTextureEntry \| Flags,
1250	Data: I.Flags.getSurfTexType());
1251	}
1252	}
1253	}
1254
1255	// Returns module constructor to be added.
1256	llvm::Function *CGNVCUDARuntime::finalizeModule() {
1257	transformManagedVars();
1258	if (CGM.getLangOpts().CUDAIsDevice) {
1259	// Mark ODR-used device variables as compiler used to prevent it from being
1260	// eliminated by optimization. This is necessary for device variables
1261	// ODR-used by host functions. Sema correctly marks them as ODR-used no
1262	// matter whether they are ODR-used by device or host functions.
1263	//
1264	// We do not need to do this if the variable has used attribute since it
1265	// has already been added.
1266	//
1267	// Static device variables have been externalized at this point, therefore
1268	// variables with LLVM private or internal linkage need not be added.
1269	for (auto &&Info : DeviceVars) {
1270	auto Kind = Info.Flags.getKind();
1271	if (!Info.Var->isDeclaration() &&
1272	!llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
1273	(Kind == DeviceVarFlags::Variable \|\|
1274	Kind == DeviceVarFlags::Surface \|\|
1275	Kind == DeviceVarFlags::Texture) &&
1276	Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
1277	CGM.addCompilerUsedGlobal(GV: Info.Var);
1278	}
1279	}
1280	return nullptr;
1281	}
1282	if (CGM.getLangOpts().OffloadViaLLVM \|\|
1283	(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
1284	createOffloadingEntries();
1285	else
1286	return makeModuleCtorFunction();
1287
1288	return nullptr;
1289	}
1290
1291	llvm::GlobalValue CGNVCUDARuntime::getKernelHandle(llvm::Function F,
1292	GlobalDecl GD) {
1293	auto Loc = KernelHandles.find(Val: F->getName());
1294	if (Loc != KernelHandles.end()) {
1295	auto OldHandle = Loc ->second;
1296	if (KernelStubs [OldHandle] == F)
1297	return OldHandle;
1298
1299	// We've found the function name, but F itself has changed, so we need to
1300	// update the references.
1301	if (CGM.getLangOpts().HIP) {
1302	// For HIP compilation the handle itself does not change, so we only need
1303	// to update the Stub value.
1304	KernelStubs [OldHandle] = F;
1305	return OldHandle;
1306	}
1307	// For non-HIP compilation, erase the old Stub and fall-through to creating
1308	// new entries.
1309	KernelStubs.erase(Val: OldHandle);
1310	}
1311
1312	if (!CGM.getLangOpts().HIP) {
1313	KernelHandles [F->getName()] = F;
1314	KernelStubs [F] = F;
1315	return F;
1316	}
1317
1318	auto Var = new* llvm::GlobalVariable (
1319	TheModule, F->getType(), /isConstant=/true, F->getLinkage(),
1320	/Initializer=/nullptr,
1321	CGM.getMangledName(
1322	GD: GD.getWithKernelReferenceKind(Kind: KernelReferenceKind::Kernel)));
1323	Var->setAlignment(CGM.getPointerAlign().getAsAlign());
1324	Var->setDSOLocal(F->isDSOLocal());
1325	Var->setVisibility(F->getVisibility());
1326	auto *FD = cast<FunctionDecl>(Val: GD.getDecl());
1327	auto *FT = FD->getPrimaryTemplate();
1328	if (!FT \|\| FT->isThisDeclarationADefinition())
1329	CGM.maybeSetTrivialComdat(FD, Var);
1330	KernelHandles [F->getName()] = Var;
1331	KernelStubs [Var] = F;
1332	return Var;
1333	}
1334

source code of clang/lib/CodeGen/CGCUDANV.cpp