//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
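  // For example, a HIP kernel parameter declared as `float *p` arrives as a
  // generic pointer and is rewritten to a global one, i.e. `ptr` becomes
  // `ptr addrspace(1)` (assuming the usual amdgcn mapping of generic = 0 and
  // global = 1).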
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                    QualType Ty) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
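/// For example, <4 x half> packs into 2 registers, <3 x float> takes 3 (not
/// the 4 its padded in-memory size would suggest), and a struct of an int and
/// a float takes 2.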
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
    }
  }
}

Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
  llvm_unreachable("AMDGPU does not support varargs");
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}

/// For kernels all parameters are really passed in a special buffer. It doesn't
/// make sense to pass anything byval, so everything must be direct.
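/// For non-OpenCL languages (HIP/CUDA), aggregate kernel arguments are instead
/// passed indirectly by reference in the constant address space, which in
/// practice shows up as a byref parameter in address space 4 on amdgcn.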
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing
  // a new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

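/// Classify a non-kernel argument against the shared VGPR budget tracked in
/// \p NumRegsLeft: small aggregates (<= 64 bits) are packed into i16, i32 or
/// [2 x i32], larger aggregates are passed directly while registers remain,
/// and anything that no longer fits is passed by reference in the private
/// address space.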
ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T,
                                 QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
}

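// Kernels and device-side variables must stay visible to the HIP/OpenCL
// runtime, which locates them by name in the loaded code object, so hidden
// visibility on such symbols is upgraded to protected instead.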
static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;

    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;

    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
  }
}

/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
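/// For example, this defines __oclc_ABI_version, roughly:
///   @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4)
///                         constant i32 <code object version>
/// so the device libraries can branch on the ABI version and IPSCCP can fold
/// those branches away once the value is known.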
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "__oclc_ABI_version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(
                        OriginalGV->getLinkage()))
    return;

  if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
      llvm::CodeObjectVersionKind::COV_None)
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0,
// which results in incorrectly transformed IR. Therefore, instead of
// emitting null pointers in private and local address spaces, a null
// pointer in generic address space is emitted which is casted to a
// pointer in local or private address space.
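// For example, a null pointer in the private address space is emitted roughly
// as `addrspacecast (ptr null to ptr addrspace(5))`, which the backend folds
// to the target's real null value for that address space (all-ones rather
// than 0).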
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

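// Map a Clang SyncScope onto the corresponding AMDGPU sync scope name
// ("singlethread", "wavefront", "workgroup", "agent", or "" for system); for
// non-seq_cst orderings the "one-as" (single address space) variant is used,
// e.g. "workgroup-one-as".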
llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on stack and stores the block literal
/// to it and passes its pointer to the block invoke function. The kernel
/// has "enqueued-block" function attribute and kernel argument metadata.
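///
/// A rough sketch of the generated wrapper (names illustrative, casts and
/// alignments elided):
///   define internal amdgpu_kernel void @<invoke>_kernel(%block.ty %literal, ...) {
///   entry:
///     %block = alloca %block.ty
///     store %block.ty %literal, ptr addrspace(5) %block
///     call void @<invoke>(ptr %block, ...)  ; remaining args forwarded as-is
///     ret void
///   }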
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}

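/// Emit the "amdgpu-flat-work-group-size" function attribute from an explicit
/// amdgpu_flat_work_group_size attribute, or derive a single value from
/// reqd_work_group_size (X * Y * Z). The chosen bounds are optionally reported
/// back through \p MinThreadsVal and \p MaxThreadsVal.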
void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

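/// Emit the "amdgpu-waves-per-eu" function attribute as "min" or "min,max",
/// depending on whether the attribute specifies an upper bound.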
void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}