AMDGPUCallLowering.cpp source code [llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp]

1	//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	///
9	/// \file
10	/// This file implements the lowering of LLVM calls to machine code calls for
11	/// GlobalISel.
12	///
13	//===----------------------------------------------------------------------===//
14
15	#include "AMDGPUCallLowering.h"
16	#include "AMDGPU.h"
17	#include "AMDGPULegalizerInfo.h"
18	#include "AMDGPUTargetMachine.h"
19	#include "SIMachineFunctionInfo.h"
20	#include "SIRegisterInfo.h"
21	#include "llvm/CodeGen/Analysis.h"
22	#include "llvm/CodeGen/FunctionLoweringInfo.h"
23	#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24	#include "llvm/CodeGen/MachineFrameInfo.h"
25	#include "llvm/IR/IntrinsicsAMDGPU.h"
26
27	#define DEBUG_TYPE "amdgpu-call-lowering"
28
29	using namespace llvm;
30
31	namespace {
32
33	/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
34	static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
35	Register ValVReg, const CCValAssign &VA) {
36	if (VA.getLocVT().getSizeInBits() < `32`) {
37	// 16-bit types are reported as legal for 32-bit registers. We need to
38	// extend and do a 32-bit copy to avoid the verifier complaining about it.
39	return Handler.MIRBuilder.buildAnyExt(Res: LLT::scalar(SizeInBits: `32`), Op: ValVReg).getReg(Idx: `0`);
40	}
41
42	return Handler.extendRegister(ValReg: ValVReg, VA);
43	}
44
45	struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
46	AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
47	MachineInstrBuilder MIB)
48	: OutgoingValueHandler (B, MRI), MIB (MIB) {}
49
50	MachineInstrBuilder MIB;
51
52	Register getStackAddress(uint64_t Size, int64_t Offset,
53	MachinePointerInfo &MPO,
54	ISD::ArgFlagsTy Flags) override {
55	llvm_unreachable("not implemented");
56	}
57
58	void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
59	const MachinePointerInfo &MPO,
60	const CCValAssign &VA) override {
61	llvm_unreachable("not implemented");
62	}
63
64	void assignValueToReg(Register ValVReg, Register PhysReg,
65	const CCValAssign &VA) override {
66	Register ExtReg = extendRegisterMin32(Handler&: *this, ValVReg, VA);
67
68	// If this is a scalar return, insert a readfirstlane just in case the value
69	// ends up in a VGPR.
70	// FIXME: Assert this is a shader return.
71	const SIRegisterInfo *TRI
72	= static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
73	if (TRI->isSGPRReg(MRI, Reg: PhysReg)) {
74	LLT Ty = MRI.getType(Reg: ExtReg);
75	LLT S32 = LLT::scalar(SizeInBits: `32`);
76	if (Ty != S32) {
77	// FIXME: We should probably support readfirstlane intrinsics with all
78	// legal 32-bit types.
79	assert(Ty.getSizeInBits() == `32`);
80	if (Ty.isPointer())
81	ExtReg = MIRBuilder.buildPtrToInt(Dst: S32, Src: ExtReg).getReg(Idx: `0`);
82	else
83	ExtReg = MIRBuilder.buildBitcast(Dst: S32, Src: ExtReg).getReg(Idx: `0`);
84	}
85
86	auto ToSGPR = MIRBuilder
87	.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
88	{MRI.getType(ExtReg)})
89	.addReg(ExtReg);
90	ExtReg = ToSGPR.getReg(`0`);
91	}
92
93	MIRBuilder.buildCopy(Res: PhysReg, Op: ExtReg);
94	MIB.addUse(RegNo: PhysReg, Flags: RegState::Implicit);
95	}
96	};
97
98	struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
99	uint64_t StackUsed = `0`;
100
101	AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
102	: IncomingValueHandler (B, MRI) {}
103
104	Register getStackAddress(uint64_t Size, int64_t Offset,
105	MachinePointerInfo &MPO,
106	ISD::ArgFlagsTy Flags) override {
107	auto &MFI = MIRBuilder.getMF().getFrameInfo();
108
109	// Byval is assumed to be writable memory, but other stack passed arguments
110	// are not.
111	const bool IsImmutable = !Flags.isByVal();
112	int FI = MFI.CreateFixedObject(Size, SPOffset: Offset, IsImmutable);
113	MPO = MachinePointerInfo::getFixedStack(MF&: MIRBuilder.getMF(), FI);
114	auto AddrReg = MIRBuilder.buildFrameIndex(
115	Res: LLT::pointer(AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, SizeInBits: `32`), Idx: FI);
116	StackUsed = std::max(StackUsed, Size + Offset);
117	return AddrReg.getReg(Idx: `0`);
118	}
119
120	void assignValueToReg(Register ValVReg, Register PhysReg,
121	const CCValAssign &VA) override {
122	markPhysRegUsed(PhysReg);
123
124	if (VA.getLocVT().getSizeInBits() < `32`) {
125	// 16-bit types are reported as legal for 32-bit registers. We need to do
126	// a 32-bit copy, and truncate to avoid the verifier complaining about it.
127	auto Copy = MIRBuilder.buildCopy(Res: LLT::scalar(SizeInBits: `32`), Op: PhysReg);
128
129	// If we have signext/zeroext, it applies to the whole 32-bit register
130	// before truncation.
131	auto Extended =
132	buildExtensionHint(VA, SrcReg: Copy.getReg(Idx: `0`), NarrowTy: LLT (VA.getLocVT()));
133	MIRBuilder.buildTrunc(Res: ValVReg, Op: Extended);
134	return;
135	}
136
137	IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
138	}
139
140	void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
141	const MachinePointerInfo &MPO,
142	const CCValAssign &VA) override {
143	MachineFunction &MF = MIRBuilder.getMF();
144
145	auto MMO = MF.getMachineMemOperand(
146	MPO, MachineMemOperand::MOLoad \| MachineMemOperand::MOInvariant, MemTy,
147	inferAlignFromPtrInfo(MF, MPO));
148	MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
149	}
150
151	/// How the physical register gets marked varies between formal
152	/// parameters (it's a basic-block live-in), and a call instruction
153	/// (it's an implicit-def of the BL).
154	virtual void markPhysRegUsed(unsigned PhysReg) = `0`;
155	};
156
157	struct FormalArgHandler : public AMDGPUIncomingArgHandler {
158	FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
159	: AMDGPUIncomingArgHandler (B, MRI) {}
160
161	void markPhysRegUsed(unsigned PhysReg) override {
162	MIRBuilder.getMBB().addLiveIn(PhysReg);
163	}
164	};
165
166	struct CallReturnHandler : public AMDGPUIncomingArgHandler {
167	CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
168	MachineInstrBuilder MIB)
169	: AMDGPUIncomingArgHandler (MIRBuilder, MRI), MIB (MIB) {}
170
171	void markPhysRegUsed(unsigned PhysReg) override {
172	MIB.addDef(RegNo: PhysReg, Flags: RegState::Implicit);
173	}
174
175	MachineInstrBuilder MIB;
176	};
177
178	struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
179	/// For tail calls, the byte offset of the call's argument area from the
180	/// callee's. Unused elsewhere.
181	int FPDiff;
182
183	// Cache the SP register vreg if we need it more than once in this call site.
184	Register SPReg;
185
186	bool IsTailCall;
187
188	AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
189	MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
190	bool IsTailCall = false, int FPDiff = `0`)
191	: AMDGPUOutgoingValueHandler (MIRBuilder, MRI, MIB), FPDiff(FPDiff),
192	IsTailCall(IsTailCall) {}
193
194	Register getStackAddress(uint64_t Size, int64_t Offset,
195	MachinePointerInfo &MPO,
196	ISD::ArgFlagsTy Flags) override {
197	MachineFunction &MF = MIRBuilder.getMF();
198	const LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, SizeInBits: `32`);
199	const LLT S32 = LLT::scalar(SizeInBits: `32`);
200
201	if (IsTailCall) {
202	Offset += FPDiff;
203	int FI = MF.getFrameInfo().CreateFixedObject(Size, SPOffset: Offset, IsImmutable: true);
204	auto FIReg = MIRBuilder.buildFrameIndex(Res: PtrTy, Idx: FI);
205	MPO = MachinePointerInfo::getFixedStack(MF, FI);
206	return FIReg.getReg(Idx: `0`);
207	}
208
209	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
210
211	if (!SPReg) {
212	const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>();
213	if (ST.enableFlatScratch()) {
214	// The stack is accessed unswizzled, so we can use a regular copy.
215	SPReg = MIRBuilder.buildCopy(Res: PtrTy,
216	Op: MFI->getStackPtrOffsetReg()).getReg(Idx: `0`);
217	} else {
218	// The address we produce here, without knowing the use context, is going
219	// to be interpreted as a vector address, so we need to convert to a
220	// swizzled address.
221	SPReg = MIRBuilder.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {PtrTy},
222	{MFI->getStackPtrOffsetReg()}).getReg(`0`);
223	}
224	}
225
226	auto OffsetReg = MIRBuilder.buildConstant(Res: S32, Val: Offset);
227
228	auto AddrReg = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: SPReg, Op1: OffsetReg);
229	MPO = MachinePointerInfo::getStack(MF, Offset);
230	return AddrReg.getReg(Idx: `0`);
231	}
232
233	void assignValueToReg(Register ValVReg, Register PhysReg,
234	const CCValAssign &VA) override {
235	MIB.addUse(RegNo: PhysReg, Flags: RegState::Implicit);
236	Register ExtReg = extendRegisterMin32(Handler&: *this, ValVReg, VA);
237	MIRBuilder.buildCopy(Res: PhysReg, Op: ExtReg);
238	}
239
240	void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
241	const MachinePointerInfo &MPO,
242	const CCValAssign &VA) override {
243	MachineFunction &MF = MIRBuilder.getMF();
244	uint64_t LocMemOffset = VA.getLocMemOffset();
245	const auto &ST = MF.getSubtarget<GCNSubtarget>();
246
247	auto MMO = MF.getMachineMemOperand(
248	MPO, MachineMemOperand::MOStore, MemTy,
249	commonAlignment(ST.getStackAlignment(), LocMemOffset));
250	MIRBuilder.buildStore(ValVReg, Addr, *MMO);
251	}
252
253	void assignValueToAddress(const CallLowering::ArgInfo &Arg,
254	unsigned ValRegIndex, Register Addr, LLT MemTy,
255	const MachinePointerInfo &MPO,
256	const CCValAssign &VA) override {
257	Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
258	? extendRegister(ValReg: Arg.Regs [ValRegIndex], VA)
259	: Arg.Regs [ValRegIndex];
260	assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA);
261	}
262	};
263	}
264
265	AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
266	: CallLowering (&TLI) {
267	}
268
269	// FIXME: Compatibility shim
270	static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
271	switch (MIOpc) {
272	case TargetOpcode::G_SEXT:
273	return ISD::SIGN_EXTEND;
274	case TargetOpcode::G_ZEXT:
275	return ISD::ZERO_EXTEND;
276	case TargetOpcode::G_ANYEXT:
277	return ISD::ANY_EXTEND;
278	default:
279	llvm_unreachable("not an extend opcode");
280	}
281	}
282
283	bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
284	CallingConv::ID CallConv,
285	SmallVectorImpl<BaseArgInfo> &Outs,
286	bool IsVarArg) const {
287	// For shaders. Vector types should be explicitly handled by CC.
288	if (AMDGPU::isEntryFunctionCC(CC: CallConv))
289	return true;
290
291	SmallVector<CCValAssign, `16`> ArgLocs;
292	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
293	CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
294	MF.getFunction().getContext());
295
296	return checkReturn(CCInfo, Outs, Fn: TLI.CCAssignFnForReturn(CC: CallConv, IsVarArg));
297	}
298
299	/// Lower the return value for the already existing \p Ret. This assumes that
300	/// \p B's insertion point is correct.
301	bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
302	const Value *Val, ArrayRef<Register> VRegs,
303	MachineInstrBuilder &Ret) const {
304	if (!Val)
305	return true;
306
307	auto &MF = B.getMF();
308	const auto &F = MF.getFunction();
309	const DataLayout &DL = MF.getDataLayout();
310	MachineRegisterInfo *MRI = B.getMRI();
311	LLVMContext &Ctx = F.getContext();
312
313	CallingConv::ID CC = F.getCallingConv();
314	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
315
316	SmallVector<EVT, `8`> SplitEVTs;
317	ComputeValueVTs(TLI, DL, Ty: Val->getType(), ValueVTs&: SplitEVTs);
318	assert(VRegs.size() == SplitEVTs.size() &&
319	"For each split Type there should be exactly one VReg.");
320
321	SmallVector<ArgInfo, `8`> SplitRetInfos;
322
323	for (unsigned i = `0`; i < SplitEVTs.size(); ++i) {
324	EVT VT = SplitEVTs [i];
325	Register Reg = VRegs [i];
326	ArgInfo RetInfo(Reg, VT.getTypeForEVT(Context&: Ctx), `0`);
327	setArgFlags(Arg&: RetInfo, OpIdx: AttributeList::ReturnIndex, DL, FuncInfo: F);
328
329	if (VT.isScalarInteger()) {
330	unsigned ExtendOp = TargetOpcode::G_ANYEXT;
331	if (RetInfo.Flags [`0`].isSExt()) {
332	assert(RetInfo.Regs.size() == `1` && "expect only simple return values");
333	ExtendOp = TargetOpcode::G_SEXT;
334	} else if (RetInfo.Flags [`0`].isZExt()) {
335	assert(RetInfo.Regs.size() == `1` && "expect only simple return values");
336	ExtendOp = TargetOpcode::G_ZEXT;
337	}
338
339	EVT ExtVT = TLI.getTypeForExtReturn(Context&: Ctx, VT,
340	ExtendKind: extOpcodeToISDExtOpcode(MIOpc: ExtendOp));
341	if (ExtVT != VT) {
342	RetInfo.Ty = ExtVT.getTypeForEVT(Context&: Ctx);
343	LLT ExtTy = getLLTForType(Ty&: *RetInfo.Ty, DL);
344	Reg = B.buildInstr(Opc: ExtendOp, DstOps: {ExtTy}, SrcOps: {Reg}).getReg(Idx: `0`);
345	}
346	}
347
348	if (Reg != RetInfo.Regs [`0`]) {
349	RetInfo.Regs [`0`] = Reg;
350	// Reset the arg flags after modifying Reg.
351	setArgFlags(Arg&: RetInfo, OpIdx: AttributeList::ReturnIndex, DL, FuncInfo: F);
352	}
353
354	splitToValueTypes(OrigArgInfo: RetInfo, SplitArgs&: SplitRetInfos, DL, CallConv: CC);
355	}
356
357	CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, IsVarArg: F.isVarArg());
358
359	OutgoingValueAssigner Assigner(AssignFn);
360	AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
361	return determineAndHandleAssignments(Handler&: RetHandler, Assigner, Args&: SplitRetInfos, MIRBuilder&: B,
362	CallConv: CC, IsVarArg: F.isVarArg());
363	}
364
365	bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
366	ArrayRef<Register> VRegs,
367	FunctionLoweringInfo &FLI) const {
368
369	MachineFunction &MF = B.getMF();
370	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
371	MFI->setIfReturnsVoid(!Val);
372
373	assert(!Val == VRegs.empty() && "Return value without a vreg");
374
375	CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
376	const bool IsShader = AMDGPU::isShader(CC);
377	const bool IsWaveEnd =
378	(IsShader && MFI->returnsVoid()) \|\| AMDGPU::isKernel(CC);
379	if (IsWaveEnd) {
380	B.buildInstr(AMDGPU::S_ENDPGM)
381	.addImm(`0`);
382	return true;
383	}
384
385	unsigned ReturnOpc =
386	IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN;
387	auto Ret = B.buildInstrNoInsert(Opcode: ReturnOpc);
388
389	if (!FLI.CanLowerReturn)
390	insertSRetStores(MIRBuilder&: B, RetTy: Val->getType(), VRegs, DemoteReg: FLI.DemoteRegister);
391	else if (!lowerReturnVal(B, Val, VRegs, Ret&: Ret))
392	return false;
393
394	// TODO: Handle CalleeSavedRegsViaCopy.
395
396	B.insertInstr(MIB: Ret);
397	return true;
398	}
399
400	void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
401	uint64_t Offset) const {
402	MachineFunction &MF = B.getMF();
403	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
404	MachineRegisterInfo &MRI = MF.getRegInfo();
405	Register KernArgSegmentPtr =
406	MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
407	Register KernArgSegmentVReg = MRI.getLiveInVirtReg(PReg: KernArgSegmentPtr);
408
409	auto OffsetReg = B.buildConstant(Res: LLT::scalar(SizeInBits: `64`), Val: Offset);
410
411	B.buildPtrAdd(Res: DstReg, Op0: KernArgSegmentVReg, Op1: OffsetReg);
412	}
413
414	void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
415	uint64_t Offset,
416	Align Alignment) const {
417	MachineFunction &MF = B.getMF();
418	const Function &F = MF.getFunction();
419	const DataLayout &DL = F.getParent()->getDataLayout();
420	MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
421
422	LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: `64`);
423
424	SmallVector<ArgInfo, `32`> SplitArgs;
425	SmallVector<uint64_t> FieldOffsets;
426	splitToValueTypes(OrigArgInfo: OrigArg, SplitArgs, DL, CallConv: F.getCallingConv(), Offsets: &FieldOffsets);
427
428	unsigned Idx = `0`;
429	for (ArgInfo &SplitArg : SplitArgs) {
430	Register PtrReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);
431	lowerParameterPtr(DstReg: PtrReg, B, Offset: Offset + FieldOffsets [Idx]);
432
433	LLT ArgTy = getLLTForType(Ty&: *SplitArg.Ty, DL);
434	if (SplitArg.Flags [`0`].isPointer()) {
435	// Compensate for losing pointeriness in splitValueTypes.
436	LLT PtrTy = LLT::pointer(AddressSpace: SplitArg.Flags [`0`].getPointerAddrSpace(),
437	SizeInBits: ArgTy.getScalarSizeInBits());
438	ArgTy = ArgTy.isVector() ? LLT::vector(EC: ArgTy.getElementCount(), ScalarTy: PtrTy)
439	: PtrTy;
440	}
441
442	MachineMemOperand *MMO = MF.getMachineMemOperand(
443	PtrInfo,
444	f: MachineMemOperand::MOLoad \| MachineMemOperand::MODereferenceable \|
445	MachineMemOperand::MOInvariant,
446	MemTy: ArgTy, base_alignment: commonAlignment(A: Alignment, Offset: FieldOffsets [Idx]));
447
448	assert(SplitArg.Regs.size() == `1`);
449
450	B.buildLoad(Res: SplitArg.Regs [`0`], Addr: PtrReg, MMO&: *MMO);
451	++Idx;
452	}
453	}
454
455	// Allocate special inputs passed in user SGPRs.
456	static void allocateHSAUserSGPRs(CCState &CCInfo,
457	MachineIRBuilder &B,
458	MachineFunction &MF,
459	const SIRegisterInfo &TRI,
460	SIMachineFunctionInfo &Info) {
461	// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
462	const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
463	if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
464	Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
465	MF.addLiveIn(PReg: PrivateSegmentBufferReg, RC: &AMDGPU::SGPR_128RegClass);
466	CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg);
467	}
468
469	if (UserSGPRInfo.hasDispatchPtr()) {
470	Register DispatchPtrReg = Info.addDispatchPtr(TRI);
471	MF.addLiveIn(PReg: DispatchPtrReg, RC: &AMDGPU::SGPR_64RegClass);
472	CCInfo.AllocateReg(Reg: DispatchPtrReg);
473	}
474
475	const Module *M = MF.getFunction().getParent();
476	if (UserSGPRInfo.hasQueuePtr() &&
477	AMDGPU::getAMDHSACodeObjectVersion(M: *M) < AMDGPU::AMDHSA_COV5) {
478	Register QueuePtrReg = Info.addQueuePtr(TRI);
479	MF.addLiveIn(PReg: QueuePtrReg, RC: &AMDGPU::SGPR_64RegClass);
480	CCInfo.AllocateReg(Reg: QueuePtrReg);
481	}
482
483	if (UserSGPRInfo.hasKernargSegmentPtr()) {
484	MachineRegisterInfo &MRI = MF.getRegInfo();
485	Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
486	const LLT P4 = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: `64`);
487	Register VReg = MRI.createGenericVirtualRegister(Ty: P4);
488	MRI.addLiveIn(Reg: InputPtrReg, vreg: VReg);
489	B.getMBB().addLiveIn(PhysReg: InputPtrReg);
490	B.buildCopy(Res: VReg, Op: InputPtrReg);
491	CCInfo.AllocateReg(Reg: InputPtrReg);
492	}
493
494	if (UserSGPRInfo.hasDispatchID()) {
495	Register DispatchIDReg = Info.addDispatchID(TRI);
496	MF.addLiveIn(PReg: DispatchIDReg, RC: &AMDGPU::SGPR_64RegClass);
497	CCInfo.AllocateReg(Reg: DispatchIDReg);
498	}
499
500	if (UserSGPRInfo.hasFlatScratchInit()) {
501	Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
502	MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
503	CCInfo.AllocateReg(Reg: FlatScratchInitReg);
504	}
505
506	// TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
507	// these from the dispatch pointer.
508	}
509
510	bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
511	MachineIRBuilder &B, const Function &F,
512	ArrayRef<ArrayRef<Register>> VRegs) const {
513	MachineFunction &MF = B.getMF();
514	const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
515	MachineRegisterInfo &MRI = MF.getRegInfo();
516	SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
517	const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
518	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
519	const DataLayout &DL = F.getParent()->getDataLayout();
520
521	SmallVector<CCValAssign, `16`> ArgLocs;
522	CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
523
524	allocateHSAUserSGPRs(CCInfo, B, MF, TRI: TRI, Info&: Info);
525
526	unsigned i = `0`;
527	const Align KernArgBaseAlign(`16`);
528	const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
529	uint64_t ExplicitArgOffset = `0`;
530
531	// TODO: Align down to dword alignment and extract bits for extending loads.
532	for (auto &Arg : F.args()) {
533	const bool IsByRef = Arg.hasByRefAttr();
534	Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
535	unsigned AllocSize = DL.getTypeAllocSize(Ty: ArgTy);
536	if (AllocSize == `0`)
537	continue;
538
539	MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
540	Align ABIAlign = DL.getValueOrABITypeAlignment(Alignment: ParamAlign, Ty: ArgTy);
541
542	uint64_t ArgOffset = alignTo(Size: ExplicitArgOffset, A: ABIAlign) + BaseOffset;
543	ExplicitArgOffset = alignTo(Size: ExplicitArgOffset, A: ABIAlign) + AllocSize;
544
545	if (Arg.use_empty()) {
546	++i;
547	continue;
548	}
549
550	Align Alignment = commonAlignment(A: KernArgBaseAlign, Offset: ArgOffset);
551
552	if (IsByRef) {
553	unsigned ByRefAS = cast<PointerType>(Val: Arg.getType())->getAddressSpace();
554
555	assert(VRegs[i].size() == `1` &&
556	"expected only one register for byval pointers");
557	if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
558	lowerParameterPtr(DstReg: VRegs [i][`0`], B, Offset: ArgOffset);
559	} else {
560	const LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: `64`);
561	Register PtrReg = MRI.createGenericVirtualRegister(Ty: ConstPtrTy);
562	lowerParameterPtr(DstReg: PtrReg, B, Offset: ArgOffset);
563
564	B.buildAddrSpaceCast(Dst: VRegs [i][`0`], Src: PtrReg);
565	}
566	} else {
567	ArgInfo OrigArg(VRegs [i], Arg, i);
568	const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
569	setArgFlags(Arg&: OrigArg, OpIdx: OrigArgIdx, DL, FuncInfo: F);
570	lowerParameter(B, OrigArg, Offset: ArgOffset, Alignment);
571	}
572
573	++i;
574	}
575
576	TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: TRI, Info&: Info);
577	TLI.allocateSystemSGPRs(CCInfo, MF, Info&: Info, CallConv: F.getCallingConv(), IsShader: false*);
578	return true;
579	}
580
581	bool AMDGPUCallLowering::lowerFormalArguments(
582	MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
583	FunctionLoweringInfo &FLI) const {
584	CallingConv::ID CC = F.getCallingConv();
585
586	// The infrastructure for normal calling convention lowering is essentially
587	// useless for kernels. We want to avoid any kind of legalization or argument
588	// splitting.
589	if (CC == CallingConv::AMDGPU_KERNEL)
590	return lowerFormalArgumentsKernel(B, F, VRegs);
591
592	const bool IsGraphics = AMDGPU::isGraphics(CC);
593	const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
594
595	MachineFunction &MF = B.getMF();
596	MachineBasicBlock &MBB = B.getMBB();
597	MachineRegisterInfo &MRI = MF.getRegInfo();
598	SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
599	const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
600	const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
601	const DataLayout &DL = F.getParent()->getDataLayout();
602
603	SmallVector<CCValAssign, `16`> ArgLocs;
604	CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
605	const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
606
607	if (UserSGPRInfo.hasImplicitBufferPtr()) {
608	Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(TRI: *TRI);
609	MF.addLiveIn(PReg: ImplicitBufferPtrReg, RC: &AMDGPU::SGPR_64RegClass);
610	CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg);
611	}
612
613	// FIXME: This probably isn't defined for mesa
614	if (UserSGPRInfo.hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
615	Register FlatScratchInitReg = Info->addFlatScratchInit(TRI: *TRI);
616	MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
617	CCInfo.AllocateReg(Reg: FlatScratchInitReg);
618	}
619
620	SmallVector<ArgInfo, `32`> SplitArgs;
621	unsigned Idx = `0`;
622	unsigned PSInputNum = `0`;
623
624	// Insert the hidden sret parameter if the return value won't fit in the
625	// return registers.
626	if (!FLI.CanLowerReturn)
627	insertSRetIncomingArgument(F, SplitArgs, DemoteReg&: FLI.DemoteRegister, MRI, DL);
628
629	for (auto &Arg : F.args()) {
630	if (DL.getTypeStoreSize(Ty: Arg.getType()) == `0`)
631	continue;
632
633	const bool InReg = Arg.hasAttribute(Attribute::Kind: InReg);
634
635	if (Arg.hasAttribute(Attribute::Kind: SwiftSelf) \|\|
636	Arg.hasAttribute(Attribute::Kind: SwiftError) \|\|
637	Arg.hasAttribute(Attribute::Kind: Nest))
638	return false;
639
640	if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= `15`) {
641	const bool ArgUsed = !Arg.use_empty();
642	bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(Index: PSInputNum);
643
644	if (!SkipArg) {
645	Info->markPSInputAllocated(Index: PSInputNum);
646	if (ArgUsed)
647	Info->markPSInputEnabled(Index: PSInputNum);
648	}
649
650	++PSInputNum;
651
652	if (SkipArg) {
653	for (Register R : VRegs [Idx])
654	B.buildUndef(Res: R);
655
656	++Idx;
657	continue;
658	}
659	}
660
661	ArgInfo OrigArg(VRegs [Idx], Arg, Idx);
662	const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
663	setArgFlags(Arg&: OrigArg, OpIdx: OrigArgIdx, DL, FuncInfo: F);
664
665	splitToValueTypes(OrigArgInfo: OrigArg, SplitArgs, DL, CallConv: CC);
666	++Idx;
667	}
668
669	// At least one interpolation mode must be enabled or else the GPU will
670	// hang.
671	//
672	// Check PSInputAddr instead of PSInputEnable. The idea is that if the user
673	// set PSInputAddr, the user wants to enable some bits after the compilation
674	// based on run-time states. Since we can't know what the final PSInputEna
675	// will look like, so we shouldn't do anything here and the user should take
676	// responsibility for the correct programming.
677	//
678	// Otherwise, the following restrictions apply:
679	// - At least one of PERSP_ (0xF) or LINEAR_* (0x70) must be enabled.*
680	// - If POS_W_FLOAT (11) is enabled, at least one of PERSP_ must be*
681	// enabled too.
682	if (CC == CallingConv::AMDGPU_PS) {
683	if ((Info->getPSInputAddr() & `0x7F`) == `0` \|\|
684	((Info->getPSInputAddr() & `0xF`) == `0` &&
685	Info->isPSInputAllocated(Index: `11`))) {
686	CCInfo.AllocateReg(AMDGPU::VGPR0);
687	CCInfo.AllocateReg(AMDGPU::VGPR1);
688	Info->markPSInputAllocated(Index: `0`);
689	Info->markPSInputEnabled(Index: `0`);
690	}
691
692	if (Subtarget.isAmdPalOS()) {
693	// For isAmdPalOS, the user does not enable some bits after compilation
694	// based on run-time states; the register values being generated here are
695	// the final ones set in hardware. Therefore we need to apply the
696	// workaround to PSInputAddr and PSInputEnable together. (The case where
697	// a bit is set in PSInputAddr but not PSInputEnable is where the frontend
698	// set up an input arg for a particular interpolation mode, but nothing
699	// uses that input arg. Really we should have an earlier pass that removes
700	// such an arg.)
701	unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
702	if ((PsInputBits & `0x7F`) == `0` \|\|
703	((PsInputBits & `0xF`) == `0` &&
704	(PsInputBits >> `11` & `1`)))
705	Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr()));
706	}
707	}
708
709	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
710	CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, IsVarArg: F.isVarArg());
711
712	if (!MBB.empty())
713	B.setInstr(*MBB.begin());
714
715	if (!IsEntryFunc && !IsGraphics) {
716	// For the fixed ABI, pass workitem IDs in the last argument register.
717	TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: TRI, Info&: Info);
718
719	if (!Subtarget.enableFlatScratch())
720	CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());
721	TLI.allocateSpecialInputSGPRs(CCInfo, MF, TRI: TRI, Info&: Info);
722	}
723
724	IncomingValueAssigner Assigner(AssignFn);
725	if (!determineAssignments(Assigner, Args&: SplitArgs, CCInfo))
726	return false;
727
728	FormalArgHandler Handler(B, MRI);
729	if (!handleAssignments(Handler, Args&: SplitArgs, CCState&: CCInfo, ArgLocs, MIRBuilder&: B))
730	return false;
731
732	uint64_t StackSize = Assigner.StackSize;
733
734	// Start adding system SGPRs.
735	if (IsEntryFunc)
736	TLI.allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv: CC, IsShader: IsGraphics);
737
738	// When we tail call, we need to check if the callee's arguments will fit on
739	// the caller's stack. So, whenever we lower formal arguments, we should keep
740	// track of this information, since we might lower a tail call in this
741	// function later.
742	Info->setBytesInStackArgArea(StackSize);
743
744	// Move back to the end of the basic block.
745	B.setMBB(MBB);
746
747	return true;
748	}
749
750	bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
751	CCState &CCInfo,
752	SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
753	CallLoweringInfo &Info) const {
754	MachineFunction &MF = MIRBuilder.getMF();
755
756	// If there's no call site, this doesn't correspond to a call from the IR and
757	// doesn't need implicit inputs.
758	if (!Info.CB)
759	return true;
760
761	const AMDGPUFunctionArgInfo *CalleeArgInfo
762	= &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
763
764	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
765	const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();
766
767
768	// TODO: Unify with private memory register handling. This is complicated by
769	// the fact that at least in kernels, the input argument is not necessarily
770	// in the same location as the input.
771	AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
772	AMDGPUFunctionArgInfo::DISPATCH_PTR,
773	AMDGPUFunctionArgInfo::QUEUE_PTR,
774	AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
775	AMDGPUFunctionArgInfo::DISPATCH_ID,
776	AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
777	AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
778	AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
779	AMDGPUFunctionArgInfo::LDS_KERNEL_ID,
780	};
781
782	static constexpr StringLiteral ImplicitAttrNames[] = {
783	"amdgpu-no-dispatch-ptr",
784	"amdgpu-no-queue-ptr",
785	"amdgpu-no-implicitarg-ptr",
786	"amdgpu-no-dispatch-id",
787	"amdgpu-no-workgroup-id-x",
788	"amdgpu-no-workgroup-id-y",
789	"amdgpu-no-workgroup-id-z",
790	"amdgpu-no-lds-kernel-id",
791	};
792
793	MachineRegisterInfo &MRI = MF.getRegInfo();
794
795	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
796	const AMDGPULegalizerInfo *LI
797	= static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());
798
799	unsigned I = `0`;
800	for (auto InputID : InputRegs) {
801	const ArgDescriptor *OutgoingArg;
802	const TargetRegisterClass *ArgRC;
803	LLT ArgTy;
804
805	// If the callee does not use the attribute value, skip copying the value.
806	if (Info.CB->hasFnAttr(Kind: ImplicitAttrNames[I++]))
807	continue;
808
809	std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
810	CalleeArgInfo->getPreloadedValue(Value: InputID);
811	if (!OutgoingArg)
812	continue;
813
814	const ArgDescriptor *IncomingArg;
815	const TargetRegisterClass *IncomingArgRC;
816	std::tie(args&: IncomingArg, args&: IncomingArgRC, args&: ArgTy) =
817	CallerArgInfo.getPreloadedValue(Value: InputID);
818	assert(IncomingArgRC == ArgRC);
819
820	Register InputReg = MRI.createGenericVirtualRegister(Ty: ArgTy);
821
822	if (IncomingArg) {
823	LI->loadInputValue(DstReg: InputReg, B&: MIRBuilder, Arg: IncomingArg, ArgRC, ArgTy);
824	} else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
825	LI->getImplicitArgPtr(DstReg: InputReg, MRI, B&: MIRBuilder);
826	} else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
827	std::optional<uint32_t> Id =
828	AMDGPUMachineFunction::getLDSKernelIdMetadata(F: MF.getFunction());
829	if (Id) {
830	MIRBuilder.buildConstant(Res: InputReg, Val: *Id);
831	} else {
832	MIRBuilder.buildUndef(Res: InputReg);
833	}
834	} else {
835	// We may have proven the input wasn't needed, although the ABI is
836	// requiring it. We just need to allocate the register appropriately.
837	MIRBuilder.buildUndef(Res: InputReg);
838	}
839
840	if (OutgoingArg->isRegister()) {
841	ArgRegs.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
842	if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
843	report_fatal_error(reason: "failed to allocate implicit input argument");
844	} else {
845	LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
846	return false;
847	}
848	}
849
850	// Pack workitem IDs into a single register or pass it as is if already
851	// packed.
852	const ArgDescriptor *OutgoingArg;
853	const TargetRegisterClass *ArgRC;
854	LLT ArgTy;
855
856	std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
857	CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
858	if (!OutgoingArg)
859	std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
860	CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
861	if (!OutgoingArg)
862	std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
863	CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
864	if (!OutgoingArg)
865	return false;
866
867	auto WorkitemIDX =
868	CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
869	auto WorkitemIDY =
870	CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
871	auto WorkitemIDZ =
872	CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
873
874	const ArgDescriptor *IncomingArgX = std::get<`0`>(t&: WorkitemIDX);
875	const ArgDescriptor *IncomingArgY = std::get<`0`>(t&: WorkitemIDY);
876	const ArgDescriptor *IncomingArgZ = std::get<`0`>(t&: WorkitemIDZ);
877	const LLT S32 = LLT::scalar(SizeInBits: `32`);
878
879	const bool NeedWorkItemIDX = !Info.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x");
880	const bool NeedWorkItemIDY = !Info.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y");
881	const bool NeedWorkItemIDZ = !Info.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z");
882
883	// If incoming ids are not packed we need to pack them.
884	// FIXME: Should consider known workgroup size to eliminate known 0 cases.
885	Register InputReg;
886	if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
887	NeedWorkItemIDX) {
888	if (ST.getMaxWorkitemID(MF.getFunction(), `0`) != `0`) {
889	InputReg = MRI.createGenericVirtualRegister(Ty: S32);
890	LI->loadInputValue(DstReg: InputReg, B&: MIRBuilder, Arg: IncomingArgX,
891	ArgRC: std::get<`1`>(t&: WorkitemIDX), ArgTy: std::get<`2`>(t&: WorkitemIDX));
892	} else {
893	InputReg = MIRBuilder.buildConstant(Res: S32, Val: `0`).getReg(Idx: `0`);
894	}
895	}
896
897	if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
898	NeedWorkItemIDY && ST.getMaxWorkitemID(MF.getFunction(), `1`) != `0`) {
899	Register Y = MRI.createGenericVirtualRegister(Ty: S32);
900	LI->loadInputValue(DstReg: Y, B&: MIRBuilder, Arg: IncomingArgY, ArgRC: std::get<`1`>(t&: WorkitemIDY),
901	ArgTy: std::get<`2`>(t&: WorkitemIDY));
902
903	Y = MIRBuilder.buildShl(Dst: S32, Src0: Y, Src1: MIRBuilder.buildConstant(Res: S32, Val: `10`)).getReg(Idx: `0`);
904	InputReg = InputReg ? MIRBuilder.buildOr(Dst: S32, Src0: InputReg, Src1: Y).getReg(Idx: `0`) : Y;
905	}
906
907	if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
908	NeedWorkItemIDZ && ST.getMaxWorkitemID(MF.getFunction(), `2`) != `0`) {
909	Register Z = MRI.createGenericVirtualRegister(Ty: S32);
910	LI->loadInputValue(DstReg: Z, B&: MIRBuilder, Arg: IncomingArgZ, ArgRC: std::get<`1`>(t&: WorkitemIDZ),
911	ArgTy: std::get<`2`>(t&: WorkitemIDZ));
912
913	Z = MIRBuilder.buildShl(Dst: S32, Src0: Z, Src1: MIRBuilder.buildConstant(Res: S32, Val: `20`)).getReg(Idx: `0`);
914	InputReg = InputReg ? MIRBuilder.buildOr(Dst: S32, Src0: InputReg, Src1: Z).getReg(Idx: `0`) : Z;
915	}
916
917	if (!InputReg &&
918	(NeedWorkItemIDX \|\| NeedWorkItemIDY \|\| NeedWorkItemIDZ)) {
919	InputReg = MRI.createGenericVirtualRegister(Ty: S32);
920	if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
921	// We're in a situation where the outgoing function requires the workitem
922	// ID, but the calling function does not have it (e.g a graphics function
923	// calling a C calling convention function). This is illegal, but we need
924	// to produce something.
925	MIRBuilder.buildUndef(Res: InputReg);
926	} else {
927	// Workitem ids are already packed, any of present incoming arguments will
928	// carry all required fields.
929	ArgDescriptor IncomingArg = ArgDescriptor::createArg(
930	Arg: IncomingArgX ? *IncomingArgX :
931	IncomingArgY ? IncomingArgY : IncomingArgZ, Mask: ~`0u`);
932	LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
933	&AMDGPU::VGPR_32RegClass, S32);
934	}
935	}
936
937	if (OutgoingArg->isRegister()) {
938	if (InputReg)
939	ArgRegs.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
940
941	if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
942	report_fatal_error(reason: "failed to allocate implicit input argument");
943	} else {
944	LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
945	return false;
946	}
947
948	return true;
949	}
950
951	/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
952	/// CC.
953	static std::pair<CCAssignFn , CCAssignFn >
954	getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
955	return {TLI.CCAssignFnForCall(CC, IsVarArg: false), TLI.CCAssignFnForCall(CC, IsVarArg: true)};
956	}
957
958	static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
959	bool IsTailCall, bool isWave32,
960	CallingConv::ID CC) {
961	// For calls to amdgpu_cs_chain functions, the address is known to be uniform.
962	assert((AMDGPU::isChainCC(CC) \|\| !IsIndirect \|\| !IsTailCall) &&
963	"Indirect calls can't be tail calls, "
964	"because the address can be divergent");
965	if (!IsTailCall)
966	return AMDGPU::G_SI_CALL;
967
968	if (AMDGPU::isChainCC(CC))
969	return isWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
970
971	return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
972	AMDGPU::SI_TCRETURN;
973	}
974
975	// Add operands to call instruction to track the callee.
976	static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
977	MachineIRBuilder &MIRBuilder,
978	AMDGPUCallLowering::CallLoweringInfo &Info) {
979	if (Info.Callee.isReg()) {
980	CallInst.addReg(RegNo: Info.Callee.getReg());
981	CallInst.addImm(Val: `0`);
982	} else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == `0`) {
983	// The call lowering lightly assumed we can directly encode a call target in
984	// the instruction, which is not the case. Materialize the address here.
985	const GlobalValue *GV = Info.Callee.getGlobal();
986	auto Ptr = MIRBuilder.buildGlobalValue(
987	Res: LLT::pointer(AddressSpace: GV->getAddressSpace(), SizeInBits: `64`), GV);
988	CallInst.addReg(RegNo: Ptr.getReg(Idx: `0`));
989	CallInst.add(MO: Info.Callee);
990	} else
991	return false;
992
993	return true;
994	}
995
996	bool AMDGPUCallLowering::doCallerAndCalleePassArgsTheSameWay(
997	CallLoweringInfo &Info, MachineFunction &MF,
998	SmallVectorImpl<ArgInfo> &InArgs) const {
999	const Function &CallerF = MF.getFunction();
1000	CallingConv::ID CalleeCC = Info.CallConv;
1001	CallingConv::ID CallerCC = CallerF.getCallingConv();
1002
1003	// If the calling conventions match, then everything must be the same.
1004	if (CalleeCC == CallerCC)
1005	return true;
1006
1007	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1008
1009	// Make sure that the caller and callee preserve all of the same registers.
1010	auto TRI = ST.getRegisterInfo();
1011
1012	const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1013	const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
1014	if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
1015	return false;
1016
1017	// Check if the caller and callee will handle arguments in the same way.
1018	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1019	CCAssignFn *CalleeAssignFnFixed;
1020	CCAssignFn *CalleeAssignFnVarArg;
1021	std::tie(args&: CalleeAssignFnFixed, args&: CalleeAssignFnVarArg) =
1022	getAssignFnsForCC(CC: CalleeCC, TLI);
1023
1024	CCAssignFn *CallerAssignFnFixed;
1025	CCAssignFn *CallerAssignFnVarArg;
1026	std::tie(args&: CallerAssignFnFixed, args&: CallerAssignFnVarArg) =
1027	getAssignFnsForCC(CC: CallerCC, TLI);
1028
1029	// FIXME: We are not accounting for potential differences in implicitly passed
1030	// inputs, but only the fixed ABI is supported now anyway.
1031	IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
1032	CalleeAssignFnVarArg);
1033	IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
1034	CallerAssignFnVarArg);
1035	return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner);
1036	}
1037
1038	bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable(
1039	CallLoweringInfo &Info, MachineFunction &MF,
1040	SmallVectorImpl<ArgInfo> &OutArgs) const {
1041	// If there are no outgoing arguments, then we are done.
1042	if (OutArgs.empty())
1043	return true;
1044
1045	const Function &CallerF = MF.getFunction();
1046	CallingConv::ID CalleeCC = Info.CallConv;
1047	CallingConv::ID CallerCC = CallerF.getCallingConv();
1048	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1049
1050	CCAssignFn *AssignFnFixed;
1051	CCAssignFn *AssignFnVarArg;
1052	std::tie(args&: AssignFnFixed, args&: AssignFnVarArg) = getAssignFnsForCC(CC: CalleeCC, TLI);
1053
1054	// We have outgoing arguments. Make sure that we can tail call with them.
1055	SmallVector<CCValAssign, `16`> OutLocs;
1056	CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
1057	OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1058
1059	if (!determineAssignments(Assigner, Args&: OutArgs, CCInfo&: OutInfo)) {
1060	LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
1061	return false;
1062	}
1063
1064	// Make sure that they can fit on the caller's stack.
1065	const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1066	if (OutInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) {
1067	LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
1068	return false;
1069	}
1070
1071	// Verify that the parameters in callee-saved registers match.
1072	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1073	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1074	const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
1075	MachineRegisterInfo &MRI = MF.getRegInfo();
1076	return parametersInCSRMatch(MRI, CallerPreservedMask, ArgLocs: OutLocs, OutVals: OutArgs);
1077	}
1078
1079	/// Return true if the calling convention is one that we can guarantee TCO for.
1080	static bool canGuaranteeTCO(CallingConv::ID CC) {
1081	return CC == CallingConv::Fast;
1082	}
1083
1084	/// Return true if we might ever do TCO for calls with this calling convention.
1085	static bool mayTailCallThisCC(CallingConv::ID CC) {
1086	switch (CC) {
1087	case CallingConv::C:
1088	case CallingConv::AMDGPU_Gfx:
1089	return true;
1090	default:
1091	return canGuaranteeTCO(CC);
1092	}
1093	}
1094
1095	bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
1096	MachineIRBuilder &B, CallLoweringInfo &Info,
1097	SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const {
1098	// Must pass all target-independent checks in order to tail call optimize.
1099	if (!Info.IsTailCall)
1100	return false;
1101
1102	// Indirect calls can't be tail calls, because the address can be divergent.
1103	// TODO Check divergence info if the call really is divergent.
1104	if (Info.Callee.isReg())
1105	return false;
1106
1107	MachineFunction &MF = B.getMF();
1108	const Function &CallerF = MF.getFunction();
1109	CallingConv::ID CalleeCC = Info.CallConv;
1110	CallingConv::ID CallerCC = CallerF.getCallingConv();
1111
1112	const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1113	const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1114	// Kernels aren't callable, and don't have a live in return address so it
1115	// doesn't make sense to do a tail call with entry functions.
1116	if (!CallerPreserved)
1117	return false;
1118
1119	if (!mayTailCallThisCC(CC: CalleeCC)) {
1120	LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
1121	return false;
1122	}
1123
1124	if (any_of(Range: CallerF.args(), P: [](const Argument &A) {
1125	return A.hasByValAttr() \|\| A.hasSwiftErrorAttr();
1126	})) {
1127	LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
1128	"or swifterror arguments\n");
1129	return false;
1130	}
1131
1132	// If we have -tailcallopt, then we're done.
1133	if (MF.getTarget().Options.GuaranteedTailCallOpt)
1134	return canGuaranteeTCO(CC: CalleeCC) && CalleeCC == CallerF.getCallingConv();
1135
1136	// Verify that the incoming and outgoing arguments from the callee are
1137	// safe to tail call.
1138	if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
1139	LLVM_DEBUG(
1140	dbgs()
1141	<< "... Caller and callee have incompatible calling conventions.\n");
1142	return false;
1143	}
1144
1145	if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
1146	return false;
1147
1148	LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n");
1149	return true;
1150	}
1151
1152	// Insert outgoing implicit arguments for a call, by inserting copies to the
1153	// implicit argument registers and adding the necessary implicit uses to the
1154	// call instruction.
1155	void AMDGPUCallLowering::handleImplicitCallArguments(
1156	MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
1157	const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
1158	CallingConv::ID CalleeCC,
1159	ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
1160	if (!ST.enableFlatScratch()) {
1161	// Insert copies for the SRD. In the HSA case, this should be an identity
1162	// copy.
1163	auto ScratchRSrcReg = MIRBuilder.buildCopy(Res: LLT::fixed_vector(NumElements: `4`, ScalarSizeInBits: `32`),
1164	Op: FuncInfo.getScratchRSrcReg());
1165
1166	auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC)
1167	? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
1168	: AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
1169
1170	MIRBuilder.buildCopy(Res: CalleeRSrcReg, Op: ScratchRSrcReg);
1171	CallInst.addReg(RegNo: CalleeRSrcReg, flags: RegState::Implicit);
1172	}
1173
1174	for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
1175	MIRBuilder.buildCopy(Res: (Register)ArgReg.first, Op: ArgReg.second);
1176	CallInst.addReg(RegNo: ArgReg.first, flags: RegState::Implicit);
1177	}
1178	}
1179
1180	bool AMDGPUCallLowering::lowerTailCall(
1181	MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
1182	SmallVectorImpl<ArgInfo> &OutArgs) const {
1183	MachineFunction &MF = MIRBuilder.getMF();
1184	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1185	SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1186	const Function &F = MF.getFunction();
1187	MachineRegisterInfo &MRI = MF.getRegInfo();
1188	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1189
1190	// True when we're tail calling, but without -tailcallopt.
1191	bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
1192
1193	// Find out which ABI gets to decide where things go.
1194	CallingConv::ID CalleeCC = Info.CallConv;
1195	CCAssignFn *AssignFnFixed;
1196	CCAssignFn *AssignFnVarArg;
1197	std::tie(args&: AssignFnFixed, args&: AssignFnVarArg) = getAssignFnsForCC(CC: CalleeCC, TLI);
1198
1199	MachineInstrBuilder CallSeqStart;
1200	if (!IsSibCall)
1201	CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
1202
1203	unsigned Opc =
1204	getCallOpcode(CallerF: MF, IsIndirect: Info.Callee.isReg(), IsTailCall: true, isWave32: ST.isWave32(), CC: CalleeCC);
1205	auto MIB = MIRBuilder.buildInstrNoInsert(Opcode: Opc);
1206	if (!addCallTargetOperands(CallInst&: MIB, MIRBuilder, Info))
1207	return false;
1208
1209	// Byte offset for the tail call. When we are sibcalling, this will always
1210	// be 0.
1211	MIB.addImm(Val: `0`);
1212
1213	// If this is a chain call, we need to pass in the EXEC mask.
1214	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1215	if (AMDGPU::isChainCC(CC: Info.CallConv)) {
1216	ArgInfo ExecArg = Info.OrigArgs [`1`];
1217	assert(ExecArg.Regs.size() == `1` && "Too many regs for EXEC");
1218
1219	if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize()))
1220	return false;
1221
1222	if (auto CI = dyn_cast<ConstantInt>(Val: ExecArg.OrigValue)) {
1223	MIB.addImm(Val: CI->getSExtValue());
1224	} else {
1225	MIB.addReg(RegNo: ExecArg.Regs [`0`]);
1226	unsigned Idx = MIB ->getNumOperands() - `1`;
1227	MIB ->getOperand(i: Idx).setReg(constrainOperandRegClass(
1228	MF, TRI, MRI, ST.getInstrInfo(), ST.getRegBankInfo(), MIB,
1229	MIB ->getDesc(), MIB ->getOperand(i: Idx), Idx));
1230	}
1231	}
1232
1233	// Tell the call which registers are clobbered.
1234	const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
1235	MIB.addRegMask(Mask);
1236
1237	// FPDiff is the byte offset of the call's argument area from the callee's.
1238	// Stores to callee stack arguments will be placed in FixedStackSlots offset
1239	// by this amount for a tail call. In a sibling call it must be 0 because the
1240	// caller will deallocate the entire stack and the callee still expects its
1241	// arguments to begin at SP+0.
1242	int FPDiff = `0`;
1243
1244	// This will be 0 for sibcalls, potentially nonzero for tail calls produced
1245	// by -tailcallopt. For sibcalls, the memory operands for the call are
1246	// already available in the caller's incoming argument space.
1247	unsigned NumBytes = `0`;
1248	if (!IsSibCall) {
1249	// We aren't sibcalling, so we need to compute FPDiff. We need to do this
1250	// before handling assignments, because FPDiff must be known for memory
1251	// arguments.
1252	unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
1253	SmallVector<CCValAssign, `16`> OutLocs;
1254	CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
1255
1256	// FIXME: Not accounting for callee implicit inputs
1257	OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg);
1258	if (!determineAssignments(Assigner&: CalleeAssigner, Args&: OutArgs, CCInfo&: OutInfo))
1259	return false;
1260
1261	// The callee will pop the argument stack as a tail call. Thus, we must
1262	// keep it 16-byte aligned.
1263	NumBytes = alignTo(Size: OutInfo.getStackSize(), A: ST.getStackAlignment());
1264
1265	// FPDiff will be negative if this tail call requires more space than we
1266	// would automatically have in our incoming argument space. Positive if we
1267	// actually shrink the stack.
1268	FPDiff = NumReusableBytes - NumBytes;
1269
1270	// The stack pointer must be 16-byte aligned at all times it's used for a
1271	// memory operation, which in practice means at all* times and in*
1272	// particular across call boundaries. Therefore our own arguments started at
1273	// a 16-byte aligned SP and the delta applied for the tail call should
1274	// satisfy the same constraint.
1275	assert(isAligned(ST.getStackAlignment(), FPDiff) &&
1276	"unaligned stack on tail call");
1277	}
1278
1279	SmallVector<CCValAssign, `16`> ArgLocs;
1280	CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1281
1282	// We could pass MIB and directly add the implicit uses to the call
1283	// now. However, as an aesthetic choice, place implicit argument operands
1284	// after the ordinary user argument registers.
1285	SmallVector<std::pair<MCRegister, Register>, `12`> ImplicitArgRegs;
1286
1287	if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
1288	!AMDGPU::isChainCC(CC: Info.CallConv)) {
1289	// With a fixed ABI, allocate fixed registers before user arguments.
1290	if (!passSpecialInputs(MIRBuilder, CCInfo, ArgRegs&: ImplicitArgRegs, Info))
1291	return false;
1292	}
1293
1294	OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1295
1296	if (!determineAssignments(Assigner, Args&: OutArgs, CCInfo))
1297	return false;
1298
1299	// Do the actual argument marshalling.
1300	AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff);
1301	if (!handleAssignments(Handler, Args&: OutArgs, CCState&: CCInfo, ArgLocs, MIRBuilder))
1302	return false;
1303
1304	if (Info.ConvergenceCtrlToken) {
1305	MIB.addUse(RegNo: Info.ConvergenceCtrlToken, Flags: RegState::Implicit);
1306	}
1307	handleImplicitCallArguments(MIRBuilder, CallInst&: MIB, ST, FuncInfo: *FuncInfo, CalleeCC,
1308	ImplicitArgRegs);
1309
1310	// If we have -tailcallopt, we need to adjust the stack. We'll do the call
1311	// sequence start and end here.
1312	if (!IsSibCall) {
1313	MIB ->getOperand(i: `1`).setImm(FPDiff);
1314	CallSeqStart.addImm(Val: NumBytes).addImm(Val: `0`);
1315	// End the call sequence before* emitting the call. Normally, we would*
1316	// tidy the frame up after the call. However, here, we've laid out the
1317	// parameters so that when SP is reset, they will be in the correct
1318	// location.
1319	MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(`0`);
1320	}
1321
1322	// Now we can add the actual call instruction to the correct basic block.
1323	MIRBuilder.insertInstr(MIB);
1324
1325	// If Callee is a reg, since it is used by a target specific
1326	// instruction, it must have a register class matching the
1327	// constraint of that instruction.
1328
1329	// FIXME: We should define regbankselectable call instructions to handle
1330	// divergent call targets.
1331	if (MIB ->getOperand(i: `0`).isReg()) {
1332	MIB ->getOperand(i: `0`).setReg(constrainOperandRegClass(
1333	MF, TRI, MRI, ST.getInstrInfo(), ST.getRegBankInfo(), MIB,
1334	MIB ->getDesc(), MIB ->getOperand(i: `0`), `0`));
1335	}
1336
1337	MF.getFrameInfo().setHasTailCall();
1338	Info.LoweredTailCall = true;
1339	return true;
1340	}
1341
1342	/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
1343	bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
1344	CallLoweringInfo &Info) const {
1345	ArgInfo Callee = Info.OrigArgs [`0`];
1346	ArgInfo SGPRArgs = Info.OrigArgs [`2`];
1347	ArgInfo VGPRArgs = Info.OrigArgs [`3`];
1348	ArgInfo Flags = Info.OrigArgs [`4`];
1349
1350	assert(cast<ConstantInt>(Flags.OrigValue)->isZero() &&
1351	"Non-zero flags aren't supported yet.");
1352	assert(Info.OrigArgs.size() == `5` && "Additional args aren't supported yet.");
1353
1354	MachineFunction &MF = MIRBuilder.getMF();
1355	const Function &F = MF.getFunction();
1356	const DataLayout &DL = F.getParent()->getDataLayout();
1357
1358	// The function to jump to is actually the first argument, so we'll change the
1359	// Callee and other info to match that before using our existing helper.
1360	const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
1361	if (const Function *F = dyn_cast<Function>(Val: CalleeV)) {
1362	Info.Callee = MachineOperand::CreateGA(GV: F, Offset: `0`);
1363	Info.CallConv = F->getCallingConv();
1364	} else {
1365	assert(Callee.Regs.size() == `1` && "Too many regs for the callee");
1366	Info.Callee = MachineOperand::CreateReg(Reg: Callee.Regs [`0`], isDef: false);
1367	Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
1368	// behaves the same here.
1369	}
1370
1371	// The function that we're calling cannot be vararg (only the intrinsic is).
1372	Info.IsVarArg = false;
1373
1374	assert(std::all_of(SGPRArgs.Flags.begin(), SGPRArgs.Flags.end(),
1375	[](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
1376	"SGPR arguments should be marked inreg");
1377	assert(std::none_of(VGPRArgs.Flags.begin(), VGPRArgs.Flags.end(),
1378	[](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
1379	"VGPR arguments should not be marked inreg");
1380
1381	SmallVector<ArgInfo, `8`> OutArgs;
1382	splitToValueTypes(OrigArgInfo: SGPRArgs, SplitArgs&: OutArgs, DL, CallConv: Info.CallConv);
1383	splitToValueTypes(OrigArgInfo: VGPRArgs, SplitArgs&: OutArgs, DL, CallConv: Info.CallConv);
1384
1385	Info.IsMustTailCall = true;
1386	return lowerTailCall(MIRBuilder, Info, OutArgs);
1387	}
1388
1389	bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
1390	CallLoweringInfo &Info) const {
1391	if (Function *F = Info.CB->getCalledFunction())
1392	if (F->isIntrinsic()) {
1393	assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
1394	"Unexpected intrinsic");
1395	return lowerChainCall(MIRBuilder, Info);
1396	}
1397
1398	if (Info.IsVarArg) {
1399	LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
1400	return false;
1401	}
1402
1403	MachineFunction &MF = MIRBuilder.getMF();
1404	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1405	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1406
1407	const Function &F = MF.getFunction();
1408	MachineRegisterInfo &MRI = MF.getRegInfo();
1409	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1410	const DataLayout &DL = F.getParent()->getDataLayout();
1411
1412	SmallVector<ArgInfo, `8`> OutArgs;
1413	for (auto &OrigArg : Info.OrigArgs)
1414	splitToValueTypes(OrigArgInfo: OrigArg, SplitArgs&: OutArgs, DL, CallConv: Info.CallConv);
1415
1416	SmallVector<ArgInfo, `8`> InArgs;
1417	if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
1418	splitToValueTypes(OrigArgInfo: Info.OrigRet, SplitArgs&: InArgs, DL, CallConv: Info.CallConv);
1419
1420	// If we can lower as a tail call, do that instead.
1421	bool CanTailCallOpt =
1422	isEligibleForTailCallOptimization(B&: MIRBuilder, Info, InArgs, OutArgs);
1423
1424	// We must emit a tail call if we have musttail.
1425	if (Info.IsMustTailCall && !CanTailCallOpt) {
1426	LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
1427	return false;
1428	}
1429
1430	Info.IsTailCall = CanTailCallOpt;
1431	if (CanTailCallOpt)
1432	return lowerTailCall(MIRBuilder, Info, OutArgs);
1433
1434	// Find out which ABI gets to decide where things go.
1435	CCAssignFn *AssignFnFixed;
1436	CCAssignFn *AssignFnVarArg;
1437	std::tie(args&: AssignFnFixed, args&: AssignFnVarArg) =
1438	getAssignFnsForCC(CC: Info.CallConv, TLI);
1439
1440	MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
1441	.addImm(`0`)
1442	.addImm(`0`);
1443
1444	// Create a temporarily-floating call instruction so we can add the implicit
1445	// uses of arg registers.
1446	unsigned Opc = getCallOpcode(CallerF: MF, IsIndirect: Info.Callee.isReg(), IsTailCall: false, isWave32: ST.isWave32(),
1447	CC: Info.CallConv);
1448
1449	auto MIB = MIRBuilder.buildInstrNoInsert(Opcode: Opc);
1450	MIB.addDef(RegNo: TRI->getReturnAddressReg(MF));
1451
1452	if (!Info.IsConvergent)
1453	MIB.setMIFlag(MachineInstr::NoConvergent);
1454
1455	if (!addCallTargetOperands(CallInst&: MIB, MIRBuilder, Info))
1456	return false;
1457
1458	// Tell the call which registers are clobbered.
1459	const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
1460	MIB.addRegMask(Mask);
1461
1462	SmallVector<CCValAssign, `16`> ArgLocs;
1463	CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1464
1465	// We could pass MIB and directly add the implicit uses to the call
1466	// now. However, as an aesthetic choice, place implicit argument operands
1467	// after the ordinary user argument registers.
1468	SmallVector<std::pair<MCRegister, Register>, `12`> ImplicitArgRegs;
1469
1470	if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
1471	// With a fixed ABI, allocate fixed registers before user arguments.
1472	if (!passSpecialInputs(MIRBuilder, CCInfo, ArgRegs&: ImplicitArgRegs, Info))
1473	return false;
1474	}
1475
1476	// Do the actual argument marshalling.
1477	SmallVector<Register, `8`> PhysRegs;
1478
1479	OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1480	if (!determineAssignments(Assigner, Args&: OutArgs, CCInfo))
1481	return false;
1482
1483	AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
1484	if (!handleAssignments(Handler, Args&: OutArgs, CCState&: CCInfo, ArgLocs, MIRBuilder))
1485	return false;
1486
1487	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1488
1489	if (Info.ConvergenceCtrlToken) {
1490	MIB.addUse(RegNo: Info.ConvergenceCtrlToken, Flags: RegState::Implicit);
1491	}
1492	handleImplicitCallArguments(MIRBuilder, CallInst&: MIB, ST, FuncInfo: *MFI, CalleeCC: Info.CallConv,
1493	ImplicitArgRegs);
1494
1495	// Get a count of how many bytes are to be pushed on the stack.
1496	unsigned NumBytes = CCInfo.getStackSize();
1497
1498	// If Callee is a reg, since it is used by a target specific
1499	// instruction, it must have a register class matching the
1500	// constraint of that instruction.
1501
1502	// FIXME: We should define regbankselectable call instructions to handle
1503	// divergent call targets.
1504	if (MIB ->getOperand(i: `1`).isReg()) {
1505	MIB ->getOperand(i: `1`).setReg(constrainOperandRegClass(
1506	MF, TRI, MRI, ST.getInstrInfo(),
1507	ST.getRegBankInfo(), MIB, MIB ->getDesc(), MIB ->getOperand(i: `1`),
1508	`1`));
1509	}
1510
1511	// Now we can add the actual call instruction to the correct position.
1512	MIRBuilder.insertInstr(MIB);
1513
1514	// Finally we can copy the returned value back into its virtual-register. In
1515	// symmetry with the arguments, the physical register must be an
1516	// implicit-define of the call instruction.
1517	if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
1518	CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(CC: Info.CallConv,
1519	IsVarArg: Info.IsVarArg);
1520	IncomingValueAssigner Assigner(RetAssignFn);
1521	CallReturnHandler Handler(MIRBuilder, MRI, MIB);
1522	if (!determineAndHandleAssignments(Handler, Assigner, Args&: InArgs, MIRBuilder,
1523	CallConv: Info.CallConv, IsVarArg: Info.IsVarArg))
1524	return false;
1525	}
1526
1527	uint64_t CalleePopBytes = NumBytes;
1528
1529	MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
1530	.addImm(`0`)
1531	.addImm(CalleePopBytes);
1532
1533	if (!Info.CanLowerReturn) {
1534	insertSRetLoads(MIRBuilder, RetTy: Info.OrigRet.Ty, VRegs: Info.OrigRet.Regs,
1535	DemoteReg: Info.DemoteRegister, FI: Info.DemoteStackIndex);
1536	}
1537
1538	return true;
1539	}
1540

source code of llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp