//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the X86 implementation of the TargetFrameLowering class.
//
//===----------------------------------------------------------------------===//

#include "X86FrameLowering.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/EHPersonalities.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Target/TargetOptions.h"
#include <cstdlib>

#define DEBUG_TYPE "x86-fl"

STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue");
STATISTIC(NumFrameExtraProbe,
          "Number of extra stack probes generated in prologue");
STATISTIC(NumFunctionUsingPush2Pop2, "Number of functions using push2/pop2");

using namespace llvm;

X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
                                   MaybeAlign StackAlignOverride)
    : TargetFrameLowering(StackGrowsDown, StackAlignOverride.valueOrOne(),
                          STI.is64Bit() ? -8 : -4),
      STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {
  // Cache a bunch of frame-related predicates for this subtarget.
  SlotSize = TRI->getSlotSize();
  Is64Bit = STI.is64Bit();
  IsLP64 = STI.isTarget64BitLP64();
  // Standard x86-64 and NaCl use 64-bit frame/stack pointers; x32 uses 32-bit.
  Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
  StackPtr = TRI->getStackRegister();
}

bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
  return !MF.getFrameInfo().hasVarSizedObjects() &&
         !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences() &&
         !MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall();
}

/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
/// call frame pseudos can be simplified. Having a FP, as in the default
/// implementation, is not sufficient here since we can't always use it.
/// Use a more nuanced condition.
bool X86FrameLowering::canSimplifyCallFramePseudos(
    const MachineFunction &MF) const {
  return hasReservedCallFrame(MF) ||
         MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
         (hasFP(MF) && !TRI->hasStackRealignment(MF)) ||
         TRI->hasBasePointer(MF);
}

// needsFrameIndexResolution - Do we need to perform FI resolution for
// this function? Normally, this is required only when the function
// has any stack objects. However, FI resolution actually has another job,
// not apparent from the title: it resolves the callframesetup/destroy
// pseudos that were not simplified earlier.
// So, this is required for x86 functions that have push sequences even
// when there are no stack objects.
bool X86FrameLowering::needsFrameIndexResolution(
    const MachineFunction &MF) const {
  return MF.getFrameInfo().hasStackObjects() ||
         MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
}

/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register. This is true if the function has variable sized allocas
/// or if frame pointer elimination is disabled.
bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
          TRI->hasStackRealignment(MF) || MFI.hasVarSizedObjects() ||
          MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() ||
          MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
          MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
          MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() ||
          MFI.hasStackMap() || MFI.hasPatchPoint() ||
          (isWin64Prologue(MF) && MFI.hasCopyImplyingStackAdjustment()));
}

static unsigned getSUBriOpcode(bool IsLP64) {
  return IsLP64 ? X86::SUB64ri32 : X86::SUB32ri;
}

static unsigned getADDriOpcode(bool IsLP64) {
  return IsLP64 ? X86::ADD64ri32 : X86::ADD32ri;
}

static unsigned getSUBrrOpcode(bool IsLP64) {
  return IsLP64 ? X86::SUB64rr : X86::SUB32rr;
}

static unsigned getADDrrOpcode(bool IsLP64) {
  return IsLP64 ? X86::ADD64rr : X86::ADD32rr;
}

static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
  return IsLP64 ? X86::AND64ri32 : X86::AND32ri;
}

static unsigned getLEArOpcode(bool IsLP64) {
  return IsLP64 ? X86::LEA64r : X86::LEA32r;
}

static unsigned getMOVriOpcode(bool Use64BitReg, int64_t Imm) {
  if (Use64BitReg) {
    if (isUInt<32>(Imm))
      return X86::MOV32ri64;
    if (isInt<32>(Imm))
      return X86::MOV64ri32;
    return X86::MOV64ri;
  }
  return X86::MOV32ri;
}
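// For example (illustrative, not exhaustive): getMOVriOpcode(true, 0x7fffffff)
// selects X86::MOV32ri64 (a zero-extending 32-bit move), getMOVriOpcode(true,
// -1) selects the sign-extending X86::MOV64ri32, and getMOVriOpcode(true,
// 0x123456789) falls back to the full X86::MOV64ri (movabs).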

// Push-Pop Acceleration (PPX) hint is used to indicate that the POP reads the
// value written by the PUSH from the stack. The processor tracks these marked
// instructions internally and fast-forwards register data between matching
// PUSH and POP instructions, without going through memory or through the
// training loop of the Fast Store Forwarding Predictor (FSFP). Instead, a
// more efficient memory-renaming optimization can be used.
//
// The PPX hint is purely a performance hint. Instructions with this hint have
// the same functional semantics as those without. PPX hints set by the
// compiler that violate the balancing rule may turn off the PPX optimization,
// but they will not affect program semantics.
//
// Hence, PPX is used for balanced spill/reloads (exceptions and
// setjmp/longjmp are not considered).
//
// PUSH2 and POP2 are instructions for (respectively) pushing/popping 2
// GPRs at a time to/from the stack.
static unsigned getPUSHOpcode(const X86Subtarget &ST) {
  return ST.is64Bit() ? (ST.hasPPX() ? X86::PUSHP64r : X86::PUSH64r)
                      : X86::PUSH32r;
}
static unsigned getPOPOpcode(const X86Subtarget &ST) {
  return ST.is64Bit() ? (ST.hasPPX() ? X86::POPP64r : X86::POP64r)
                      : X86::POP32r;
}
static unsigned getPUSH2Opcode(const X86Subtarget &ST) {
  return ST.hasPPX() ? X86::PUSH2P : X86::PUSH2;
}
static unsigned getPOP2Opcode(const X86Subtarget &ST) {
  return ST.hasPPX() ? X86::POP2P : X86::POP2;
}
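// As a rough sketch of the prologue/epilogue shape this enables (illustrative
// only, assuming %r15 and %r14 are spilled as a pair and PPX is available):
//
//   pushp  %rbp               ; PPX-hinted push
//   push2p %r15, %r14         ; PPX-hinted paired push
//   ...
//   pop2p  %r14, %r15         ; pops must mirror the pushes exactly
//   popp   %rbp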

static bool isEAXLiveIn(MachineBasicBlock &MBB) {
  for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {
    unsigned Reg = RegMask.PhysReg;

    if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||
        Reg == X86::AH || Reg == X86::AL)
      return true;
  }

  return false;
}

/// Check if the flags need to be preserved before the terminators.
/// This would be the case if EFLAGS is live-in of the region composed
/// by the terminators, or live-out of that region without being defined
/// by a terminator.
static bool
flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
  for (const MachineInstr &MI : MBB.terminators()) {
    bool BreakNext = false;
    for (const MachineOperand &MO : MI.operands()) {
      if (!MO.isReg())
        continue;
      Register Reg = MO.getReg();
      if (Reg != X86::EFLAGS)
        continue;

      // This terminator needs an eflags that is not defined
      // by a previous terminator:
      // EFLAGS is live-in of the region composed by the terminators.
      if (!MO.isDef())
        return true;
      // This terminator defines the eflags, i.e., we don't need to preserve
      // it. However, we still need to check this specific terminator does not
      // read a live-in value.
      BreakNext = true;
    }
    // We found a definition of the eflags, no need to preserve them.
    if (BreakNext)
      return false;
  }

  // None of the terminators use or define the eflags.
  // Check if they are live-out, which would imply we need to preserve them.
  for (const MachineBasicBlock *Succ : MBB.successors())
    if (Succ->isLiveIn(X86::EFLAGS))
      return true;

  return false;
}
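// Illustrative MIR sketch (not from a real function): in a terminator
// sequence such as
//   CMP64rr $rax, $rbx, implicit-def $eflags
//   JCC_1 %bb.1, 4, implicit $eflags
// the CMP defines EFLAGS before the JCC reads it, so nothing needs to be
// preserved. If the block ended with only the JCC, EFLAGS would be live-in
// to the terminator region and this function would return true.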

/// emitSPUpdate - Emit a series of instructions to increment / decrement the
/// stack pointer by a constant value.
void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator &MBBI,
                                    const DebugLoc &DL, int64_t NumBytes,
                                    bool InEpilogue) const {
  bool isSub = NumBytes < 0;
  uint64_t Offset = isSub ? -NumBytes : NumBytes;
  MachineInstr::MIFlag Flag =
      isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;

  uint64_t Chunk = (1LL << 31) - 1;

  MachineFunction &MF = *MBB.getParent();
  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
  const X86TargetLowering &TLI = *STI.getTargetLowering();
  const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);

  // It's ok to not take into account large chunks when probing, as the
  // allocation is split in smaller chunks anyway.
  if (EmitInlineStackProbe && !InEpilogue) {

    // This pseudo-instruction is going to be expanded, potentially using a
    // loop, by inlineStackProbe().
    BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING)).addImm(Offset);
    return;
  } else if (Offset > Chunk) {
    // Rather than emit a long series of instructions for large offsets,
    // load the offset into a register and do one sub/add.
    unsigned Reg = 0;
    unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);

    if (isSub && !isEAXLiveIn(MBB))
      Reg = Rax;
    else
      Reg = TRI->findDeadCallerSavedReg(MBB, MBBI);

    unsigned AddSubRROpc =
        isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);
    if (Reg) {
      BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Reg)
          .addImm(Offset)
          .setMIFlag(Flag);
      MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)
                             .addReg(StackPtr)
                             .addReg(Reg);
      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
      return;
    } else if (Offset > 8 * Chunk) {
      // If we would need more than 8 add or sub instructions (a >16GB stack
      // frame), it's worth spilling RAX to materialize this immediate.
      //   pushq %rax
      //   movabsq +-$Offset+-SlotSize, %rax
      //   addq %rsp, %rax
      //   xchg %rax, (%rsp)
      //   movq (%rsp), %rsp
      assert(Is64Bit && "can't have 32-bit 16GB stack frame");
      BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
          .addReg(Rax, RegState::Kill)
          .setMIFlag(Flag);
      // Subtract is not commutative, so negate the offset and always use add.
      // Subtract 8 less and add 8 more to account for the PUSH we just did.
      if (isSub)
        Offset = -(Offset - SlotSize);
      else
        Offset = Offset + SlotSize;
      BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Rax)
          .addImm(Offset)
          .setMIFlag(Flag);
      MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)
                             .addReg(Rax)
                             .addReg(StackPtr);
      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
      // Exchange the new SP in RAX with the top of the stack.
      addRegOffset(
          BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax),
          StackPtr, false, 0);
      // Load new SP from the top of the stack into RSP.
      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr),
                   StackPtr, false, 0);
      return;
    }
  }

  while (Offset) {
    uint64_t ThisVal = std::min(Offset, Chunk);
    if (ThisVal == SlotSize) {
      // Use push / pop for slot sized adjustments as a size optimization. We
      // need to find a dead register when using pop.
      unsigned Reg = isSub ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
                           : TRI->findDeadCallerSavedReg(MBB, MBBI);
      if (Reg) {
        unsigned Opc = isSub ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
                             : (Is64Bit ? X86::POP64r : X86::POP32r);
        BuildMI(MBB, MBBI, DL, TII.get(Opc))
            .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub))
            .setMIFlag(Flag);
        Offset -= ThisVal;
        continue;
      }
    }

    BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue)
        .setMIFlag(Flag);

    Offset -= ThisVal;
  }
}
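// As a rough sketch of emitSPUpdate's output on x86-64 (illustrative, not
// exact): NumBytes == -40 becomes a single "subq $40, %rsp"; an adjustment of
// one slot (-8) prefers the shorter "pushq %rax"; and offsets beyond 2^31 - 1
// first materialize the amount in a scratch register and apply it with a
// single add/sub.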

MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    const DebugLoc &DL, int64_t Offset, bool InEpilogue) const {
  assert(Offset != 0 && "zero offset stack adjustment requested");

  // On Atom, using LEA to adjust SP is preferred, but using it in the epilogue
  // is tricky.
  bool UseLEA;
  if (!InEpilogue) {
    // Check if inserting the prologue at the beginning
    // of MBB would require using LEA operations.
    // We need to use LEA operations if EFLAGS is live in, because
    // it means an instruction will read it before it gets defined.
    UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS);
  } else {
    // If we can use LEA for SP but we shouldn't, check that none
    // of the terminators uses the eflags. Otherwise we will insert
    // an ADD that will redefine the eflags and break the condition.
    // Alternatively, we could move the ADD, but this may not be possible
    // and is an optimization anyway.
    UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent());
    if (UseLEA && !STI.useLeaForSP())
      UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB);
    // If that assert breaks, that means we do not do the right thing
    // in canUseAsEpilogue.
    assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) &&
           "We shouldn't have allowed this insertion point");
  }

  MachineInstrBuilder MI;
  if (UseLEA) {
    MI = addRegOffset(BuildMI(MBB, MBBI, DL,
                              TII.get(getLEArOpcode(Uses64BitFramePtr)),
                              StackPtr),
                      StackPtr, false, Offset);
  } else {
    bool IsSub = Offset < 0;
    uint64_t AbsOffset = IsSub ? -Offset : Offset;
    const unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr)
                               : getADDriOpcode(Uses64BitFramePtr);
    MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
             .addReg(StackPtr)
             .addImm(AbsOffset);
    MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
  }
  return MI;
}

int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator &MBBI,
                                     bool doMergeWithPrevious) const {
  if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
      (!doMergeWithPrevious && MBBI == MBB.end()))
    return 0;

  MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;

  PI = skipDebugInstructionsBackward(PI, MBB.begin());
  // It is assumed that an ADD/SUB/LEA instruction is succeeded by one CFI
  // instruction, and that there are no DBG_VALUE or other instructions between
  // the ADD/SUB/LEA and its corresponding CFI instruction.
  /* TODO: Add support for the case where there are multiple CFI instructions
    below the ADD/SUB/LEA, e.g.:
    ...
    add
    cfi_def_cfa_offset
    cfi_offset
    ...
  */
  if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction())
    PI = std::prev(PI);

  unsigned Opc = PI->getOpcode();
  int Offset = 0;

  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) &&
      PI->getOperand(0).getReg() == StackPtr) {
    assert(PI->getOperand(1).getReg() == StackPtr);
    Offset = PI->getOperand(2).getImm();
  } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
             PI->getOperand(0).getReg() == StackPtr &&
             PI->getOperand(1).getReg() == StackPtr &&
             PI->getOperand(2).getImm() == 1 &&
             PI->getOperand(3).getReg() == X86::NoRegister &&
             PI->getOperand(5).getReg() == X86::NoRegister) {
    // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
    Offset = PI->getOperand(4).getImm();
  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) &&
             PI->getOperand(0).getReg() == StackPtr) {
    assert(PI->getOperand(1).getReg() == StackPtr);
    Offset = -PI->getOperand(2).getImm();
  } else
    return 0;

  PI = MBB.erase(PI);
  if (PI != MBB.end() && PI->isCFIInstruction()) {
    auto CIs = MBB.getParent()->getFrameInstructions();
    MCCFIInstruction CI = CIs[PI->getOperand(0).getCFIIndex()];
    if (CI.getOperation() == MCCFIInstruction::OpDefCfaOffset ||
        CI.getOperation() == MCCFIInstruction::OpAdjustCfaOffset)
      PI = MBB.erase(PI);
  }
  if (!doMergeWithPrevious)
    MBBI = skipDebugInstructionsForward(PI, MBB.end());

  return Offset;
}
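// Sketch of the merge (illustrative): with MBBI just past
//   %rsp = SUB64ri32 %rsp, 16
// a call with doMergeWithPrevious == true erases the SUB (and a following
// cfi_def_cfa_offset, if present) and returns -16, which the caller then
// folds into the stack adjustment it is about to emit.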

void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator MBBI,
                                const DebugLoc &DL,
                                const MCCFIInstruction &CFIInst,
                                MachineInstr::MIFlag Flag) const {
  MachineFunction &MF = *MBB.getParent();
  unsigned CFIIndex = MF.addFrameInst(CFIInst);

  if (CFIInst.getOperation() == MCCFIInstruction::OpAdjustCfaOffset)
    MF.getInfo<X86MachineFunctionInfo>()->setHasCFIAdjustCfa(true);

  BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex)
      .setMIFlag(Flag);
}

/// Emits Dwarf Info specifying offsets of callee saved registers and
/// frame pointer. This is called only when basic block sections are enabled.
void X86FrameLowering::emitCalleeSavedFrameMovesFullCFA(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
  MachineFunction &MF = *MBB.getParent();
  if (!hasFP(MF)) {
    emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);
    return;
  }
  const MachineModuleInfo &MMI = MF.getMMI();
  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
  const Register FramePtr = TRI->getFrameRegister(MF);
  const Register MachineFramePtr =
      STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64))
                               : FramePtr;
  unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true);
  // Offset = space for return address + size of the frame pointer itself.
  unsigned Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 8 : 4);
  BuildCFI(MBB, MBBI, DebugLoc{},
           MCCFIInstruction::createOffset(nullptr, DwarfReg, -Offset));
  emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);
}

void X86FrameLowering::emitCalleeSavedFrameMoves(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    const DebugLoc &DL, bool IsPrologue) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineModuleInfo &MMI = MF.getMMI();
  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();

  // Add callee saved registers to move list.
  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();

  // Calculate offsets.
  for (const CalleeSavedInfo &I : CSI) {
    int64_t Offset = MFI.getObjectOffset(I.getFrameIdx());
    Register Reg = I.getReg();
    unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);

    if (IsPrologue) {
      if (X86FI->getStackPtrSaveMI()) {
        // +2*SlotSize because there is return address and ebp at the bottom
        // of the stack.
        // | retaddr |
        // | ebp     |
        // |         |<--ebp
        Offset += 2 * SlotSize;
        SmallString<64> CfaExpr;
        CfaExpr.push_back(dwarf::DW_CFA_expression);
        uint8_t buffer[16];
        CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
        CfaExpr.push_back(2);
        Register FramePtr = TRI->getFrameRegister(MF);
        const Register MachineFramePtr =
            STI.isTarget64BitILP32()
                ? Register(getX86SubSuperRegister(FramePtr, 64))
                : FramePtr;
        unsigned DwarfFramePtr = MRI->getDwarfRegNum(MachineFramePtr, true);
        CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfFramePtr));
        CfaExpr.append(buffer, buffer + encodeSLEB128(Offset, buffer));
        BuildCFI(MBB, MBBI, DL,
                 MCCFIInstruction::createEscape(nullptr, CfaExpr.str()),
                 MachineInstr::FrameSetup);
      } else {
        BuildCFI(MBB, MBBI, DL,
                 MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
      }
    } else {
      BuildCFI(MBB, MBBI, DL,
               MCCFIInstruction::createRestore(nullptr, DwarfReg));
    }
  }
  if (auto *MI = X86FI->getStackPtrSaveMI()) {
    int FI = MI->getOperand(1).getIndex();
    int64_t Offset = MFI.getObjectOffset(FI) + 2 * SlotSize;
    SmallString<64> CfaExpr;
    Register FramePtr = TRI->getFrameRegister(MF);
    const Register MachineFramePtr =
        STI.isTarget64BitILP32()
            ? Register(getX86SubSuperRegister(FramePtr, 64))
            : FramePtr;
    unsigned DwarfFramePtr = MRI->getDwarfRegNum(MachineFramePtr, true);
    CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfFramePtr));
    uint8_t buffer[16];
    CfaExpr.append(buffer, buffer + encodeSLEB128(Offset, buffer));
    CfaExpr.push_back(dwarf::DW_OP_deref);

    SmallString<64> DefCfaExpr;
    DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
    DefCfaExpr.append(buffer, buffer + encodeSLEB128(CfaExpr.size(), buffer));
    DefCfaExpr.append(CfaExpr.str());
    // DW_CFA_def_cfa_expression: DW_OP_breg5 offset, DW_OP_deref
    BuildCFI(MBB, MBBI, DL,
             MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str()),
             MachineInstr::FrameSetup);
  }
}

void X86FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
                                            MachineBasicBlock &MBB) const {
  const MachineFunction &MF = *MBB.getParent();

  // Insertion point.
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();

  // Fake a debug loc.
  DebugLoc DL;
  if (MBBI != MBB.end())
    DL = MBBI->getDebugLoc();

  // Zero out FP stack if referenced. Do this outside of the loop below so that
  // it's done only once.
  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
  for (MCRegister Reg : RegsToZero.set_bits()) {
    if (!X86::RFP80RegClass.contains(Reg))
      continue;

    unsigned NumFPRegs = ST.is64Bit() ? 8 : 7;
    for (unsigned i = 0; i != NumFPRegs; ++i)
      BuildMI(MBB, MBBI, DL, TII.get(X86::LD_F0));

    for (unsigned i = 0; i != NumFPRegs; ++i)
      BuildMI(MBB, MBBI, DL, TII.get(X86::ST_FPrr)).addReg(X86::ST0);
    break;
  }

  // For GPRs, we only care to clear out the 32-bit register.
  BitVector GPRsToZero(TRI->getNumRegs());
  for (MCRegister Reg : RegsToZero.set_bits())
    if (TRI->isGeneralPurposeRegister(MF, Reg)) {
      GPRsToZero.set(getX86SubSuperRegister(Reg, 32));
      RegsToZero.reset(Reg);
    }

  // Zero out the GPRs first.
  for (MCRegister Reg : GPRsToZero.set_bits())
    TII.buildClearRegister(Reg, MBB, MBBI, DL);

  // Zero out the remaining registers.
  for (MCRegister Reg : RegsToZero.set_bits())
    TII.buildClearRegister(Reg, MBB, MBBI, DL);
}

void X86FrameLowering::emitStackProbe(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog,
    std::optional<MachineFunction::DebugInstrOperandPair> InstrNum) const {
  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
  if (STI.isTargetWindowsCoreCLR()) {
    if (InProlog) {
      BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING))
          .addImm(0 /* no explicit stack size */);
    } else {
      emitStackProbeInline(MF, MBB, MBBI, DL, false);
    }
  } else {
    emitStackProbeCall(MF, MBB, MBBI, DL, InProlog, InstrNum);
  }
}

bool X86FrameLowering::stackProbeFunctionModifiesSP() const {
  return STI.isOSWindows() && !STI.isTargetWin64();
}

void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
                                        MachineBasicBlock &PrologMBB) const {
  auto Where = llvm::find_if(PrologMBB, [](MachineInstr &MI) {
    return MI.getOpcode() == X86::STACKALLOC_W_PROBING;
  });
  if (Where != PrologMBB.end()) {
    DebugLoc DL = PrologMBB.findDebugLoc(Where);
    emitStackProbeInline(MF, PrologMBB, Where, DL, true);
    Where->eraseFromParent();
  }
}

void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
                                            MachineBasicBlock &MBB,
                                            MachineBasicBlock::iterator MBBI,
                                            const DebugLoc &DL,
                                            bool InProlog) const {
  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
  if (STI.isTargetWindowsCoreCLR() && STI.is64Bit())
    emitStackProbeInlineWindowsCoreCLR64(MF, MBB, MBBI, DL, InProlog);
  else
    emitStackProbeInlineGeneric(MF, MBB, MBBI, DL, InProlog);
}

void X86FrameLowering::emitStackProbeInlineGeneric(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
  MachineInstr &AllocWithProbe = *MBBI;
  uint64_t Offset = AllocWithProbe.getOperand(0).getImm();

  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
  const X86TargetLowering &TLI = *STI.getTargetLowering();
  assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) &&
         "different expansion expected for CoreCLR 64 bit");

  const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
  uint64_t ProbeChunk = StackProbeSize * 8;

  uint64_t MaxAlign =
      TRI->hasStackRealignment(MF) ? calculateMaxStackAlign(MF) : 0;

  // Synthesize a loop or unroll it, depending on the number of iterations.
  // BuildStackAlignAND ensures that only MaxAlign % StackProbeSize bytes are
  // left between the unaligned rsp and the current rsp.
  if (Offset > ProbeChunk) {
    emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset,
                                    MaxAlign % StackProbeSize);
  } else {
    emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset,
                                     MaxAlign % StackProbeSize);
  }
}

void X86FrameLowering::emitStackProbeInlineGenericBlock(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
    uint64_t AlignOffset) const {

  const bool NeedsDwarfCFI = needsDwarfCFI(MF);
  const bool HasFP = hasFP(MF);
  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
  const X86TargetLowering &TLI = *STI.getTargetLowering();
  const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
  const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);

  uint64_t CurrentOffset = 0;

  assert(AlignOffset < StackProbeSize);

  // If the offset is so small it fits within a page, there's nothing to do;
  // otherwise allocate the first page and probe it.
  if (StackProbeSize < Offset + AlignOffset) {

    uint64_t StackAdjustment = StackProbeSize - AlignOffset;
    BuildStackAdjustment(MBB, MBBI, DL, -StackAdjustment, /*InEpilogue=*/false)
        .setMIFlag(MachineInstr::FrameSetup);
    if (!HasFP && NeedsDwarfCFI) {
      BuildCFI(
          MBB, MBBI, DL,
          MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
    }

    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
                     .setMIFlag(MachineInstr::FrameSetup),
                 StackPtr, false, 0)
        .addImm(0)
        .setMIFlag(MachineInstr::FrameSetup);
    NumFrameExtraProbe++;
    CurrentOffset = StackProbeSize - AlignOffset;
  }

  // For the next N - 1 pages, just probe. I tried to take advantage of
  // natural probes, but it implies much more logic and there were very few
  // interesting natural probes to interleave.
  while (CurrentOffset + StackProbeSize < Offset) {
    BuildStackAdjustment(MBB, MBBI, DL, -StackProbeSize, /*InEpilogue=*/false)
        .setMIFlag(MachineInstr::FrameSetup);

    if (!HasFP && NeedsDwarfCFI) {
      BuildCFI(
          MBB, MBBI, DL,
          MCCFIInstruction::createAdjustCfaOffset(nullptr, StackProbeSize));
    }
    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
                     .setMIFlag(MachineInstr::FrameSetup),
                 StackPtr, false, 0)
        .addImm(0)
        .setMIFlag(MachineInstr::FrameSetup);
    NumFrameExtraProbe++;
    CurrentOffset += StackProbeSize;
  }

  // No need to probe the tail, it is smaller than a page.
  uint64_t ChunkSize = Offset - CurrentOffset;
  if (ChunkSize == SlotSize) {
    // Use push for slot sized adjustments as a size optimization,
    // like emitSPUpdate does when not probing.
    unsigned Reg = Is64Bit ? X86::RAX : X86::EAX;
    unsigned Opc = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
    BuildMI(MBB, MBBI, DL, TII.get(Opc))
        .addReg(Reg, RegState::Undef)
        .setMIFlag(MachineInstr::FrameSetup);
  } else {
    BuildStackAdjustment(MBB, MBBI, DL, -ChunkSize, /*InEpilogue=*/false)
        .setMIFlag(MachineInstr::FrameSetup);
  }
  // No need to adjust the Dwarf CFA offset here; the final position of the
  // stack pointer has already been described.
}
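// Assuming a 4096-byte probe interval and AlignOffset == 0, a 12000-byte
// allocation expands roughly to (illustrative, not exact output):
//   subq $4096, %rsp ; movq $0, (%rsp)   ; first page
//   subq $4096, %rsp ; movq $0, (%rsp)   ; second page
//   subq $3808, %rsp                     ; tail, smaller than a page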

void X86FrameLowering::emitStackProbeInlineGenericLoop(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
    uint64_t AlignOffset) const {
  assert(Offset && "null offset");

  assert(MBB.computeRegisterLiveness(TRI, X86::EFLAGS, MBBI) !=
             MachineBasicBlock::LQR_Live &&
         "Inline stack probe loop will clobber live EFLAGS.");

  const bool NeedsDwarfCFI = needsDwarfCFI(MF);
  const bool HasFP = hasFP(MF);
  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
  const X86TargetLowering &TLI = *STI.getTargetLowering();
  const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
  const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);

  if (AlignOffset) {
    if (AlignOffset < StackProbeSize) {
      // Perform a first smaller allocation followed by a probe.
      BuildStackAdjustment(MBB, MBBI, DL, -AlignOffset, /*InEpilogue=*/false)
          .setMIFlag(MachineInstr::FrameSetup);

      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
                       .setMIFlag(MachineInstr::FrameSetup),
                   StackPtr, false, 0)
          .addImm(0)
          .setMIFlag(MachineInstr::FrameSetup);
      NumFrameExtraProbe++;
      Offset -= AlignOffset;
    }
  }

  // Synthesize a loop.
  NumFrameLoopProbe++;
  const BasicBlock *LLVM_BB = MBB.getBasicBlock();

  MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB);

  MachineFunction::iterator MBBIter = ++MBB.getIterator();
  MF.insert(MBBIter, testMBB);
  MF.insert(MBBIter, tailMBB);

  Register FinalStackProbed = Uses64BitFramePtr ? X86::R11
                              : Is64Bit         ? X86::R11D
                                                : X86::EAX;

  BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
      .addReg(StackPtr)
      .setMIFlag(MachineInstr::FrameSetup);

  // Save the loop bound.
  {
    const unsigned BoundOffset = alignDown(Offset, StackProbeSize);
    const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr);
    BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed)
        .addReg(FinalStackProbed)
        .addImm(BoundOffset)
        .setMIFlag(MachineInstr::FrameSetup);

    // While in the loop, use a loop-invariant register for CFI,
    // instead of the stack pointer, which changes during the loop.
    if (!HasFP && NeedsDwarfCFI) {
      // x32 uses the same DWARF register numbers as x86-64,
      // so there isn't a register number for r11d; we must use r11 instead.
      const Register DwarfFinalStackProbed =
          STI.isTarget64BitILP32()
              ? Register(getX86SubSuperRegister(FinalStackProbed, 64))
              : FinalStackProbed;

      BuildCFI(MBB, MBBI, DL,
               MCCFIInstruction::createDefCfaRegister(
                   nullptr, TRI->getDwarfRegNum(DwarfFinalStackProbed, true)));
      BuildCFI(MBB, MBBI, DL,
               MCCFIInstruction::createAdjustCfaOffset(nullptr, BoundOffset));
    }
  }

  // Allocate a page.
  BuildStackAdjustment(*testMBB, testMBB->end(), DL, -StackProbeSize,
                       /*InEpilogue=*/false)
      .setMIFlag(MachineInstr::FrameSetup);

  // Touch the page.
  addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc))
                   .setMIFlag(MachineInstr::FrameSetup),
               StackPtr, false, 0)
      .addImm(0)
      .setMIFlag(MachineInstr::FrameSetup);

  // Compare with the stack pointer bound.
  BuildMI(testMBB, DL, TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
      .addReg(StackPtr)
      .addReg(FinalStackProbed)
      .setMIFlag(MachineInstr::FrameSetup);

  // Jump.
  BuildMI(testMBB, DL, TII.get(X86::JCC_1))
      .addMBB(testMBB)
      .addImm(X86::COND_NE)
      .setMIFlag(MachineInstr::FrameSetup);
  testMBB->addSuccessor(testMBB);
  testMBB->addSuccessor(tailMBB);

  // BB management.
  tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end());
  tailMBB->transferSuccessorsAndUpdatePHIs(&MBB);
  MBB.addSuccessor(testMBB);

  // Handle the tail.
  const uint64_t TailOffset = Offset % StackProbeSize;
  MachineBasicBlock::iterator TailMBBIter = tailMBB->begin();
  if (TailOffset) {
    BuildStackAdjustment(*tailMBB, TailMBBIter, DL, -TailOffset,
                         /*InEpilogue=*/false)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // After the loop, switch back to the stack pointer for CFI.
  if (!HasFP && NeedsDwarfCFI) {
    // x32 uses the same DWARF register numbers as x86-64,
    // so there isn't a register number for esp; we must use rsp instead.
    const Register DwarfStackPtr =
        STI.isTarget64BitILP32()
            ? Register(getX86SubSuperRegister(StackPtr, 64))
            : Register(StackPtr);

    BuildCFI(*tailMBB, TailMBBIter, DL,
             MCCFIInstruction::createDefCfaRegister(
                 nullptr, TRI->getDwarfRegNum(DwarfStackPtr, true)));
  }

  // Update live-in information.
  fullyRecomputeLiveIns({tailMBB, testMBB});
}
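// The synthesized loop is roughly the following (illustrative, for a large
// offset on x86-64 with a 4096-byte probe interval):
//   movq %rsp, %r11
//   subq $BoundOffset, %r11        ; loop bound, Offset rounded down
// loop:
//   subq $4096, %rsp
//   movq $0, (%rsp)                ; touch the page
//   cmpq %r11, %rsp
//   jne  loop
//   subq $TailOffset, %rsp         ; remainder, smaller than a page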

void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
  assert(STI.is64Bit() && "different expansion needed for 32 bit");
  assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
  const TargetInstrInfo &TII = *STI.getInstrInfo();
  const BasicBlock *LLVM_BB = MBB.getBasicBlock();

  assert(MBB.computeRegisterLiveness(TRI, X86::EFLAGS, MBBI) !=
             MachineBasicBlock::LQR_Live &&
         "Inline stack probe loop will clobber live EFLAGS.");

  // RAX contains the number of bytes of desired stack adjustment.
  // The handling here assumes this value has already been updated so as to
  // maintain stack alignment.
  //
  // We need to exit with RSP modified by this amount and execute suitable
  // page touches to notify the OS that we're growing the stack responsibly.
  // All stack probing must be done without modifying RSP.
  //
  // MBB:
  //    SizeReg = RAX;
  //    ZeroReg = 0
  //    CopyReg = RSP
  //    Flags, TestReg = CopyReg - SizeReg
  //    FinalReg = !Flags.Ovf ? TestReg : ZeroReg
  //    LimitReg = gs magic thread env access
  //    if FinalReg >= LimitReg goto ContinueMBB
  // RoundBB:
  //    RoundReg = page address of FinalReg
  // LoopMBB:
  //    LoopReg = PHI(LimitReg,ProbeReg)
  //    ProbeReg = LoopReg - PageSize
  //    [ProbeReg] = 0
  //    if (ProbeReg > RoundReg) goto LoopMBB
  // ContinueMBB:
  //    RSP = RSP - RAX
  //    [rest of original MBB]

  // Set up the new basic blocks.
  MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);

  MachineFunction::iterator MBBIter = std::next(MBB.getIterator());
  MF.insert(MBBIter, RoundMBB);
  MF.insert(MBBIter, LoopMBB);
  MF.insert(MBBIter, ContinueMBB);

  // Split MBB and move the tail portion down to ContinueMBB.
  MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
  ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
  ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);

  // Some useful constants.
  const int64_t ThreadEnvironmentStackLimit = 0x10;
  const int64_t PageSize = 0x1000;
  const int64_t PageMask = ~(PageSize - 1);

  // Registers we need. For the normal case we use virtual
  // registers. For the prolog expansion we use RAX, RCX and RDX.
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const TargetRegisterClass *RegClass = &X86::GR64RegClass;
  const Register
      SizeReg = InProlog ? X86::RAX : MRI.createVirtualRegister(RegClass),
      ZeroReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
      CopyReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
      TestReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
      FinalReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
      RoundedReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
      LimitReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
      JoinReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
      ProbeReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass);

  // SP-relative offsets where we can save RCX and RDX.
  int64_t RCXShadowSlot = 0;
  int64_t RDXShadowSlot = 0;

  // If inlining in the prolog, save RCX and RDX.
  if (InProlog) {
    // Compute the offsets. We need to account for things already
    // pushed onto the stack at this point: return address, frame
    // pointer (if used), and callee saves.
    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
    const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
    const bool HasFP = hasFP(MF);

    // Check if we need to spill RCX and/or RDX.
    // Here we assume that no earlier prologue instruction changes RCX and/or
    // RDX, so checking the block live-ins is enough.
    const bool IsRCXLiveIn = MBB.isLiveIn(X86::RCX);
    const bool IsRDXLiveIn = MBB.isLiveIn(X86::RDX);
    int64_t InitSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);
    // Assign the initial slot to both registers, then change RDX's slot if
    // both need to be spilled.
    if (IsRCXLiveIn)
      RCXShadowSlot = InitSlot;
    if (IsRDXLiveIn)
      RDXShadowSlot = InitSlot;
    if (IsRDXLiveIn && IsRCXLiveIn)
      RDXShadowSlot += 8;
    // Emit the saves if needed.
    if (IsRCXLiveIn)
      addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
                   RCXShadowSlot)
          .addReg(X86::RCX);
    if (IsRDXLiveIn)
      addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
                   RDXShadowSlot)
          .addReg(X86::RDX);
  } else {
    // Not in the prolog. Copy RAX to a virtual reg.
    BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
  }

  // Add code to MBB to check for overflow and set the new target stack pointer
  // to zero if so.
  BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
      .addReg(ZeroReg, RegState::Undef)
      .addReg(ZeroReg, RegState::Undef);
  BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
  BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
      .addReg(CopyReg)
      .addReg(SizeReg);
  BuildMI(&MBB, DL, TII.get(X86::CMOV64rr), FinalReg)
      .addReg(TestReg)
      .addReg(ZeroReg)
      .addImm(X86::COND_B);

  // FinalReg now holds final stack pointer value, or zero if
  // allocation would overflow. Compare against the current stack
  // limit from the thread environment block. Note this limit is the
  // lowest touched page on the stack, not the point at which the OS
  // will cause an overflow exception, so this is just an optimization
  // to avoid unnecessarily touching pages that are below the current
  // SP but already committed to the stack by the OS.
  BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)
      .addReg(0)
      .addImm(1)
      .addReg(0)
      .addImm(ThreadEnvironmentStackLimit)
      .addReg(X86::GS);
  BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
  // Jump if the desired stack pointer is at or above the stack limit.
  BuildMI(&MBB, DL, TII.get(X86::JCC_1))
      .addMBB(ContinueMBB)
      .addImm(X86::COND_AE);

  // Add code to roundMBB to round the final stack pointer to a page boundary.
  RoundMBB->addLiveIn(FinalReg);
  BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)
      .addReg(FinalReg)
      .addImm(PageMask);
  BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);

  // LimitReg now holds the current stack limit, RoundedReg the page-rounded
  // final RSP value. Add code to loopMBB to decrement LimitReg page-by-page
  // and probe until we reach RoundedReg.
  if (!InProlog) {
    BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)
        .addReg(LimitReg)
        .addMBB(RoundMBB)
        .addReg(ProbeReg)
        .addMBB(LoopMBB);
  }

  LoopMBB->addLiveIn(JoinReg);
  addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,
               false, -PageSize);

  // Probe by storing a byte onto the stack.
  BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))
      .addReg(ProbeReg)
      .addImm(1)
      .addReg(0)
      .addImm(0)
      .addReg(0)
      .addImm(0);

  LoopMBB->addLiveIn(RoundedReg);
  BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
      .addReg(RoundedReg)
      .addReg(ProbeReg);
  BuildMI(LoopMBB, DL, TII.get(X86::JCC_1))
      .addMBB(LoopMBB)
      .addImm(X86::COND_NE);

  MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();

  // If in prolog, restore RDX and RCX.
  if (InProlog) {
    if (RCXShadowSlot) // It means we spilled RCX in the prologue.
      addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL,
                           TII.get(X86::MOV64rm), X86::RCX),
                   X86::RSP, false, RCXShadowSlot);
    if (RDXShadowSlot) // It means we spilled RDX in the prologue.
      addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL,
                           TII.get(X86::MOV64rm), X86::RDX),
                   X86::RSP, false, RDXShadowSlot);
  }

  // Now that the probing is done, add code to continueMBB to update
  // the stack pointer for real.
  ContinueMBB->addLiveIn(SizeReg);
  BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
      .addReg(X86::RSP)
      .addReg(SizeReg);

  // Add the control flow edges we need.
  MBB.addSuccessor(ContinueMBB);
  MBB.addSuccessor(RoundMBB);
  RoundMBB->addSuccessor(LoopMBB);
  LoopMBB->addSuccessor(ContinueMBB);
  LoopMBB->addSuccessor(LoopMBB);

  // Mark all the instructions added to the prolog as frame setup.
  if (InProlog) {
    for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {
      BeforeMBBI->setFlag(MachineInstr::FrameSetup);
    }
    for (MachineInstr &MI : *RoundMBB) {
      MI.setFlag(MachineInstr::FrameSetup);
    }
    for (MachineInstr &MI : *LoopMBB) {
      MI.setFlag(MachineInstr::FrameSetup);
    }
    for (MachineInstr &MI :
         llvm::make_range(ContinueMBB->begin(), ContinueMBBI)) {
      MI.setFlag(MachineInstr::FrameSetup);
    }
  }
}

void X86FrameLowering::emitStackProbeCall(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog,
    std::optional<MachineFunction::DebugInstrOperandPair> InstrNum) const {
  bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;

  // FIXME: Add indirect thunk support and remove this.
  if (Is64Bit && IsLargeCodeModel && STI.useIndirectThunkCalls())
    report_fatal_error("Emitting stack probe calls on 64-bit with the large "
                       "code model and indirect thunks not yet implemented.");

  assert(MBB.computeRegisterLiveness(TRI, X86::EFLAGS, MBBI) !=
             MachineBasicBlock::LQR_Live &&
         "Stack probe calls will clobber live EFLAGS.");

  unsigned CallOp;
  if (Is64Bit)
    CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
  else
    CallOp = X86::CALLpcrel32;

  StringRef Symbol = STI.getTargetLowering()->getStackProbeSymbolName(MF);

  MachineInstrBuilder CI;
  MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);

  // All current stack probes take AX and SP as input, clobber flags, and
  // preserve all registers. x86_64 probes leave RSP unmodified.
  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
    // For the large code model, we have to call through a register. Use R11,
    // as it is scratch in all supported calling conventions.
    BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
        .addExternalSymbol(MF.createExternalSymbolName(Symbol));
    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
  } else {
    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp))
             .addExternalSymbol(MF.createExternalSymbolName(Symbol));
  }

  unsigned AX = Uses64BitFramePtr ? X86::RAX : X86::EAX;
  unsigned SP = Uses64BitFramePtr ? X86::RSP : X86::ESP;
  CI.addReg(AX, RegState::Implicit)
      .addReg(SP, RegState::Implicit)
      .addReg(AX, RegState::Define | RegState::Implicit)
      .addReg(SP, RegState::Define | RegState::Implicit)
      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);

  MachineInstr *ModInst = CI;
  if (STI.isTargetWin64() || !STI.isOSWindows()) {
    // MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.
    // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
    // themselves. They also do not clobber %rax, so we can reuse it when
    // adjusting %rsp.
    // All other platforms do not specify a particular ABI for the stack probe
    // function, so we arbitrarily define it to not adjust %esp/%rsp itself.
    ModInst =
        BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP)
            .addReg(SP)
            .addReg(AX);
  }

  // DebugInfo variable locations -- if there's an instruction number for the
  // allocation (i.e., DYN_ALLOC_*), substitute it for the instruction that
  // modifies SP.
  if (InstrNum) {
    if (STI.isTargetWin64() || !STI.isOSWindows()) {
      // Label destination operand of the subtract.
      MF.makeDebugValueSubstitution(*InstrNum,
                                    {ModInst->getDebugInstrNum(), 0});
    } else {
      // Label the call. The operand number is the penultimate operand, zero
      // based.
      unsigned SPDefOperand = ModInst->getNumOperands() - 2;
      MF.makeDebugValueSubstitution(
          *InstrNum, {ModInst->getDebugInstrNum(), SPDefOperand});
    }
  }

  if (InProlog) {
    // Apply the frame setup flag to all inserted instrs.
    for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
      ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
  }
}
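// The net effect on Win64, for instance, is roughly (illustrative; RAX is
// loaded with the allocation size by the caller of this helper):
//   callq __chkstk                 ; probes the pages, leaves RSP unmodified
//   subq  %rax, %rsp               ; then actually allocate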

static unsigned calculateSetFPREG(uint64_t SPAdjust) {
  // Win64 ABI has a less restrictive limitation of 240; 128 works equally well
  // and might require smaller successive adjustments.
  const uint64_t Win64MaxSEHOffset = 128;
  uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset);
  // Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode.
  return SEHFrameOffset & -16;
}
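// For example, calculateSetFPREG(40) == 32 (clamped to 40, then rounded down
// to a multiple of 16), while calculateSetFPREG(1000) == 128 (clamped to the
// 128-byte cap, which is already 16-byte aligned).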

// If we're forcing a stack realignment we can't rely on just the frame
// info; we need to know the ABI stack alignment as well in case we have a
// call out. Otherwise just make sure we have some alignment - we'll go
// with the minimum SlotSize.
uint64_t
X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  Align MaxAlign = MFI.getMaxAlign(); // Desired stack alignment.
  Align StackAlign = getStackAlign();
  bool HasRealign = MF.getFunction().hasFnAttribute("stackrealign");
  if (HasRealign) {
    if (MFI.hasCalls())
      MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
    else if (MaxAlign < SlotSize)
      MaxAlign = Align(SlotSize);
  }

  if (!Is64Bit && MF.getFunction().getCallingConv() == CallingConv::X86_INTR) {
    if (HasRealign)
      MaxAlign = (MaxAlign > 16) ? MaxAlign : Align(16);
    else
      MaxAlign = Align(16);
  }
  return MaxAlign.value();
}
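// For instance (illustrative): with the "stackrealign" attribute, calls in
// the function, a 16-byte ABI stack alignment, and no object demanding more,
// this returns 16; a 32-byte-aligned object would raise it to 32.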

void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator MBBI,
                                          const DebugLoc &DL, unsigned Reg,
                                          uint64_t MaxAlign) const {
  uint64_t Val = -MaxAlign;
  unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val);

  MachineFunction &MF = *MBB.getParent();
  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
  const X86TargetLowering &TLI = *STI.getTargetLowering();
  const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
  const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);

  // We want to make sure that (in the worst case) less than StackProbeSize
  // bytes are left unprobed after the AND. This assumption is used in
  // emitStackProbeInlineGeneric.
  if (Reg == StackPtr && EmitInlineStackProbe && MaxAlign >= StackProbeSize) {
    {
      NumFrameLoopProbe++;
      MachineBasicBlock *entryMBB =
          MF.CreateMachineBasicBlock(MBB.getBasicBlock());
      MachineBasicBlock *headMBB =
          MF.CreateMachineBasicBlock(MBB.getBasicBlock());
      MachineBasicBlock *bodyMBB =
          MF.CreateMachineBasicBlock(MBB.getBasicBlock());
      MachineBasicBlock *footMBB =
          MF.CreateMachineBasicBlock(MBB.getBasicBlock());

      MachineFunction::iterator MBBIter = MBB.getIterator();
      MF.insert(MBBIter, entryMBB);
      MF.insert(MBBIter, headMBB);
      MF.insert(MBBIter, bodyMBB);
      MF.insert(MBBIter, footMBB);
      const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
      Register FinalStackProbed = Uses64BitFramePtr ? X86::R11
                                  : Is64Bit         ? X86::R11D
                                                    : X86::EAX;

      // Setup entry block
      {

        entryMBB->splice(entryMBB->end(), &MBB, MBB.begin(), MBBI);
        BuildMI(entryMBB, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
            .addReg(StackPtr)
            .setMIFlag(MachineInstr::FrameSetup);
        MachineInstr *MI =
            BuildMI(entryMBB, DL, TII.get(AndOp), FinalStackProbed)
                .addReg(FinalStackProbed)
                .addImm(Val)
                .setMIFlag(MachineInstr::FrameSetup);

        // The EFLAGS implicit def is dead.
        MI->getOperand(3).setIsDead();

        BuildMI(entryMBB, DL,
                TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
            .addReg(FinalStackProbed)
            .addReg(StackPtr)
            .setMIFlag(MachineInstr::FrameSetup);
        BuildMI(entryMBB, DL, TII.get(X86::JCC_1))
            .addMBB(&MBB)
            .addImm(X86::COND_E)
            .setMIFlag(MachineInstr::FrameSetup);
        entryMBB->addSuccessor(headMBB);
        entryMBB->addSuccessor(&MBB);
      }

      // Loop entry block

      {
        const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr);
        BuildMI(headMBB, DL, TII.get(SUBOpc), StackPtr)
            .addReg(StackPtr)
            .addImm(StackProbeSize)
            .setMIFlag(MachineInstr::FrameSetup);

        BuildMI(headMBB, DL,
                TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
            .addReg(StackPtr)
            .addReg(FinalStackProbed)
            .setMIFlag(MachineInstr::FrameSetup);

        // jump to the footer if StackPtr < FinalStackProbed
        BuildMI(headMBB, DL, TII.get(X86::JCC_1))
            .addMBB(footMBB)
            .addImm(X86::COND_B)
            .setMIFlag(MachineInstr::FrameSetup);

        headMBB->addSuccessor(bodyMBB);
        headMBB->addSuccessor(footMBB);
      }

      // setup loop body
      {
        addRegOffset(BuildMI(bodyMBB, DL, TII.get(MovMIOpc))
                         .setMIFlag(MachineInstr::FrameSetup),
                     StackPtr, false, 0)
            .addImm(0)
            .setMIFlag(MachineInstr::FrameSetup);

        const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr);
        BuildMI(bodyMBB, DL, TII.get(SUBOpc), StackPtr)
            .addReg(StackPtr)
            .addImm(StackProbeSize)
            .setMIFlag(MachineInstr::FrameSetup);

        // cmp with stack pointer bound
        BuildMI(bodyMBB, DL,
                TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
            .addReg(FinalStackProbed)
            .addReg(StackPtr)
            .setMIFlag(MachineInstr::FrameSetup);

        // jump back while FinalStackProbed < StackPtr
        BuildMI(bodyMBB, DL, TII.get(X86::JCC_1))
            .addMBB(bodyMBB)
            .addImm(X86::COND_B)
            .setMIFlag(MachineInstr::FrameSetup);
        bodyMBB->addSuccessor(bodyMBB);
        bodyMBB->addSuccessor(footMBB);
      }

      // setup loop footer
      {
        BuildMI(footMBB, DL, TII.get(TargetOpcode::COPY), StackPtr)
            .addReg(FinalStackProbed)
            .setMIFlag(MachineInstr::FrameSetup);
        addRegOffset(BuildMI(footMBB, DL, TII.get(MovMIOpc))
                         .setMIFlag(MachineInstr::FrameSetup),
                     StackPtr, false, 0)
            .addImm(0)
            .setMIFlag(MachineInstr::FrameSetup);
        footMBB->addSuccessor(&MBB);
      }

      fullyRecomputeLiveIns({footMBB, bodyMBB, headMBB, &MBB});
    }
  } else {
    MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)
                           .addReg(Reg)
                           .addImm(Val)
                           .setMIFlag(MachineInstr::FrameSetup);

    // The EFLAGS implicit def is dead.
    MI->getOperand(3).setIsDead();
  }
}

bool X86FrameLowering::has128ByteRedZone(const MachineFunction &MF) const {
  // x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be
  // clobbered by any interrupt handler.
  assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
         "MF used frame lowering for wrong subtarget");
  const Function &Fn = MF.getFunction();
  const bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv());
  return Is64Bit && !IsWin64CC && !Fn.hasFnAttribute(Attribute::NoRedZone);
}

/// Return true if we need to use the restricted Windows x64 prologue and
/// epilogue code patterns that can be described with WinCFI (.seh_*
/// directives).
bool X86FrameLowering::isWin64Prologue(const MachineFunction &MF) const {
  return MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
}

bool X86FrameLowering::needsDwarfCFI(const MachineFunction &MF) const {
  return !isWin64Prologue(MF) && MF.needsFrameMoves();
}

/// Return true if an opcode is part of the REP group of instructions.
static bool isOpcodeRep(unsigned Opcode) {
  switch (Opcode) {
  case X86::REPNE_PREFIX:
  case X86::REP_MOVSB_32:
  case X86::REP_MOVSB_64:
  case X86::REP_MOVSD_32:
  case X86::REP_MOVSD_64:
  case X86::REP_MOVSQ_32:
  case X86::REP_MOVSQ_64:
  case X86::REP_MOVSW_32:
  case X86::REP_MOVSW_64:
  case X86::REP_PREFIX:
  case X86::REP_STOSB_32:
  case X86::REP_STOSB_64:
  case X86::REP_STOSD_32:
  case X86::REP_STOSD_64:
  case X86::REP_STOSQ_32:
  case X86::REP_STOSQ_64:
  case X86::REP_STOSW_32:
  case X86::REP_STOSW_64:
    return true;
  default:
    break;
  }
  return false;
}
1441
1442/// emitPrologue - Push callee-saved registers onto the stack, which
1443/// automatically adjust the stack pointer. Adjust the stack pointer to allocate
1444/// space for local variables. Also emit labels used by the exception handler to
1445/// generate the exception handling frames.
1446
1447/*
1448 Here's a gist of what gets emitted:
1449
1450 ; Establish frame pointer, if needed
1451 [if needs FP]
1452 push %rbp
1453 .cfi_def_cfa_offset 16
1454 .cfi_offset %rbp, -16
1455 .seh_pushreg %rpb
1456 mov %rsp, %rbp
1457 .cfi_def_cfa_register %rbp
1458
1459 ; Spill general-purpose registers
1460 [for all callee-saved GPRs]
1461 pushq %<reg>
1462 [if not needs FP]
1463 .cfi_def_cfa_offset (offset from RETADDR)
1464 .seh_pushreg %<reg>
1465
1466 ; If the required stack alignment > default stack alignment
1467 ; rsp needs to be re-aligned. This creates a "re-alignment gap"
1468 ; of unknown size in the stack frame.
1469 [if stack needs re-alignment]
1470 and $MASK, %rsp
1471
1472 ; Allocate space for locals
1473 [if target is Windows and allocated space > 4096 bytes]
1474 ; Windows needs special care for allocations larger
1475 ; than one page.
1476 mov $NNN, %rax
1477 call ___chkstk_ms/___chkstk
1478 sub %rax, %rsp
1479 [else]
1480 sub $NNN, %rsp
1481
1482 [if needs FP]
1483 .seh_stackalloc (size of XMM spill slots)
1484 .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
1485 [else]
1486 .seh_stackalloc NNN
1487
1488 ; Spill XMMs
1489 ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved,
1490 ; they may get spilled on any platform, if the current function
1491 ; calls @llvm.eh.unwind.init
1492 [if needs FP]
1493 [for all callee-saved XMM registers]
1494 movaps %<xmm reg>, -MMM(%rbp)
1495 [for all callee-saved XMM registers]
1496 .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
1497 ; i.e. the offset relative to (%rbp - SEHFrameOffset)
1498 [else]
1499 [for all callee-saved XMM registers]
1500 movaps %<xmm reg>, KKK(%rsp)
1501 [for all callee-saved XMM registers]
1502 .seh_savexmm %<xmm reg>, KKK
1503
1504 .seh_endprologue
1505
1506 [if needs base pointer]
1507 mov %rsp, %rbx
1508 [if needs to restore base pointer]
1509 mov %rsp, -MMM(%rbp)
1510
1511 ; Emit CFI info
1512 [if needs FP]
1513 [for all callee-saved registers]
1514 .cfi_offset %<reg>, (offset from %rbp)
1515 [else]
1516 .cfi_def_cfa_offset (offset from RETADDR)
1517 [for all callee-saved registers]
1518 .cfi_offset %<reg>, (offset from %rsp)
1519
1520 Notes:
1521 - .seh directives are emitted only for Windows 64 ABI
1522 - .cv_fpo directives are emitted on win32 when emitting CodeView
1523 - .cfi directives are emitted for all other ABIs
1524 - for 32-bit code, substitute %e?? registers for %r??
1525*/
void X86FrameLowering::emitPrologue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
  assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
         "MF used frame lowering for wrong subtarget");
  MachineBasicBlock::iterator MBBI = MBB.begin();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const Function &Fn = MF.getFunction();
  MachineModuleInfo &MMI = MF.getMMI();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
  uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.
  bool IsFunclet = MBB.isEHFuncletEntry();
  EHPersonality Personality = EHPersonality::Unknown;
  if (Fn.hasPersonalityFn())
    Personality = classifyEHPersonality(Fn.getPersonalityFn());
  bool FnHasClrFunclet =
      MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR;
  bool IsClrFunclet = IsFunclet && FnHasClrFunclet;
  bool HasFP = hasFP(MF);
  bool IsWin64Prologue = isWin64Prologue(MF);
  bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry();
  // FIXME: Emit FPO data for EH funclets.
  bool NeedsWinFPO =
      !IsFunclet && STI.isTargetWin32() && MMI.getModule()->getCodeViewFlag();
  bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO;
  bool NeedsDwarfCFI = needsDwarfCFI(MF);
  Register FramePtr = TRI->getFrameRegister(MF);
  const Register MachineFramePtr =
      STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64))
                               : FramePtr;
  Register BasePtr = TRI->getBaseRegister();
  bool HasWinCFI = false;

  // Debug location must be unknown since the first debug location is used
  // to determine the end of the prologue.
  DebugLoc DL;
  Register ArgBaseReg;

  // Emit an extra prologue for argument stack slot references.
  if (auto *MI = X86FI->getStackPtrSaveMI()) {
    // MI is the LEA instruction created in X86ArgumentStackSlotPass.
    // Create an extra prologue for stack realignment.
    ArgBaseReg = MI->getOperand(0).getReg();
    // leal 4(%esp), %basereg
    // .cfi_def_cfa %basereg, 0
    // andl $-128, %esp
    // pushl -4(%basereg)
    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::LEA64r : X86::LEA32r),
            ArgBaseReg)
        .addUse(StackPtr)
        .addImm(1)
        .addUse(X86::NoRegister)
        .addImm(SlotSize)
        .addUse(X86::NoRegister)
        .setMIFlag(MachineInstr::FrameSetup);
    if (NeedsDwarfCFI) {
      // .cfi_def_cfa %basereg, 0
      unsigned DwarfStackPtr = TRI->getDwarfRegNum(ArgBaseReg, true);
      BuildCFI(MBB, MBBI, DL,
               MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, 0),
               MachineInstr::FrameSetup);
    }
    BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);
    int64_t Offset = -(int64_t)SlotSize;
    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm))
        .addReg(ArgBaseReg)
        .addImm(1)
        .addReg(X86::NoRegister)
        .addImm(Offset)
        .addReg(X86::NoRegister)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // Space reserved for stack-based arguments when making an (ABI-guaranteed)
  // tail call.
  unsigned TailCallArgReserveSize = -X86FI->getTCReturnAddrDelta();
  if (TailCallArgReserveSize && IsWin64Prologue)
    report_fatal_error("Can't handle guaranteed tail call under win64 yet");

  const bool EmitStackProbeCall =
      STI.getTargetLowering()->hasStackProbeSymbol(MF);
  unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF);

  if (HasFP && X86FI->hasSwiftAsyncContext()) {
    switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
    case SwiftAsyncFramePointerMode::DeploymentBased:
      if (STI.swiftAsyncContextIsDynamicallySet()) {
        // The special symbol below is absolute and has a *value* suitable to
        // be combined with the frame pointer directly.
        BuildMI(MBB, MBBI, DL, TII.get(X86::OR64rm), MachineFramePtr)
            .addUse(MachineFramePtr)
            .addUse(X86::RIP)
            .addImm(1)
            .addUse(X86::NoRegister)
            .addExternalSymbol("swift_async_extendedFramePointerFlags",
                               X86II::MO_GOTPCREL)
            .addUse(X86::NoRegister);
        break;
      }
      [[fallthrough]];

    case SwiftAsyncFramePointerMode::Always:
      assert(
          !IsWin64Prologue &&
          "win64 prologue does not set the bit 60 in the saved frame pointer");
      BuildMI(MBB, MBBI, DL, TII.get(X86::BTS64ri8), MachineFramePtr)
          .addUse(MachineFramePtr)
          .addImm(60)
          .setMIFlag(MachineInstr::FrameSetup);
      break;

    case SwiftAsyncFramePointerMode::Never:
      break;
    }
  }

  // Re-align the stack on 64-bit if the x86-interrupt calling convention is
  // used and an error code was pushed, since the x86-64 ABI requires a 16-byte
  // stack alignment.
  if (Fn.getCallingConv() == CallingConv::X86_INTR && Is64Bit &&
      Fn.arg_size() == 2) {
    StackSize += 8;
    MFI.setStackSize(StackSize);

    // Update the stack pointer by pushing a register. This is the instruction
    // that would end up being emitted by a call to `emitSPUpdate`.
    // Hard-coding the update to a push avoids emitting a second
    // `STACKALLOC_W_PROBING` instruction in the save block: We know that stack
    // probing isn't needed anyway for an 8-byte update.
    // Pushing a register leaves us in a similar situation to a regular
    // function call where we know that the address at (rsp-8) is writeable.
    // That way we avoid any off-by-ones with stack probing for additional
    // stack pointer updates later on.
    BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
        .addReg(X86::RAX, RegState::Undef)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If this is x86-64, the Red Zone is not disabled, we are a leaf function,
  // we use up to 128 bytes of stack space, and don't have a frame pointer,
  // calls, or dynamic allocas, then we do not need to adjust the stack
  // pointer (we fit in the Red Zone). We also check that we don't push and
  // pop from the stack.
  if (has128ByteRedZone(MF) && !TRI->hasStackRealignment(MF) &&
      !MFI.hasVarSizedObjects() &&             // No dynamic alloca.
      !MFI.adjustsStack() &&                   // No calls.
      !EmitStackProbeCall &&                   // No stack probes.
      !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
      !MF.shouldSplitStack()) {                // Regular stack
    uint64_t MinSize =
        X86FI->getCalleeSavedFrameSize() - X86FI->getTCReturnAddrDelta();
    if (HasFP)
      MinSize += SlotSize;
    X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
    StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
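    // For example (illustrative): with StackSize == 136, no callee-saved
    // registers, and no frame pointer, MinSize == 0 and StackSize shrinks to
    // 8, leaving the first 128 bytes of locals in the red zone.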
    MFI.setStackSize(StackSize);
  }

  // Insert a stack pointer adjustment for later moving of the return addr.
  // Only applies to tail call optimized functions where the callee argument
  // stack size is bigger than the caller's.
  if (TailCallArgReserveSize != 0) {
    BuildStackAdjustment(MBB, MBBI, DL, -(int)TailCallArgReserveSize,
                         /*InEpilogue=*/false)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // Mapping for machine moves:
  //
  //   DST: VirtualFP AND
  //        SRC: VirtualFP              => DW_CFA_def_cfa_offset
  //        ELSE                        => DW_CFA_def_cfa
  //
  //   SRC: VirtualFP AND
  //        DST: Register               => DW_CFA_def_cfa_register
  //
  //   ELSE
  //        OFFSET < 0                  => DW_CFA_offset_extended_sf
  //        REG < 64                    => DW_CFA_offset + Reg
  //        ELSE                        => DW_CFA_offset_extended

  uint64_t NumBytes = 0;
  int stackGrowth = -SlotSize;

  // Find the funclet establisher parameter.
  Register Establisher = X86::NoRegister;
  if (IsClrFunclet)
    Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX;
  else if (IsFunclet)
    Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX;

  if (IsWin64Prologue && IsFunclet && !IsClrFunclet) {
    // Immediately spill the establisher into the home slot.
    // The runtime cares about this.
    // MOV64mr %rdx, 16(%rsp)
    unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16)
        .addReg(Establisher)
        .setMIFlag(MachineInstr::FrameSetup);
    MBB.addLiveIn(Establisher);
  }

  if (HasFP) {
    assert(MF.getRegInfo().isReserved(MachineFramePtr) && "FP reserved");

    // Calculate the required stack adjustment.
    uint64_t FrameSize = StackSize - SlotSize;
    NumBytes =
        FrameSize - (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize);

    // Callee-saved registers are pushed onto the stack before the stack is
    // realigned.
    if (TRI->hasStackRealignment(MF) && !IsWin64Prologue)
      NumBytes = alignTo(NumBytes, MaxAlign);

    // Save EBP/RBP into the appropriate stack slot.
    BuildMI(MBB, MBBI, DL,
            TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>())))
        .addReg(MachineFramePtr, RegState::Kill)
        .setMIFlag(MachineInstr::FrameSetup);

    if (NeedsDwarfCFI && !ArgBaseReg.isValid()) {
      // Mark the place where EBP/RBP was saved.
      // Define the current CFA rule to use the provided offset.
      assert(StackSize);
      BuildCFI(MBB, MBBI, DL,
               MCCFIInstruction::cfiDefCfaOffset(
                   nullptr, -2 * stackGrowth + (int)TailCallArgReserveSize),
               MachineInstr::FrameSetup);

      // Change the rule for the FramePtr to be an "offset" rule.
      unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
      BuildCFI(MBB, MBBI, DL,
               MCCFIInstruction::createOffset(nullptr, DwarfFramePtr,
                                              2 * stackGrowth -
                                                  (int)TailCallArgReserveSize),
               MachineInstr::FrameSetup);
    }

    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
          .addImm(FramePtr)
          .setMIFlag(MachineInstr::FrameSetup);
    }

    if (!IsFunclet) {
      if (X86FI->hasSwiftAsyncContext()) {
        assert(!IsWin64Prologue &&
               "win64 prologue does not store async context right below rbp");
        const auto &Attrs = MF.getFunction().getAttributes();

        // Before we update the live frame pointer we have to ensure there's
        // a valid (or null) asynchronous context in its slot just before FP
        // in the frame record, so store it now.
        if (Attrs.hasAttrSomewhere(Attribute::SwiftAsync)) {
          // We have an initial context in r14, store it just before the
          // frame pointer.
          MBB.addLiveIn(X86::R14);
          BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
              .addReg(X86::R14)
              .setMIFlag(MachineInstr::FrameSetup);
        } else {
          // No initial context, store null so that there's no pointer that
          // could be misused.
          BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64i32))
              .addImm(0)
              .setMIFlag(MachineInstr::FrameSetup);
        }

        if (NeedsWinCFI) {
          HasWinCFI = true;
          BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
              .addImm(X86::R14)
              .setMIFlag(MachineInstr::FrameSetup);
        }

        BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr)
            .addUse(X86::RSP)
            .addImm(1)
            .addUse(X86::NoRegister)
            .addImm(8)
            .addUse(X86::NoRegister)
            .setMIFlag(MachineInstr::FrameSetup);
        BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64ri32), X86::RSP)
            .addUse(X86::RSP)
            .addImm(8)
            .setMIFlag(MachineInstr::FrameSetup);
      }

      if (!IsWin64Prologue && !IsFunclet) {
        // Update EBP with the new base value.
        if (!X86FI->hasSwiftAsyncContext())
          BuildMI(MBB, MBBI, DL,
                  TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
                  FramePtr)
              .addReg(StackPtr)
              .setMIFlag(MachineInstr::FrameSetup);

        if (NeedsDwarfCFI) {
          if (ArgBaseReg.isValid()) {
            SmallString<64> CfaExpr;
            CfaExpr.push_back(dwarf::DW_CFA_expression);
            uint8_t buffer[16];
            unsigned DwarfReg = TRI->getDwarfRegNum(MachineFramePtr, true);
            CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
            CfaExpr.push_back(2);
            CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
            CfaExpr.push_back(0);
            // DW_CFA_expression: reg5 DW_OP_breg5 +0
            BuildCFI(MBB, MBBI, DL,
                     MCCFIInstruction::createEscape(nullptr, CfaExpr.str()),
                     MachineInstr::FrameSetup);
          } else {
            // Mark the effective beginning of when the frame pointer becomes
            // valid. Define the current CFA to use the EBP/RBP register.
            unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
            BuildCFI(
                MBB, MBBI, DL,
                MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr),
                MachineInstr::FrameSetup);
          }
        }

        if (NeedsWinFPO) {
          // .cv_fpo_setframe $FramePtr
          HasWinCFI = true;
          BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
              .addImm(FramePtr)
              .addImm(0)
              .setMIFlag(MachineInstr::FrameSetup);
        }
      }
    }
  } else {
    assert(!IsFunclet && "funclets without FPs not yet implemented");
    NumBytes =
        StackSize - (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize);
  }

  // Update the offset adjustment, which is mainly used by codeview to
  // translate from ESP to VFRAME relative local variable offsets.
  if (!IsFunclet) {
    if (HasFP && TRI->hasStackRealignment(MF))
      MFI.setOffsetAdjustment(-NumBytes);
    else
      MFI.setOffsetAdjustment(-StackSize);
  }

  // For EH funclets, only allocate enough space for outgoing calls. Save the
  // NumBytes value that we would've used for the parent frame.
  unsigned ParentFrameNumBytes = NumBytes;
  if (IsFunclet)
    NumBytes = getWinEHFuncletFrameSize(MF);

  // Skip the callee-saved push instructions.
  bool PushedRegs = false;
  int StackOffset = 2 * stackGrowth;
  MachineBasicBlock::const_iterator LastCSPush = MBBI;
  auto IsCSPush = [&](const MachineBasicBlock::iterator &MBBI) {
    if (MBBI == MBB.end() || !MBBI->getFlag(MachineInstr::FrameSetup))
      return false;
    unsigned Opc = MBBI->getOpcode();
    return Opc == X86::PUSH32r || Opc == X86::PUSH64r || Opc == X86::PUSHP64r ||
           Opc == X86::PUSH2 || Opc == X86::PUSH2P;
  };

  while (IsCSPush(MBBI)) {
    PushedRegs = true;
    Register Reg = MBBI->getOperand(0).getReg();
    LastCSPush = MBBI;
    ++MBBI;
    unsigned Opc = LastCSPush->getOpcode();

    if (!HasFP && NeedsDwarfCFI) {
      // Mark the callee-saved push instruction.
      // Define the current CFA rule to use the provided offset.
      assert(StackSize);
      // Compared to push, push2 introduces more stack offset (one more
      // register).
      if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)
        StackOffset += stackGrowth;
      BuildCFI(MBB, MBBI, DL,
               MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset),
               MachineInstr::FrameSetup);
      StackOffset += stackGrowth;
    }

    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
          .addImm(Reg)
          .setMIFlag(MachineInstr::FrameSetup);
      if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)
        BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
            .addImm(LastCSPush->getOperand(1).getReg())
            .setMIFlag(MachineInstr::FrameSetup);
    }
  }

  // Realign the stack after we pushed callee-saved registers (so that we'll
  // be able to calculate their offsets from the frame pointer).
  // Don't do this for Win64; it needs to realign the stack after the
  // prologue.
  if (!IsWin64Prologue && !IsFunclet && TRI->hasStackRealignment(MF) &&
      !ArgBaseReg.isValid()) {
    assert(HasFP && "There should be a frame pointer if stack is realigned.");
    BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);

    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlign))
          .addImm(MaxAlign)
          .setMIFlag(MachineInstr::FrameSetup);
    }
  }

  // If there is a SUB32ri of ESP immediately before this instruction, merge
  // the two. This can be the case when tail call elimination is enabled and
  // the callee has more arguments than the caller.
  NumBytes -= mergeSPUpdates(MBB, MBBI, true);

  // Adjust stack pointer: ESP -= numbytes.

  // Windows and cygwin/mingw require a prologue helper routine when
  // allocating more than 4K bytes on the stack. Windows uses __chkstk and
  // cygwin/mingw uses __alloca. __alloca and the 32-bit version of __chkstk
  // will probe the stack and adjust the stack pointer in one go. The 64-bit
  // version of __chkstk is only responsible for probing the stack; the
  // 64-bit prologue is responsible for adjusting the stack pointer. Touching
  // the stack at 4K increments is necessary to ensure that the guard pages
  // used by the OS virtual memory manager are allocated in the correct
  // sequence.
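  //
  // For example (illustrative), allocating 12 KiB on Win64 ends up as:
  //   movl $12288, %eax
  //   callq __chkstk      ; probes each page, leaves RSP untouched
  //   subq %rax, %rsp     ; the prologue performs the actual adjustment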
  uint64_t AlignedNumBytes = NumBytes;
  if (IsWin64Prologue && !IsFunclet && TRI->hasStackRealignment(MF))
    AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
  if (AlignedNumBytes >= StackProbeSize && EmitStackProbeCall) {
    assert(!X86FI->getUsesRedZone() &&
           "The Red Zone is not accounted for in stack probes");

    // Check whether EAX is live in for this block.
    bool isEAXAlive = isEAXLiveIn(MBB);

    if (isEAXAlive) {
      if (Is64Bit) {
        // Save RAX
        BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
            .addReg(X86::RAX, RegState::Kill)
            .setMIFlag(MachineInstr::FrameSetup);
      } else {
        // Save EAX
        BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
            .addReg(X86::EAX, RegState::Kill)
            .setMIFlag(MachineInstr::FrameSetup);
      }
    }

    if (Is64Bit) {
      // Handle the 64-bit Windows ABI case where we need to call __chkstk.
      // The function prologue is responsible for adjusting the stack pointer.
      int64_t Alloc = isEAXAlive ? NumBytes - 8 : NumBytes;
      BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Alloc)), X86::RAX)
          .addImm(Alloc)
          .setMIFlag(MachineInstr::FrameSetup);
    } else {
      // Allocate NumBytes-4 bytes on the stack in case of isEAXAlive.
      // We'll also use 4 already allocated bytes for EAX.
      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
          .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
          .setMIFlag(MachineInstr::FrameSetup);
    }

    // Call __chkstk, __chkstk_ms, or __alloca.
    emitStackProbe(MF, MBB, MBBI, DL, true);

    if (isEAXAlive) {
      // Restore RAX/EAX
      MachineInstr *MI;
      if (Is64Bit)
        MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV64rm), X86::RAX),
                          StackPtr, false, NumBytes - 8);
      else
        MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
                          StackPtr, false, NumBytes - 4);
      MI->setFlag(MachineInstr::FrameSetup);
      MBB.insert(MBBI, MI);
    }
  } else if (NumBytes) {
    emitSPUpdate(MBB, MBBI, DL, -(int64_t)NumBytes, /*InEpilogue=*/false);
  }

  if (NeedsWinCFI && NumBytes) {
    HasWinCFI = true;
    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
        .addImm(NumBytes)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  int SEHFrameOffset = 0;
  unsigned SPOrEstablisher;
  if (IsFunclet) {
    if (IsClrFunclet) {
      // The establisher parameter passed to a CLR funclet is actually a
      // pointer to the (mostly empty) frame of its nearest enclosing
      // funclet; we have to find the root function establisher frame by
      // loading the PSPSym from the intermediate frame.
      unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
      MachinePointerInfo NoInfo;
      MBB.addLiveIn(Establisher);
      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher),
                   Establisher, false, PSPSlotOffset)
          .addMemOperand(MF.getMachineMemOperand(
              NoInfo, MachineMemOperand::MOLoad, SlotSize, Align(SlotSize)));
      // Save the root establisher back into the current funclet's (mostly
      // empty) frame, in case a sub-funclet or the GC needs it.
      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr,
                   false, PSPSlotOffset)
          .addReg(Establisher)
          .addMemOperand(MF.getMachineMemOperand(
              NoInfo,
              MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
              SlotSize, Align(SlotSize)));
    }
    SPOrEstablisher = Establisher;
  } else {
    SPOrEstablisher = StackPtr;
  }

  if (IsWin64Prologue && HasFP) {
    // Set RBP to a small fixed offset from RSP. In the funclet case, we base
    // this calculation on the incoming establisher, which holds the value of
    // RSP from the parent frame at the end of the prologue.
    SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes);
    if (SEHFrameOffset)
      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),
                   SPOrEstablisher, false, SEHFrameOffset);
    else
      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr)
          .addReg(SPOrEstablisher);

    // If this is not a funclet, emit the CFI describing our frame pointer.
    if (NeedsWinCFI && !IsFunclet) {
      assert(!NeedsWinFPO && "this setframe incompatible with FPO data");
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
          .addImm(FramePtr)
          .addImm(SEHFrameOffset)
          .setMIFlag(MachineInstr::FrameSetup);
      if (isAsynchronousEHPersonality(Personality))
        MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset;
    }
  } else if (IsFunclet && STI.is32Bit()) {
    // Reset EBP / ESI to something good for funclets.
    MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL);
    // If we're a catch funclet, we can be returned to via catchret. Save ESP
    // into the registration node so that the runtime will restore it for us.
    if (!MBB.isCleanupFuncletEntry()) {
      assert(Personality == EHPersonality::MSVC_CXX);
      Register FrameReg;
      int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;
      int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg).getFixed();
      // ESP is the first field, so no extra displacement is needed.
      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg,
                   false, EHRegOffset)
          .addReg(X86::ESP);
    }
  }

  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
    const MachineInstr &FrameInstr = *MBBI;
    ++MBBI;

    if (NeedsWinCFI) {
      int FI;
      if (Register Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
        if (X86::FR64RegClass.contains(Reg)) {
          int Offset;
          Register IgnoredFrameReg;
          if (IsWin64Prologue && IsFunclet)
            Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg);
          else
            Offset =
                getFrameIndexReference(MF, FI, IgnoredFrameReg).getFixed() +
                SEHFrameOffset;

          HasWinCFI = true;
          assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
          BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
              .addImm(Reg)
              .addImm(Offset)
              .setMIFlag(MachineInstr::FrameSetup);
        }
      }
    }
  }

  if (NeedsWinCFI && HasWinCFI)
    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
        .setMIFlag(MachineInstr::FrameSetup);

  if (FnHasClrFunclet && !IsFunclet) {
    // Save the so-called Initial-SP (i.e. the value of the stack pointer
    // immediately after the prologue) into the PSPSlot so that funclets
    // and the GC can recover it.
    unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
    auto PSPInfo = MachinePointerInfo::getFixedStack(
        MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx);
    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr,
                 false, PSPSlotOffset)
        .addReg(StackPtr)
        .addMemOperand(MF.getMachineMemOperand(
            PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
            SlotSize, Align(SlotSize)));
  }

  // Realign the stack after we spilled callee-saved registers (so that we'll
  // be able to calculate their offsets from the frame pointer).
  // Win64 requires aligning the stack after the prologue.
  if (IsWin64Prologue && TRI->hasStackRealignment(MF)) {
    assert(HasFP && "There should be a frame pointer if stack is realigned.");
    BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign);
  }

  // We already dealt with stack realignment and funclets above.
  if (IsFunclet && STI.is32Bit())
    return;

  // If we need a base pointer, set it up here. It's whatever the value
  // of the stack pointer is at this point. Any variable size objects
  // will be allocated after this, so we can still use the base pointer
  // to reference locals.
  if (TRI->hasBasePointer(MF)) {
    // Update the base pointer with the current stack pointer.
    unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
    BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
        .addReg(SPOrEstablisher)
        .setMIFlag(MachineInstr::FrameSetup);
    if (X86FI->getRestoreBasePointer()) {
      // Stash the value of the base pointer. Saving RSP instead of EBP
      // shortens the dependence chain. Used by SjLj EH.
      unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true,
                   X86FI->getRestoreBasePointerOffset())
          .addReg(SPOrEstablisher)
          .setMIFlag(MachineInstr::FrameSetup);
    }

    if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) {
      // Stash the value of the frame pointer relative to the base pointer
      // for Win32 EH. This supports Win32 EH, which does the inverse of the
      // above: it recovers the frame pointer from the base pointer rather
      // than the other way around.
      unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
      Register UsedReg;
      int Offset =
          getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
              .getFixed();
      assert(UsedReg == BasePtr);
      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset)
          .addReg(FramePtr)
          .setMIFlag(MachineInstr::FrameSetup);
    }
  }
  if (ArgBaseReg.isValid()) {
    // Save the argument base pointer.
    auto *MI = X86FI->getStackPtrSaveMI();
    int FI = MI->getOperand(1).getIndex();
    unsigned MOVmr = Is64Bit ? X86::MOV64mr : X86::MOV32mr;
    // movl %basereg, offset(%ebp)
    addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), FI)
        .addReg(ArgBaseReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
    // Mark the end of the stack pointer adjustment.
    if (!HasFP && NumBytes) {
      // Define the current CFA rule to use the provided offset.
      assert(StackSize);
      BuildCFI(
          MBB, MBBI, DL,
          MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth),
          MachineInstr::FrameSetup);
    }

    // Emit DWARF info specifying the offsets of the callee-saved registers.
    emitCalleeSavedFrameMoves(MBB, MBBI, DL, true);
  }

  // An x86 interrupt handling function cannot assume anything about the
  // direction flag (DF in the EFLAGS register). Clear this flag by creating
  // a "cld" instruction in the prologue of each interrupt handler function.
  //
  // Create the "cld" instruction only in these cases:
  // 1. The interrupt handling function uses any of the "rep" instructions.
  // 2. The interrupt handling function calls another function.
  // 3. There are inline asm blocks, as we do not know what they do.
  //
  // TODO: We should also emit cld if we detect the use of std, but as of now,
  // the compiler does not even emit that instruction or even define it, so in
  // practice, this would only happen with inline asm, which we cover anyway.
  if (Fn.getCallingConv() == CallingConv::X86_INTR) {
    bool NeedsCLD = false;

    for (const MachineBasicBlock &B : MF) {
      for (const MachineInstr &MI : B) {
        if (MI.isCall()) {
          NeedsCLD = true;
          break;
        }

        if (isOpcodeRep(MI.getOpcode())) {
          NeedsCLD = true;
          break;
        }

        if (MI.isInlineAsm()) {
          // TODO: Parse asm for rep instructions or call sites?
          // For now, let's play it safe and emit a cld instruction
          // just in case.
          NeedsCLD = true;
          break;
        }
      }
    }

    if (NeedsCLD) {
      BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
          .setMIFlag(MachineInstr::FrameSetup);
    }
  }

  // At this point we know if the function has WinCFI or not.
  MF.setHasWinCFI(HasWinCFI);
}

bool X86FrameLowering::canUseLEAForSPInEpilogue(
    const MachineFunction &MF) const {
  // We can't use LEA instructions for adjusting the stack pointer if we
  // don't have a frame pointer in the Win64 ABI. Only ADD instructions may
  // be used to deallocate the stack.
  // This means that we can use LEA for SP in two situations:
  // 1. We *aren't* using the Win64 ABI which means we are free to use LEA.
  // 2. We *have* a frame pointer which means we are permitted to use LEA.
  return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF);
}

static bool isFuncletReturnInstr(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case X86::CATCHRET:
  case X86::CLEANUPRET:
    return true;
  default:
    return false;
  }
  llvm_unreachable("impossible");
}

// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the
// stack. It holds a pointer to the bottom of the root function frame. The
// establisher frame pointer passed to a nested funclet may point to the
// (mostly empty) frame of its parent funclet, but it will need to find
// the frame of the root function to access locals. To facilitate this,
// every funclet copies the pointer to the bottom of the root function
// frame into a PSPSym slot in its own (mostly empty) stack frame. Using the
// same offset for the PSPSym in the root function frame that's used in the
// funclets' frames allows each funclet to dynamically accept any ancestor
// frame as its establisher argument (the runtime doesn't guarantee the
// immediate parent for some reason lost to history), and also allows the GC,
// which uses the PSPSym for some bookkeeping, to find it in any funclet's
// frame with only a single offset reported for the entire method.
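//
// Illustratively: if the root function stores its Initial-SP at [SP + K]
// right after the prologue, every funclet reserves the same [SP + K] slot in
// its own frame, so a single offset K serves the entire method.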
unsigned
X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
  const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();
  Register SPReg;
  int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg,
                                              /*IgnoreSPUpdates*/ true)
                   .getFixed();
  assert(Offset >= 0 && SPReg == TRI->getStackRegister());
  return static_cast<unsigned>(Offset);
}

unsigned
X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  // This is the size of the pushed CSRs.
  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
  // This is the size of the callee-saved XMMs.
  const auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
  unsigned XMMSize =
      WinEHXMMSlotInfo.size() * TRI->getSpillSize(X86::VR128RegClass);
  // This is the amount of stack a funclet needs to allocate.
  unsigned UsedSize;
  EHPersonality Personality =
      classifyEHPersonality(MF.getFunction().getPersonalityFn());
  if (Personality == EHPersonality::CoreCLR) {
    // CLR funclets need to hold enough space to include the PSPSym, at the
    // same offset from the stack pointer (immediately after the prologue) as
    // it resides at in the main function.
    UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize;
  } else {
    // Other funclets just need enough stack for outgoing call arguments.
    UsedSize = MF.getFrameInfo().getMaxCallFrameSize();
  }
  // RBP is not included in the callee saved register block. After pushing
  // RBP, everything is 16 byte aligned. Everything we allocate before an
  // outgoing call must also be 16 byte aligned.
  unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlign());
  // Subtract out the size of the callee saved registers. This is how much
  // stack each funclet will allocate.
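  //
  // For example (illustrative): with CSSize == 8 (one pushed GPR),
  // UsedSize == 32 of outgoing argument space, and no XMM slots:
  //   alignTo(8 + 32, 16) - 8 == 48 - 8 == 40 bytes allocated per funclet.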
  return FrameSizeMinusRBP + XMMSize - CSSize;
}

static bool isTailCallOpcode(unsigned Opc) {
  return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||
         Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 ||
         Opc == X86::TCRETURNdi64 || Opc == X86::TCRETURNmi64;
}

void X86FrameLowering::emitEpilogue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  MachineBasicBlock::iterator Terminator = MBB.getFirstTerminator();
  MachineBasicBlock::iterator MBBI = Terminator;
  DebugLoc DL;
  if (MBBI != MBB.end())
    DL = MBBI->getDebugLoc();
  // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
  const bool Is64BitILP32 = STI.isTarget64BitILP32();
  Register FramePtr = TRI->getFrameRegister(MF);
  Register MachineFramePtr =
      Is64BitILP32 ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;

  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
  bool NeedsWin64CFI =
      IsWin64Prologue && MF.getFunction().needsUnwindTableEntry();
  bool IsFunclet = MBBI == MBB.end() ? false : isFuncletReturnInstr(*MBBI);

  // Get the number of bytes to allocate from the FrameInfo.
  uint64_t StackSize = MFI.getStackSize();
  uint64_t MaxAlign = calculateMaxStackAlign(MF);
  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
  unsigned TailCallArgReserveSize = -X86FI->getTCReturnAddrDelta();
  bool HasFP = hasFP(MF);
  uint64_t NumBytes = 0;

  bool NeedsDwarfCFI = (!MF.getTarget().getTargetTriple().isOSDarwin() &&
                        !MF.getTarget().getTargetTriple().isOSWindows()) &&
                       MF.needsFrameMoves();

  Register ArgBaseReg;
  if (auto *MI = X86FI->getStackPtrSaveMI()) {
    unsigned Opc = X86::LEA32r;
    Register StackReg = X86::ESP;
    ArgBaseReg = MI->getOperand(0).getReg();
    if (STI.is64Bit()) {
      Opc = X86::LEA64r;
      StackReg = X86::RSP;
    }
    // leal -4(%basereg), %esp
    // .cfi_def_cfa %esp, 4
    BuildMI(MBB, MBBI, DL, TII.get(Opc), StackReg)
        .addUse(ArgBaseReg)
        .addImm(1)
        .addUse(X86::NoRegister)
        .addImm(-(int64_t)SlotSize)
        .addUse(X86::NoRegister)
        .setMIFlag(MachineInstr::FrameDestroy);
    if (NeedsDwarfCFI) {
      unsigned DwarfStackPtr = TRI->getDwarfRegNum(StackReg, true);
      BuildCFI(MBB, MBBI, DL,
               MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize),
               MachineInstr::FrameDestroy);
      --MBBI;
    }
    --MBBI;
  }

  if (IsFunclet) {
    assert(HasFP && "EH funclets without FP not yet implemented");
    NumBytes = getWinEHFuncletFrameSize(MF);
  } else if (HasFP) {
    // Calculate the required stack adjustment.
    uint64_t FrameSize = StackSize - SlotSize;
    NumBytes = FrameSize - CSSize - TailCallArgReserveSize;

    // Callee-saved registers were pushed onto the stack before the stack
    // was realigned.
    if (TRI->hasStackRealignment(MF) && !IsWin64Prologue)
      NumBytes = alignTo(FrameSize, MaxAlign);
  } else {
    NumBytes = StackSize - CSSize - TailCallArgReserveSize;
  }
  uint64_t SEHStackAllocAmt = NumBytes;

  // AfterPop is the position to insert .cfi_restore.
  MachineBasicBlock::iterator AfterPop = MBBI;
  if (HasFP) {
    if (X86FI->hasSwiftAsyncContext()) {
      // Discard the context.
      int Offset = 16 + mergeSPUpdates(MBB, MBBI, true);
      emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);
    }
    // Pop EBP.
    BuildMI(MBB, MBBI, DL,
            TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())),
            MachineFramePtr)
        .setMIFlag(MachineInstr::FrameDestroy);

    // We need to reset FP to its untagged state on return. Bit 60 is
    // currently used to show the presence of an extended frame.
    if (X86FI->hasSwiftAsyncContext()) {
      BuildMI(MBB, MBBI, DL, TII.get(X86::BTR64ri8), MachineFramePtr)
          .addUse(MachineFramePtr)
          .addImm(60)
          .setMIFlag(MachineInstr::FrameDestroy);
    }

    if (NeedsDwarfCFI) {
      if (!ArgBaseReg.isValid()) {
        unsigned DwarfStackPtr =
            TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
        BuildCFI(MBB, MBBI, DL,
                 MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize),
                 MachineInstr::FrameDestroy);
      }
      if (!MBB.succ_empty() && !MBB.isReturnBlock()) {
        unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
        BuildCFI(MBB, AfterPop, DL,
                 MCCFIInstruction::createRestore(nullptr, DwarfFramePtr),
                 MachineInstr::FrameDestroy);
        --MBBI;
        --AfterPop;
      }
      --MBBI;
    }
  }

  MachineBasicBlock::iterator FirstCSPop = MBBI;
  // Skip the callee-saved pop instructions.
  while (MBBI != MBB.begin()) {
    MachineBasicBlock::iterator PI = std::prev(MBBI);
    unsigned Opc = PI->getOpcode();

    if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
      if (!PI->getFlag(MachineInstr::FrameDestroy) ||
          (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::BTR64ri8 &&
           Opc != X86::ADD64ri32 && Opc != X86::POPP64r && Opc != X86::POP2 &&
           Opc != X86::POP2P && Opc != X86::LEA64r))
        break;
      FirstCSPop = PI;
    }

    --MBBI;
  }
  if (ArgBaseReg.isValid()) {
    // Restore the argument base pointer.
    auto *MI = X86FI->getStackPtrSaveMI();
    int FI = MI->getOperand(1).getIndex();
    unsigned MOVrm = Is64Bit ? X86::MOV64rm : X86::MOV32rm;
    // movl offset(%ebp), %basereg
    addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(MOVrm), ArgBaseReg), FI)
        .setMIFlag(MachineInstr::FrameDestroy);
  }
  MBBI = FirstCSPop;

  if (IsFunclet && Terminator->getOpcode() == X86::CATCHRET)
    emitCatchRetReturnValue(MBB, FirstCSPop, &*Terminator);

  if (MBBI != MBB.end())
    DL = MBBI->getDebugLoc();
  // If there is an ADD32ri or SUB32ri of ESP immediately before this
  // instruction, merge the two instructions.
  if (NumBytes || MFI.hasVarSizedObjects())
    NumBytes += mergeSPUpdates(MBB, MBBI, true);

  // If dynamic alloca is used, then reset esp to point to the last
  // callee-saved slot before popping them off! The same applies when the
  // stack was realigned. Don't do this if this was a funclet epilogue, since
  // the funclets will not do realignment or dynamic stack allocation.
  if (((TRI->hasStackRealignment(MF)) || MFI.hasVarSizedObjects()) &&
      !IsFunclet) {
    if (TRI->hasStackRealignment(MF))
      MBBI = FirstCSPop;
    unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt);
    uint64_t LEAAmount =
        IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize;

    if (X86FI->hasSwiftAsyncContext())
      LEAAmount -= 16;

    // There are only two legal forms of epilogue:
    // - add SEHAllocationSize, %rsp
    // - lea SEHAllocationSize(%FramePtr), %rsp
    //
    // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence.
    // However, we may use this sequence if we have a frame pointer because
    // the effects of the prologue can safely be undone.
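    //
    // For example (illustrative), with CSSize == 16 on a non-Win64 target
    // this emits:
    //   leaq -16(%rbp), %rsp
    // which leaves RSP pointing at the lowest callee-saved slot, ready for
    // the pops that follow.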
    if (LEAAmount != 0) {
      unsigned Opc = getLEArOpcode(Uses64BitFramePtr);
      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), FramePtr,
                   false, LEAAmount);
      --MBBI;
    } else {
      unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr);
      BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr).addReg(FramePtr);
      --MBBI;
    }
  } else if (NumBytes) {
    // Adjust stack pointer back: ESP += numbytes.
    emitSPUpdate(MBB, MBBI, DL, NumBytes, /*InEpilogue=*/true);
    if (!HasFP && NeedsDwarfCFI) {
      // Define the current CFA rule to use the provided offset.
      BuildCFI(MBB, MBBI, DL,
               MCCFIInstruction::cfiDefCfaOffset(
                   nullptr, CSSize + TailCallArgReserveSize + SlotSize),
               MachineInstr::FrameDestroy);
    }
    --MBBI;
  }

  // The Windows unwinder will not invoke a function's exception handler if
  // the IP is either in the prologue or in the epilogue. This behavior
  // causes a problem when a call immediately precedes an epilogue, because
  // the return address points into the epilogue. To cope with that, we
  // insert an epilogue marker here, then replace it with a 'nop' if it ends
  // up immediately after a CALL in the final emitted code.
  if (NeedsWin64CFI && MF.hasWinCFI())
    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));

  if (!HasFP && NeedsDwarfCFI) {
    MBBI = FirstCSPop;
    int64_t Offset = -CSSize - SlotSize;
    // Mark the callee-saved pop instructions.
    // Define the current CFA rule to use the provided offset.
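    // For example (illustrative): with CSSize == 16 on x86-64
    // (SlotSize == 8), the two pops emit .cfi_def_cfa_offset 16 and then
    // .cfi_def_cfa_offset 8.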
    while (MBBI != MBB.end()) {
      MachineBasicBlock::iterator PI = MBBI;
      unsigned Opc = PI->getOpcode();
      ++MBBI;
      if (Opc == X86::POP32r || Opc == X86::POP64r || Opc == X86::POPP64r ||
          Opc == X86::POP2 || Opc == X86::POP2P) {
        Offset += SlotSize;
        // Compared to pop, pop2 introduces more stack offset (one more
        // register).
        if (Opc == X86::POP2 || Opc == X86::POP2P)
          Offset += SlotSize;
        BuildCFI(MBB, MBBI, DL,
                 MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset),
                 MachineInstr::FrameDestroy);
      }
    }
  }

  // Emit DWARF info specifying the restores of the callee-saved registers.
  // For an epilogue with the return inside it, or another block without
  // successors, there is no need to generate .cfi_restore for the
  // callee-saved registers.
  if (NeedsDwarfCFI && !MBB.succ_empty())
    emitCalleeSavedFrameMoves(MBB, AfterPop, DL, false);

  if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
    // Add the return addr area delta back since we are not tail calling.
    int Offset = -1 * X86FI->getTCReturnAddrDelta();
    assert(Offset >= 0 && "TCDelta should never be positive");
    if (Offset) {
      // Check for a possible merge with the preceding ADD instruction.
      Offset += mergeSPUpdates(MBB, Terminator, true);
      emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
    }
  }

  // Emit tilerelease for AMX kernels.
  if (X86FI->hasVirtualTileReg())
    BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
}

StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
                                                     int FI,
                                                     Register &FrameReg) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  bool IsFixed = MFI.isFixedObjectIndex(FI);
  // We can't calculate the offset from the frame pointer if the stack is
  // realigned, so enforce usage of the stack/base pointer. The base pointer
  // is used when we have dynamic allocas in addition to dynamic realignment.
  if (TRI->hasBasePointer(MF))
    FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getBaseRegister();
  else if (TRI->hasStackRealignment(MF))
    FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getStackRegister();
  else
    FrameReg = TRI->getFrameRegister(MF);

  // Offset will hold the offset from the stack pointer at function entry to
  // the object.
  // We need to factor in additional offsets applied during the prologue to
  // the frame, base, and stack pointer depending on which is used.
  int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
  uint64_t StackSize = MFI.getStackSize();
  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
  int64_t FPDelta = 0;

  // In an x86 interrupt, remove the offset we added to account for the
  // return address from any stack object allocated in the caller's frame.
  // Interrupts do not have a standard return address. Fixed objects in the
  // current frame, such as SSE register spills, should not get this
  // treatment.
  if (MF.getFunction().getCallingConv() == CallingConv::X86_INTR &&
      Offset >= 0) {
    Offset += getOffsetOfLocalArea();
  }

  if (IsWin64Prologue) {
    assert(!MFI.hasCalls() || (StackSize % 16) == 8);

    // Calculate the required stack adjustment.
    uint64_t FrameSize = StackSize - SlotSize;
    // If required, include space for the extra hidden slot for stashing the
    // base pointer.
    if (X86FI->getRestoreBasePointer())
      FrameSize += SlotSize;
    uint64_t NumBytes = FrameSize - CSSize;

    uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes);
    if (FI && FI == X86FI->getFAIndex())
      return StackOffset::getFixed(-SEHFrameOffset);

    // FPDelta is the offset from the "traditional" FP location of the old
    // base pointer followed by the return address and the location required
    // by the restricted Win64 prologue.
    // Add FPDelta to all offsets below that go through the frame pointer.
    FPDelta = FrameSize - SEHFrameOffset;
    assert((!MFI.hasCalls() || (FPDelta % 16) == 0) &&
           "FPDelta isn't aligned per the Win64 ABI!");
  }

  if (FrameReg == TRI->getFramePtr()) {
    // Skip the saved EBP/RBP.
    Offset += SlotSize;

    // Account for the restricted Windows prologue.
    Offset += FPDelta;

    // Skip the RETADDR move area.
    int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
    if (TailCallReturnAddrDelta < 0)
      Offset -= TailCallReturnAddrDelta;

    return StackOffset::getFixed(Offset);
  }

  // FrameReg is either the stack pointer or a base pointer. But the base is
  // located at the end of the statically known StackSize, so the distinction
  // doesn't really matter.
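  //
  // For example (illustrative): getObjectOffset == -24 with a local area
  // offset of -8 gives Offset == -16; with StackSize == 40 the object ends
  // up at 24(%rsp) relative to the post-prologue stack pointer.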
2679 if (TRI->hasStackRealignment(MF) || TRI->hasBasePointer(MF))
2680 assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
2681 return StackOffset::getFixed(Fixed: Offset + StackSize);
2682}
2683
2684int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
2685 Register &FrameReg) const {
2686 const MachineFrameInfo &MFI = MF.getFrameInfo();
2687 const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
2688 const auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
2689 const auto it = WinEHXMMSlotInfo.find(Val: FI);
2690
2691 if (it == WinEHXMMSlotInfo.end())
2692 return getFrameIndexReference(MF, FI, FrameReg).getFixed();
2693
2694 FrameReg = TRI->getStackRegister();
2695 return alignDown(Value: MFI.getMaxCallFrameSize(), Align: getStackAlign().value()) +
2696 it->second;
2697}
2698
2699StackOffset
2700X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, int FI,
2701 Register &FrameReg,
2702 int Adjustment) const {
2703 const MachineFrameInfo &MFI = MF.getFrameInfo();
2704 FrameReg = TRI->getStackRegister();
2705 return StackOffset::getFixed(Fixed: MFI.getObjectOffset(ObjectIdx: FI) -
2706 getOffsetOfLocalArea() + Adjustment);
2707}
2708
2709StackOffset
2710X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
2711 int FI, Register &FrameReg,
2712 bool IgnoreSPUpdates) const {
2713
2714 const MachineFrameInfo &MFI = MF.getFrameInfo();
2715 // Does not include any dynamic realign.
2716 const uint64_t StackSize = MFI.getStackSize();
2717 // LLVM arranges the stack as follows:
2718 // ...
2719 // ARG2
2720 // ARG1
2721 // RETADDR
2722 // PUSH RBP <-- RBP points here
2723 // PUSH CSRs
2724 // ~~~~~~~ <-- possible stack realignment (non-win64)
2725 // ...
2726 // STACK OBJECTS
2727 // ... <-- RSP after prologue points here
2728 // ~~~~~~~ <-- possible stack realignment (win64)
2729 //
2730 // if (hasVarSizedObjects()):
2731 // ... <-- "base pointer" (ESI/RBX) points here
2732 // DYNAMIC ALLOCAS
2733 // ... <-- RSP points here
2734 //
2735 // Case 1: In the simple case of no stack realignment and no dynamic
2736 // allocas, both "fixed" stack objects (arguments and CSRs) are addressable
2737 // with fixed offsets from RSP.
2738 //
2739 // Case 2: In the case of stack realignment with no dynamic allocas, fixed
2740 // stack objects are addressed with RBP and regular stack objects with RSP.
2741 //
2742 // Case 3: In the case of dynamic allocas and stack realignment, RSP is used
2743 // to address stack arguments for outgoing calls and nothing else. The "base
2744 // pointer" points to local variables, and RBP points to fixed objects.
2745 //
2746 // In cases 2 and 3, we can only answer for non-fixed stack objects, and the
2747 // answer we give is relative to the SP after the prologue, and not the
2748 // SP in the middle of the function.
2749
2750 if (MFI.isFixedObjectIndex(ObjectIdx: FI) && TRI->hasStackRealignment(MF) &&
2751 !STI.isTargetWin64())
2752 return getFrameIndexReference(MF, FI, FrameReg);
2753
2754 // If !hasReservedCallFrame the function might have SP adjustement in the
2755 // body. So, even though the offset is statically known, it depends on where
2756 // we are in the function.
2757 if (!IgnoreSPUpdates && !hasReservedCallFrame(MF))
2758 return getFrameIndexReference(MF, FI, FrameReg);
2759
2760 // We don't handle tail calls, and shouldn't be seeing them either.
2761 assert(MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta() >= 0 &&
2762 "we don't handle this case!");
2763
2764 // This is how the math works out:
2765 //
2766 // %rsp grows (i.e. gets lower) left to right. Each box below is
2767 // one word (eight bytes). Obj0 is the stack slot we're trying to
2768 // get to.
2769 //
2770 // ----------------------------------
2771 // | BP | Obj0 | Obj1 | ... | ObjN |
2772 // ----------------------------------
2773 // ^ ^ ^ ^
2774 // A B C E
2775 //
2776 // A is the incoming stack pointer.
2777 // (B - A) is the local area offset (-8 for x86-64) [1]
2778 // (C - A) is the Offset returned by MFI.getObjectOffset for Obj0 [2]
2779 //
2780 // |(E - B)| is the StackSize (absolute value, positive). For a
2781 // stack that grown down, this works out to be (B - E). [3]
2782 //
2783 // E is also the value of %rsp after stack has been set up, and we
2784 // want (C - E) -- the value we can add to %rsp to get to Obj0. Now
2785 // (C - E) == (C - A) - (B - A) + (B - E)
2786 // { Using [1], [2] and [3] above }
2787 // == getObjectOffset - LocalAreaOffset + StackSize
2788
2789 return getFrameIndexReferenceSP(MF, FI, FrameReg, Adjustment: StackSize);
2790}
2791
2792bool X86FrameLowering::assignCalleeSavedSpillSlots(
2793 MachineFunction &MF, const TargetRegisterInfo *TRI,
2794 std::vector<CalleeSavedInfo> &CSI) const {
2795 MachineFrameInfo &MFI = MF.getFrameInfo();
2796 X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
2797
2798 unsigned CalleeSavedFrameSize = 0;
2799 unsigned XMMCalleeSavedFrameSize = 0;
2800 auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
2801 int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
2802
2803 int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
2804
2805 if (TailCallReturnAddrDelta < 0) {
2806 // create RETURNADDR area
2807 // arg
2808 // arg
2809 // RETADDR
2810 // { ...
2811 // RETADDR area
2812 // ...
2813 // }
2814 // [EBP]
2815 MFI.CreateFixedObject(Size: -TailCallReturnAddrDelta,
2816 SPOffset: TailCallReturnAddrDelta - SlotSize, IsImmutable: true);
2817 }
2818
2819 // Spill the BasePtr if it's used.
2820 if (this->TRI->hasBasePointer(MF)) {
2821 // Allocate a spill slot for EBP if we have a base pointer and EH funclets.
2822 if (MF.hasEHFunclets()) {
2823 int FI = MFI.CreateSpillStackObject(Size: SlotSize, Alignment: Align(SlotSize));
2824 X86FI->setHasSEHFramePtrSave(true);
2825 X86FI->setSEHFramePtrSaveIndex(FI);
2826 }
2827 }
2828
2829 if (hasFP(MF)) {
2830 // emitPrologue always spills frame register the first thing.
2831 SpillSlotOffset -= SlotSize;
2832 MFI.CreateFixedSpillStackObject(Size: SlotSize, SPOffset: SpillSlotOffset);
2833
2834 // The async context lives directly before the frame pointer, and we
2835 // allocate a second slot to preserve stack alignment.
2836 if (X86FI->hasSwiftAsyncContext()) {
2837 SpillSlotOffset -= SlotSize;
2838 MFI.CreateFixedSpillStackObject(Size: SlotSize, SPOffset: SpillSlotOffset);
2839 SpillSlotOffset -= SlotSize;
2840 }
2841
2842 // Since emitPrologue and emitEpilogue will handle spilling and restoring of
2843 // the frame register, we can delete it from CSI list and not have to worry
2844 // about avoiding it later.
2845 Register FPReg = TRI->getFrameRegister(MF);
2846 for (unsigned i = 0; i < CSI.size(); ++i) {
2847 if (TRI->regsOverlap(RegA: CSI[i].getReg(), RegB: FPReg)) {
2848 CSI.erase(position: CSI.begin() + i);
2849 break;
2850 }
2851 }
2852 }

  // Strategy:
  // 1. Use push2 when
  //      a) the number of CSRs is > 1, if no padding is needed, or
  //      b) the number of CSRs is > 2, if padding is needed.
  // 2. When the number of CSR pushes is odd:
  //      a. Start using push2 from the 1st push if the stack is 16B aligned.
  //      b. Start using push2 from the 2nd push if the stack is not 16B
  //         aligned.
  // 3. When the number of CSR pushes is even, start using push2 from the 1st
  //    push and make the stack 16B aligned before the pushes (see the example
  //    below).
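  //
  // As an illustrative example (not tied to any specific function): with
  // three CSR pushes and a 16B-aligned slot offset, the prologue/epilogue
  // pair may look like
  //   push2  %r15, %r14     ; one 16B-aligned store for two CSRs
  //   pushq  %rbx           ; the odd register falls back to a plain push
  //   ...
  //   popq   %rbx
  //   pop2   %r14, %r15
  // PUSH2/POP2 require %rsp to be 16B aligned at the instruction, which is
  // what the padding and alignment checks above and below arrange.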
  unsigned NumRegsForPush2 = 0;
  if (STI.hasPush2Pop2()) {
    unsigned NumCSGPR = llvm::count_if(CSI, [](const CalleeSavedInfo &I) {
      return X86::GR64RegClass.contains(I.getReg());
    });
    bool NeedPadding = (SpillSlotOffset % 16 != 0) && (NumCSGPR % 2 == 0);
    bool UsePush2Pop2 = NeedPadding ? NumCSGPR > 2 : NumCSGPR > 1;
    X86FI->setPadForPush2Pop2(NeedPadding && UsePush2Pop2);
    NumRegsForPush2 = UsePush2Pop2 ? alignDown(NumCSGPR, 2) : 0;
    if (X86FI->padForPush2Pop2()) {
      SpillSlotOffset -= SlotSize;
      MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
    }
  }

  // Assign slots for GPRs. It increases frame size.
  for (CalleeSavedInfo &I : llvm::reverse(CSI)) {
    Register Reg = I.getReg();

    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
      continue;

    // A CSR is a candidate for push2/pop2 when its slot offset is 16B aligned
    // or when the candidate list currently holds an odd number of registers.
    if (X86FI->getNumCandidatesForPush2Pop2() < NumRegsForPush2 &&
        (SpillSlotOffset % 16 == 0 ||
         X86FI->getNumCandidatesForPush2Pop2() % 2))
      X86FI->addCandidateForPush2Pop2(Reg);

    SpillSlotOffset -= SlotSize;
    CalleeSavedFrameSize += SlotSize;

    int SlotIndex = MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
    I.setFrameIdx(SlotIndex);
  }

  // Adjust the spill slot offset now that the exact callee-saved frame size
  // is known.
  if (X86FI->getRestoreBasePointer()) {
    SpillSlotOffset -= SlotSize;
    CalleeSavedFrameSize += SlotSize;

    MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
    // TODO: would saving the slot index be better?
    X86FI->setRestoreBasePointer(CalleeSavedFrameSize);
  }
  assert(X86FI->getNumCandidatesForPush2Pop2() % 2 == 0 &&
         "Expect even candidates for push2/pop2");
  if (X86FI->getNumCandidatesForPush2Pop2())
    ++NumFunctionUsingPush2Pop2;
  X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
  MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize);

  // Assign slots for XMMs.
  for (CalleeSavedInfo &I : llvm::reverse(CSI)) {
    Register Reg = I.getReg();
    if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
      continue;

    // If this is a k-register, make sure we look it up via the largest legal
    // type.
    MVT VT = MVT::Other;
    if (X86::VK16RegClass.contains(Reg))
      VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
    unsigned Size = TRI->getSpillSize(*RC);
    Align Alignment = TRI->getSpillAlign(*RC);
    // Ensure alignment.
    assert(SpillSlotOffset < 0 &&
           "SpillSlotOffset should always be < 0 on X86");
    SpillSlotOffset = -alignTo(-SpillSlotOffset, Alignment);

    // Spill into slot.
    SpillSlotOffset -= Size;
    int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
    I.setFrameIdx(SlotIndex);
    MFI.ensureMaxAlignment(Alignment);

    // Save the start offset and size of XMM in the stack frame for funclets.
    if (X86::VR128RegClass.contains(Reg)) {
      WinEHXMMSlotInfo[SlotIndex] = XMMCalleeSavedFrameSize;
      XMMCalleeSavedFrameSize += Size;
    }
  }

  return true;
}

bool X86FrameLowering::spillCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  DebugLoc DL = MBB.findDebugLoc(MI);

  // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI
  // for us, and there are no XMM CSRs on Win32.
  if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows())
    return true;

  // Push GPRs. It increases frame size.
  const MachineFunction &MF = *MBB.getParent();
  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  if (X86FI->padForPush2Pop2())
    emitSPUpdate(MBB, MI, DL, -(int64_t)SlotSize, /*InEpilogue=*/false);

  // Update the live-in set of the basic block and decide whether we can add a
  // kill flag to the use.
  auto UpdateLiveInCheckCanKill = [&](Register Reg) {
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    // Do not set a kill flag on values that are also marked as live-in. This
    // happens with the @llvm.returnaddress intrinsic and with arguments
    // passed in callee saved registers.
    // Omitting the kill flags is conservatively correct even if the live-in
    // is not used after all.
    if (MRI.isLiveIn(Reg))
      return false;
    MBB.addLiveIn(Reg);
    // Check if any subregister or alias is live-in.
    for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg)
      if (MRI.isLiveIn(*AReg))
        return false;
    return true;
  };
  auto UpdateLiveInGetKillRegState = [&](Register Reg) {
    return getKillRegState(UpdateLiveInCheckCanKill(Reg));
  };

  for (auto RI = CSI.rbegin(), RE = CSI.rend(); RI != RE; ++RI) {
    Register Reg = RI->getReg();
    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
      continue;

    if (X86FI->isCandidateForPush2Pop2(Reg)) {
      Register Reg2 = (++RI)->getReg();
      BuildMI(MBB, MI, DL, TII.get(getPUSH2Opcode(STI)))
          .addReg(Reg, UpdateLiveInGetKillRegState(Reg))
          .addReg(Reg2, UpdateLiveInGetKillRegState(Reg2))
          .setMIFlag(MachineInstr::FrameSetup);
    } else {
      BuildMI(MBB, MI, DL, TII.get(getPUSHOpcode(STI)))
          .addReg(Reg, UpdateLiveInGetKillRegState(Reg))
          .setMIFlag(MachineInstr::FrameSetup);
    }
  }

  if (X86FI->getRestoreBasePointer()) {
    unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
    Register BaseReg = this->TRI->getBaseRegister();
    BuildMI(MBB, MI, DL, TII.get(Opc))
        .addReg(BaseReg, getKillRegState(true))
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // Spill the XMM regs. X86 has no push/pop instructions for XMM registers,
  // so store them to the stack frame instead.
  for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {
    Register Reg = I.getReg();
    if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
      continue;

    // If this is a k-register, make sure we look it up via the largest legal
    // type.
    MVT VT = MVT::Other;
    if (X86::VK16RegClass.contains(Reg))
      VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;

    // Add the callee-saved register as live-in. It's killed at the spill.
    MBB.addLiveIn(Reg);
    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);

    TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI,
                            Register());
    --MI;
    MI->setFlag(MachineInstr::FrameSetup);
    ++MI;
  }

  return true;
}

void X86FrameLowering::emitCatchRetReturnValue(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               MachineInstr *CatchRet) const {
  // SEH shouldn't use catchret.
  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
             MBB.getParent()->getFunction().getPersonalityFn())) &&
         "SEH should not use CATCHRET");
  const DebugLoc &DL = CatchRet->getDebugLoc();
  MachineBasicBlock *CatchRetTarget = CatchRet->getOperand(0).getMBB();

  // Fill EAX/RAX with the address of the target block.
  if (STI.is64Bit()) {
    // LEA64r CatchRetTarget(%rip), %rax
    BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), X86::RAX)
        .addReg(X86::RIP)
        .addImm(0)
        .addReg(0)
        .addMBB(CatchRetTarget)
        .addReg(0);
  } else {
    // MOV32ri $CatchRetTarget, %eax
    BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
        .addMBB(CatchRetTarget);
  }

  // Record that we've taken the address of CatchRetTarget and no longer just
  // reference it in a terminator.
  CatchRetTarget->setMachineBlockAddressTaken();
}

bool X86FrameLowering::restoreCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  if (CSI.empty())
    return false;

  if (MI != MBB.end() && isFuncletReturnInstr(*MI) && STI.isOSWindows()) {
    // Don't restore CSRs in 32-bit EH funclets. Matches
    // spillCalleeSavedRegisters.
    if (STI.is32Bit())
      return true;
    // Don't restore CSRs before an SEH catchret. SEH except blocks do not form
    // funclets. emitEpilogue transforms these to normal jumps.
    if (MI->getOpcode() == X86::CATCHRET) {
      const Function &F = MBB.getParent()->getFunction();
      bool IsSEH = isAsynchronousEHPersonality(
          classifyEHPersonality(F.getPersonalityFn()));
      if (IsSEH)
        return true;
    }
  }

  DebugLoc DL = MBB.findDebugLoc(MI);

  // Reload XMMs from the stack frame.
  for (const CalleeSavedInfo &I : CSI) {
    Register Reg = I.getReg();
    if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
      continue;

    // If this is a k-register, make sure we look it up via the largest legal
    // type.
    MVT VT = MVT::Other;
    if (X86::VK16RegClass.contains(Reg))
      VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
    TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, TRI,
                             Register());
  }

  // Restore the base pointer register from its spill slot.
  MachineFunction &MF = *MBB.getParent();
  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  if (X86FI->getRestoreBasePointer()) {
    unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
    Register BaseReg = this->TRI->getBaseRegister();
    BuildMI(MBB, MI, DL, TII.get(Opc), BaseReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  // POP GPRs.
  for (auto I = CSI.begin(), E = CSI.end(); I != E; ++I) {
    Register Reg = I->getReg();
    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
      continue;

    if (X86FI->isCandidateForPush2Pop2(Reg))
      BuildMI(MBB, MI, DL, TII.get(getPOP2Opcode(STI)), Reg)
          .addReg((++I)->getReg(), RegState::Define)
          .setMIFlag(MachineInstr::FrameDestroy);
    else
      BuildMI(MBB, MI, DL, TII.get(getPOPOpcode(STI)), Reg)
          .setMIFlag(MachineInstr::FrameDestroy);
  }
  if (X86FI->padForPush2Pop2())
    emitSPUpdate(MBB, MI, DL, SlotSize, /*InEpilogue=*/true);

  return true;
}

void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
                                            BitVector &SavedRegs,
                                            RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);

  // Spill the BasePtr if it's used.
  if (TRI->hasBasePointer(MF)) {
    Register BasePtr = TRI->getBaseRegister();
    if (STI.isTarget64BitILP32())
      BasePtr = getX86SubSuperRegister(BasePtr, 64);
    SavedRegs.set(BasePtr);
  }
}

static bool HasNestArgument(const MachineFunction *MF) {
  const Function &F = MF->getFunction();
  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
       I++) {
    if (I->hasNestAttr() && !I->use_empty())
      return true;
  }
  return false;
}

/// GetScratchRegister - Get a temp register for performing work in the
/// segmented stack and the Erlang/HiPE stack prologues. Depending on the
/// platform and the properties of the function, either one or two registers
/// will be needed. Set primary to true for the first register, false for the
/// second.
static unsigned GetScratchRegister(bool Is64Bit, bool IsLP64,
                                   const MachineFunction &MF, bool Primary) {
  CallingConv::ID CallingConvention = MF.getFunction().getCallingConv();

  // Erlang stuff.
  if (CallingConvention == CallingConv::HiPE) {
    if (Is64Bit)
      return Primary ? X86::R14 : X86::R13;
    else
      return Primary ? X86::EBX : X86::EDI;
  }

  if (Is64Bit) {
    if (IsLP64)
      return Primary ? X86::R11 : X86::R12;
    else
      return Primary ? X86::R11D : X86::R12D;
  }

  bool IsNested = HasNestArgument(&MF);

  if (CallingConvention == CallingConv::X86_FastCall ||
      CallingConvention == CallingConv::Fast ||
      CallingConvention == CallingConv::Tail) {
    if (IsNested)
      report_fatal_error("Segmented stacks do not support fastcall with "
                         "nested functions.");
    return Primary ? X86::EAX : X86::ECX;
  }
  if (IsNested)
    return Primary ? X86::EDX : X86::EAX;
  return Primary ? X86::ECX : X86::EAX;
}

// The stack limit in the TCB is set to this many bytes above the actual stack
// limit.
static const uint64_t kSplitStackAvailable = 256;

void X86FrameLowering::adjustForSegmentedStacks(
    MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  uint64_t StackSize;
  unsigned TlsReg, TlsOffset;
  DebugLoc DL;

  // To support shrink-wrapping we would need to insert the new blocks
  // at the right place and update the branches to PrologueMBB.
  assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");

  unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
  assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
         "Scratch register is live-in");

  if (MF.getFunction().isVarArg())
    report_fatal_error("Segmented stacks do not support vararg functions.");
  if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
      !STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
      !STI.isTargetDragonFly())
    report_fatal_error("Segmented stacks not supported on this platform.");

  // Eventually StackSize will be calculated by a link-time pass, which will
  // also decide whether checking code needs to be injected into this
  // particular prologue.
  StackSize = MFI.getStackSize();

  if (!MFI.needsSplitStackProlog())
    return;

  MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  bool IsNested = false;

  // We need to know if the function has a nest argument only in 64-bit mode.
  if (Is64Bit)
    IsNested = HasNestArgument(&MF);

  // The MOV R10, RAX needs to be in a different block, since the RET we emit
  // in allocMBB needs to be the last (terminating) instruction.

  for (const auto &LI : PrologueMBB.liveins()) {
    allocMBB->addLiveIn(LI);
    checkMBB->addLiveIn(LI);
  }

  if (IsNested)
    allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);

  MF.push_front(allocMBB);
  MF.push_front(checkMBB);

  // When the frame size is less than 256 we just compare the stack
  // boundary directly to the value of the stack pointer, per gcc.
  bool CompareStackPointer = StackSize < kSplitStackAvailable;
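
  // For illustration (Linux x86-64, LP64; registers and TLS offsets as
  // selected below), the emitted check takes one of two shapes:
  //   cmpq %fs:0x70, %rsp            ; small frame: compare SP directly
  // or
  //   leaq -StackSize(%rsp), %r11
  //   cmpq %fs:0x70, %r11            ; large frame: compare SP - StackSize
  // followed by a conditional jump over the allocation block.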

  // Read the limit of the current stacklet from the stack_guard location.
  if (Is64Bit) {
    if (STI.isTargetLinux()) {
      TlsReg = X86::FS;
      TlsOffset = IsLP64 ? 0x70 : 0x40;
    } else if (STI.isTargetDarwin()) {
      TlsReg = X86::GS;
      TlsOffset = 0x60 + 90 * 8; // See pthread_machdep.h. Steal TLS slot 90.
    } else if (STI.isTargetWin64()) {
      TlsReg = X86::GS;
      TlsOffset = 0x28; // pvArbitrary, reserved for application use
    } else if (STI.isTargetFreeBSD()) {
      TlsReg = X86::FS;
      TlsOffset = 0x18;
    } else if (STI.isTargetDragonFly()) {
      TlsReg = X86::FS;
      TlsOffset = 0x20; // use tls_tcb.tcb_segstack
    } else {
      report_fatal_error("Segmented stacks not supported on this platform.");
    }

    if (CompareStackPointer)
      ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
    else
      BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r),
              ScratchReg)
          .addReg(X86::RSP)
          .addImm(1)
          .addReg(0)
          .addImm(-StackSize)
          .addReg(0);

    BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm))
        .addReg(ScratchReg)
        .addReg(0)
        .addImm(1)
        .addReg(0)
        .addImm(TlsOffset)
        .addReg(TlsReg);
  } else {
    if (STI.isTargetLinux()) {
      TlsReg = X86::GS;
      TlsOffset = 0x30;
    } else if (STI.isTargetDarwin()) {
      TlsReg = X86::GS;
      TlsOffset = 0x48 + 90 * 4;
    } else if (STI.isTargetWin32()) {
      TlsReg = X86::FS;
      TlsOffset = 0x14; // pvArbitrary, reserved for application use
    } else if (STI.isTargetDragonFly()) {
      TlsReg = X86::FS;
      TlsOffset = 0x10; // use tls_tcb.tcb_segstack
    } else if (STI.isTargetFreeBSD()) {
      report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
    } else {
      report_fatal_error("Segmented stacks not supported on this platform.");
    }

    if (CompareStackPointer)
      ScratchReg = X86::ESP;
    else
      BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg)
          .addReg(X86::ESP)
          .addImm(1)
          .addReg(0)
          .addImm(-StackSize)
          .addReg(0);

    if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
        STI.isTargetDragonFly()) {
      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
          .addReg(ScratchReg)
          .addReg(0)
          .addImm(0)
          .addReg(0)
          .addImm(TlsOffset)
          .addReg(TlsReg);
    } else if (STI.isTargetDarwin()) {
      // TlsOffset doesn't fit into a mod r/m byte so we need an extra
      // register.
      unsigned ScratchReg2;
      bool SaveScratch2;
      if (CompareStackPointer) {
        // The primary scratch register is available for holding the TLS
        // offset.
        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
        SaveScratch2 = false;
      } else {
        // Need to use a second register to hold the TLS offset.
        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);

        // Unfortunately, with fastcc the second scratch register may hold an
        // argument.
        SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
      }

      // If Scratch2 is live-in then it needs to be saved.
      assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
             "Scratch register is live-in and not saved");

      if (SaveScratch2)
        BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
            .addReg(ScratchReg2, RegState::Kill);

      BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
          .addImm(TlsOffset);
      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
          .addReg(ScratchReg)
          .addReg(ScratchReg2)
          .addImm(1)
          .addReg(0)
          .addImm(0)
          .addReg(TlsReg);

      if (SaveScratch2)
        BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
    }
  }

  // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
  // It jumps to normal execution of the function body.
  BuildMI(checkMBB, DL, TII.get(X86::JCC_1))
      .addMBB(&PrologueMBB)
      .addImm(X86::COND_A);

  // On 32-bit we first push the argument size and then the frame size. On
  // 64-bit, we pass the stack frame size in r10 and the argument size in r11
  // (see the example below).
  if (Is64Bit) {
    // Functions with nested arguments use R10, so it needs to be saved across
    // the call to _morestack.

    const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
    const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
    const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
    const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;

    if (IsNested)
      BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);

    BuildMI(allocMBB, DL, TII.get(getMOVriOpcode(IsLP64, StackSize)), Reg10)
        .addImm(StackSize);
    BuildMI(allocMBB, DL,
            TII.get(getMOVriOpcode(IsLP64, X86FI->getArgumentStackSize())),
            Reg11)
        .addImm(X86FI->getArgumentStackSize());
  } else {
    BuildMI(allocMBB, DL, TII.get(X86::PUSH32i))
        .addImm(X86FI->getArgumentStackSize());
    BuildMI(allocMBB, DL, TII.get(X86::PUSH32i)).addImm(StackSize);
  }
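
  // For example (illustrative, x86-64 LP64, StackSize == 4096, 16 bytes of
  // on-stack arguments), allocMBB ends up containing roughly:
  //   movq  $4096, %r10          ; stack frame size
  //   movq  $16, %r11            ; argument size
  //   callq __morestack
  //   retq                       ; expanded from MORESTACK_RET below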

  // __morestack is in libgcc
  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
    // Under the large code model, we cannot assume that __morestack lives
    // within 2^31 bytes of the call site, so we cannot use pc-relative
    // addressing. We cannot perform the call via a temporary register,
    // as the rax register may be used to store the static chain, and all
    // other suitable registers may be either callee-save or used for
    // parameter passing. We cannot use the stack at this point either
    // because __morestack manipulates the stack directly.
    //
    // To avoid these issues, perform an indirect call via a read-only memory
    // location containing the address.
    //
    // This solution is not perfect, as it assumes that the .rodata section
    // is laid out within 2^31 bytes of each function body, but this seems
    // to be sufficient for JIT.
    // FIXME: Add retpoline support and remove the error here.
    if (STI.useIndirectThunkCalls())
      report_fatal_error("Emitting morestack calls on 64-bit with the large "
                         "code model and thunks not yet implemented.");
    BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
        .addReg(X86::RIP)
        .addImm(0)
        .addReg(0)
        .addExternalSymbol("__morestack_addr")
        .addReg(0);
  } else {
    if (Is64Bit)
      BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
          .addExternalSymbol("__morestack");
    else
      BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
          .addExternalSymbol("__morestack");
  }

  if (IsNested)
    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
  else
    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));

  allocMBB->addSuccessor(&PrologueMBB);

  checkMBB->addSuccessor(allocMBB, BranchProbability::getZero());
  checkMBB->addSuccessor(&PrologueMBB, BranchProbability::getOne());

#ifdef EXPENSIVE_CHECKS
  MF.verify();
#endif
}

/// Lookup an ERTS parameter in the !hipe.literals named metadata node.
/// HiPE provides Erlang Runtime System-internal parameters, such as PCB
/// offsets to fields it needs, through a named metadata node "hipe.literals"
/// containing name-value pairs.
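///
/// For illustration, the module-level metadata is expected to look roughly
/// like the following (the literal names are the ones queried below; the
/// offset values are made-up placeholders):
///   !hipe.literals = !{!0, !1}
///   !0 = !{!"P_NSP_LIMIT", i32 96}
///   !1 = !{!"AMD64_LEAF_WORDS", i32 24}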
static unsigned getHiPELiteral(NamedMDNode *HiPELiteralsMD,
                               const StringRef LiteralName) {
  for (int i = 0, e = HiPELiteralsMD->getNumOperands(); i != e; ++i) {
    MDNode *Node = HiPELiteralsMD->getOperand(i);
    if (Node->getNumOperands() != 2)
      continue;
    MDString *NodeName = dyn_cast<MDString>(Node->getOperand(0));
    ValueAsMetadata *NodeVal = dyn_cast<ValueAsMetadata>(Node->getOperand(1));
    if (!NodeName || !NodeVal)
      continue;
    ConstantInt *ValConst = dyn_cast_or_null<ConstantInt>(NodeVal->getValue());
    if (ValConst && NodeName->getString() == LiteralName) {
      return ValConst->getZExtValue();
    }
  }

  report_fatal_error("HiPE literal " + LiteralName +
                     " required but not provided");
}

// Return true if there are no non-ehpad successors to MBB and there are no
// non-meta instructions between MBBI and MBB.end().
static bool blockEndIsUnreachable(const MachineBasicBlock &MBB,
                                  MachineBasicBlock::const_iterator MBBI) {
  return llvm::all_of(
             MBB.successors(),
             [](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) &&
         std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) {
           return MI.isMetaInstruction();
         });
}

/// Erlang programs may need a special prologue to handle the stack size they
/// might need at runtime. That is because Erlang/OTP does not implement a C
/// stack but uses a custom hybrid stack/heap architecture. (For more
/// information see Eric Stenman's Ph.D. thesis:
/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
///
/// CheckStack:
///   temp0 = sp - MaxStack
///   if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
/// OldStart:
///   ...
/// IncStack:
///   call inc_stack   # doubles the stack space
///   temp0 = sp - MaxStack
///   if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
void X86FrameLowering::adjustForHiPEPrologue(
    MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  DebugLoc DL;

  // To support shrink-wrapping we would need to insert the new blocks
  // at the right place and update the branches to PrologueMBB.
  assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");

  // HiPE-specific values
  NamedMDNode *HiPELiteralsMD =
      MF.getMMI().getModule()->getNamedMetadata("hipe.literals");
  if (!HiPELiteralsMD)
    report_fatal_error(
        "Can't generate HiPE prologue without runtime parameters");
  const unsigned HipeLeafWords = getHiPELiteral(
      HiPELiteralsMD, Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS");
  const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
  const unsigned Guaranteed = HipeLeafWords * SlotSize;
  unsigned CallerStkArity = MF.getFunction().arg_size() > CCRegisteredArgs
                                ? MF.getFunction().arg_size() - CCRegisteredArgs
                                : 0;
  unsigned MaxStack = MFI.getStackSize() + CallerStkArity * SlotSize + SlotSize;

  assert(STI.isTargetLinux() &&
         "HiPE prologue is only supported on Linux operating systems.");

  // Compute the largest caller's frame that is needed to fit the callees'
  // frames. This 'MaxStack' is computed from:
  //
  // a) the fixed frame size, which is the space needed for all spilled temps,
  // b) outgoing on-stack parameter areas, and
  // c) the minimum stack space this function needs to make available for the
  //    functions it calls (a tunable ABI property).
  if (MFI.hasCalls()) {
    unsigned MoreStackForCalls = 0;

    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        if (!MI.isCall())
          continue;

        // Get the callee operand.
        const MachineOperand &MO = MI.getOperand(0);

        // Only take account of global function calls (no closures etc.).
        if (!MO.isGlobal())
          continue;

        const Function *F = dyn_cast<Function>(MO.getGlobal());
        if (!F)
          continue;

        // Do not update 'MaxStack' for primitive and built-in functions
        // (encoded with names either starting with "erlang."/"bif_" or not
        // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
        // "_", such as the BIF "suspend_0") as they are executed on another
        // stack.
        if (F->getName().contains("erlang.") ||
            F->getName().contains("bif_") ||
            F->getName().find_first_of("._") == StringRef::npos)
          continue;

        unsigned CalleeStkArity = F->arg_size() > CCRegisteredArgs
                                      ? F->arg_size() - CCRegisteredArgs
                                      : 0;
        if (HipeLeafWords - 1 > CalleeStkArity)
          MoreStackForCalls =
              std::max(MoreStackForCalls,
                       (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
      }
    }
    MaxStack += MoreStackForCalls;
  }

  // If the stack frame needed is larger than the guaranteed amount, then
  // runtime checks and calls to the "inc_stack_0" BIF should be inserted in
  // the assembly prologue.
  if (MaxStack > Guaranteed) {
    MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
    MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();

    for (const auto &LI : PrologueMBB.liveins()) {
      stackCheckMBB->addLiveIn(LI);
      incStackMBB->addLiveIn(LI);
    }

    MF.push_front(incStackMBB);
    MF.push_front(stackCheckMBB);

    unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
    unsigned LEAop, CMPop, CALLop;
    SPLimitOffset = getHiPELiteral(HiPELiteralsMD, "P_NSP_LIMIT");
    if (Is64Bit) {
      SPReg = X86::RSP;
      PReg = X86::RBP;
      LEAop = X86::LEA64r;
      CMPop = X86::CMP64rm;
      CALLop = X86::CALL64pcrel32;
    } else {
      SPReg = X86::ESP;
      PReg = X86::EBP;
      LEAop = X86::LEA32r;
      CMPop = X86::CMP32rm;
      CALLop = X86::CALLpcrel32;
    }

    ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
    assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
           "HiPE prologue scratch register is live-in");

    // Create new MBB for StackCheck:
    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), SPReg,
                 false, -MaxStack);
    // SPLimitOffset is in a fixed heap location (pointed to by BP).
    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)).addReg(ScratchReg),
                 PReg, false, SPLimitOffset);
    BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1))
        .addMBB(&PrologueMBB)
        .addImm(X86::COND_AE);

    // Create new MBB for IncStack:
    BuildMI(incStackMBB, DL, TII.get(CALLop)).addExternalSymbol("inc_stack_0");
    addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg), SPReg,
                 false, -MaxStack);
    addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)).addReg(ScratchReg),
                 PReg, false, SPLimitOffset);
    BuildMI(incStackMBB, DL, TII.get(X86::JCC_1))
        .addMBB(incStackMBB)
        .addImm(X86::COND_LE);

    stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100});
    stackCheckMBB->addSuccessor(incStackMBB, {1, 100});
    incStackMBB->addSuccessor(&PrologueMBB, {99, 100});
    incStackMBB->addSuccessor(incStackMBB, {1, 100});
  }
#ifdef EXPENSIVE_CHECKS
  MF.verify();
#endif
}

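// For example (illustrative, 32-bit, SlotSize == 4): with -Oz, an 8-byte
// post-call SP adjustment
//   calll _f
//   addl  $8, %esp
// can be shrunk by the function below to
//   calll _f
//   popl  %ecx
//   popl  %ecx
// reusing a register the call already clobbers; each popl encodes in one
// byte versus three for the addl.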
bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MBBI,
                                           const DebugLoc &DL,
                                           int Offset) const {
  if (Offset <= 0)
    return false;

  if (Offset % SlotSize)
    return false;

  int NumPops = Offset / SlotSize;
  // This is only worth it if we have at most 2 pops.
  if (NumPops != 1 && NumPops != 2)
    return false;

  // Handle only the trivial case where the adjustment directly follows
  // a call. This is the most common one, anyway.
  if (MBBI == MBB.begin())
    return false;
  MachineBasicBlock::iterator Prev = std::prev(MBBI);
  if (!Prev->isCall() || !Prev->getOperand(1).isRegMask())
    return false;

  unsigned Regs[2];
  unsigned FoundRegs = 0;

  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const MachineOperand &RegMask = Prev->getOperand(1);

  auto &RegClass =
      Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass;
  // Try to find up to NumPops free registers.
  for (auto Candidate : RegClass) {
    // Poor man's liveness:
    // Since we're immediately after a call, any register that is clobbered
    // by the call and not defined by it can be considered dead.
    if (!RegMask.clobbersPhysReg(Candidate))
      continue;

    // Don't clobber reserved registers.
    if (MRI.isReserved(Candidate))
      continue;

    bool IsDef = false;
    for (const MachineOperand &MO : Prev->implicit_operands()) {
      if (MO.isReg() && MO.isDef() &&
          TRI->isSuperOrSubRegisterEq(MO.getReg(), Candidate)) {
        IsDef = true;
        break;
      }
    }

    if (IsDef)
      continue;

    Regs[FoundRegs++] = Candidate;
    if (FoundRegs == (unsigned)NumPops)
      break;
  }

  if (FoundRegs == 0)
    return false;

  // If we found only one free register, but need two, reuse the same one
  // twice.
  while (FoundRegs < (unsigned)NumPops)
    Regs[FoundRegs++] = Regs[0];

  for (int i = 0; i < NumPops; ++i)
    BuildMI(MBB, MBBI, DL, TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r),
            Regs[i]);

  return true;
}

MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  bool reserveCallFrame = hasReservedCallFrame(MF);
  unsigned Opcode = I->getOpcode();
  bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
  DebugLoc DL = I->getDebugLoc(); // copy DebugLoc as I will be erased.
  uint64_t Amount = TII.getFrameSize(*I);
  uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0;
  I = MBB.erase(I);
  auto InsertPos = skipDebugInstructionsForward(I, MBB.end());

  // Try to avoid emitting dead SP adjustments if the block end is unreachable,
  // typically because the function is marked noreturn (abort, throw,
  // assert_fail, etc).
  if (isDestroy && blockEndIsUnreachable(MBB, I))
    return I;

  if (!reserveCallFrame) {
    // If the stack pointer can be changed after the prologue, turn the
    // adjcallstackdown instruction into a 'sub ESP, <amt>' and the
    // adjcallstackup instruction into an 'add ESP, <amt>'.
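    //
    // For example (illustrative, 32-bit, no reserved call frame):
    //   ADJCALLSTACKDOWN 8   ->   subl $8, %esp
    //   calll _f                  calll _f
    //   ADJCALLSTACKUP 8     ->   addl $8, %esp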

    // We need to keep the stack aligned properly. To do this, we round the
    // amount of space needed for the outgoing arguments up to the next
    // alignment boundary.
    Amount = alignTo(Amount, getStackAlign());

    const Function &F = MF.getFunction();
    bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
    bool DwarfCFI = !WindowsCFI && MF.needsFrameMoves();

    // If we have any exception handlers in this function, and we adjust
    // the SP before calls, we may need to indicate this to the unwinder
    // using GNU_ARGS_SIZE. Note that this may be necessary even when
    // Amount == 0, because the preceding function may have set a non-0
    // GNU_ARGS_SIZE.
    // TODO: We don't need to reset this between subsequent functions,
    // if it didn't change.
    bool HasDwarfEHHandlers = !WindowsCFI && !MF.getLandingPads().empty();

    if (HasDwarfEHHandlers && !isDestroy &&
        MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
      BuildCFI(MBB, InsertPos, DL,
               MCCFIInstruction::createGnuArgsSize(nullptr, Amount));

    if (Amount == 0)
      return I;

    // Factor out the amount that gets handled inside the sequence
    // (pushes of arguments for frame setup, callee pops for frame destroy).
    Amount -= InternalAmt;

    // TODO: This is needed only if we require precise CFA.
    // If this is a callee-pop calling convention, emit a CFA adjust for
    // the amount the callee popped.
    if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
      BuildCFI(MBB, InsertPos, DL,
               MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));

    // Add Amount to SP to destroy a frame, or subtract it to set one up.
    int64_t StackAdjustment = isDestroy ? Amount : -Amount;

    if (StackAdjustment) {
      // Merge with any previous or following adjustment instruction. Note:
      // the instructions merged here do not have CFI, so their stack
      // adjustments do not feed into CfaAdjustment.
      StackAdjustment += mergeSPUpdates(MBB, InsertPos, true);
      StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);

      if (StackAdjustment) {
        if (!(F.hasMinSize() &&
              adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment)))
          BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment,
                               /*InEpilogue=*/false);
      }
    }

    if (DwarfCFI && !hasFP(MF)) {
      // If we don't have an FP, but need to generate unwind information,
      // we need to set the correct CFA offset after the stack adjustment.
      // How much we adjust the CFA offset depends on whether we're emitting
      // CFI only for EH purposes or for debugging. EH only requires the CFA
      // offset to be correct at each call site, while for debugging we want
      // it to be more precise.

      int64_t CfaAdjustment = -StackAdjustment;
      // TODO: When not using precise CFA, we also need to adjust for the
      // InternalAmt here.
      if (CfaAdjustment) {
        BuildCFI(
            MBB, InsertPos, DL,
            MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment));
      }
    }

    return I;
  }

  if (InternalAmt) {
    MachineBasicBlock::iterator CI = I;
    MachineBasicBlock::iterator B = MBB.begin();
    while (CI != B && !std::prev(CI)->isCall())
      --CI;
    BuildStackAdjustment(MBB, CI, DL, -InternalAmt, /*InEpilogue=*/false);
  }

  return I;
}

bool X86FrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
  assert(MBB.getParent() && "Block is not attached to a function!");
  const MachineFunction &MF = *MBB.getParent();
  if (!MBB.isLiveIn(X86::EFLAGS))
    return true;

  // If stack probes have to loop inline or call, that will clobber EFLAGS.
  // FIXME: we could allow cases that will use
  // emitStackProbeInlineGenericBlock.
  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
  const X86TargetLowering &TLI = *STI.getTargetLowering();
  if (TLI.hasInlineStackProbe(MF) || TLI.hasStackProbeSymbol(MF))
    return false;

  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  return !TRI->hasStackRealignment(MF) && !X86FI->hasSwiftAsyncContext();
}

bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
  assert(MBB.getParent() && "Block is not attached to a function!");

  // Win64 has strict requirements in terms of epilogues and we are
  // not taking a chance at messing with them.
  // I.e., unless this block is already an exit block, we can't use
  // it as an epilogue.
  if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock())
    return false;

  // The Swift async context epilogue has a BTR instruction that clobbers
  // parts of EFLAGS.
  const MachineFunction &MF = *MBB.getParent();
  if (MF.getInfo<X86MachineFunctionInfo>()->hasSwiftAsyncContext())
    return !flagsNeedToBePreservedBeforeTheTerminators(MBB);

  if (canUseLEAForSPInEpilogue(*MBB.getParent()))
    return true;

  // If we cannot use LEA to adjust SP, we may need to use ADD, which
  // clobbers EFLAGS. Check that we do not need to preserve it; otherwise,
  // conservatively assume it is not safe to insert the epilogue here.
  return !flagsNeedToBePreservedBeforeTheTerminators(MBB);
}

bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
  // If we may need to emit frameless compact unwind information, give
  // up as this is currently broken: PR25614.
  bool CompactUnwind =
      MF.getMMI().getContext().getObjectFileInfo()->getCompactUnwindSection() !=
      nullptr;
  return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF) ||
          !CompactUnwind) &&
         // The lowering of segmented stack and HiPE only support entry
         // blocks as prologue blocks: PR26107. This limitation may be
         // lifted if we fix:
         // - adjustForSegmentedStacks
         // - adjustForHiPEPrologue
         MF.getFunction().getCallingConv() != CallingConv::HiPE &&
         !MF.shouldSplitStack();
}

MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    const DebugLoc &DL, bool RestoreSP) const {
  assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env");
  assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32");
  assert(STI.is32Bit() && !Uses64BitFramePtr &&
         "restoring EBP/ESI on non-32-bit target");

  MachineFunction &MF = *MBB.getParent();
  Register FramePtr = TRI->getFrameRegister(MF);
  Register BasePtr = TRI->getBaseRegister();
  WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  // FIXME: Don't set FrameSetup flag in catchret case.

  int FI = FuncInfo.EHRegNodeFrameIndex;
  int EHRegSize = MFI.getObjectSize(FI);

  if (RestoreSP) {
    // MOV32rm -EHRegSize(%ebp), %esp
    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP),
                 X86::EBP, true, -EHRegSize)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  Register UsedReg;
  int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed();
  int EndOffset = -EHRegOffset - EHRegSize;
  FuncInfo.EHRegNodeEndOffset = EndOffset;

  if (UsedReg == FramePtr) {
    // ADD $offset, %ebp
    unsigned ADDri = getADDriOpcode(false);
    BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr)
        .addReg(FramePtr)
        .addImm(EndOffset)
        .setMIFlag(MachineInstr::FrameSetup)
        ->getOperand(3)
        .setIsDead();
    assert(EndOffset >= 0 &&
           "end of registration object above normal EBP position!");
  } else if (UsedReg == BasePtr) {
    // LEA offset(%ebp), %esi
    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr),
                 FramePtr, false, EndOffset)
        .setMIFlag(MachineInstr::FrameSetup);
    // MOV32rm SavedEBPOffset(%esi), %ebp
    assert(X86FI->getHasSEHFramePtrSave());
    int Offset =
        getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
            .getFixed();
    assert(UsedReg == BasePtr);
    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr),
                 UsedReg, true, Offset)
        .setMIFlag(MachineInstr::FrameSetup);
  } else {
    llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr");
  }
  return MBBI;
}

int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
  return TRI->getSlotSize();
}

Register
X86FrameLowering::getInitialCFARegister(const MachineFunction &MF) const {
  return StackPtr;
}

TargetFrameLowering::DwarfFrameBase
X86FrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
  const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
  Register FrameRegister = RI->getFrameRegister(MF);
  if (getInitialCFARegister(MF) == FrameRegister &&
      MF.getInfo<X86MachineFunctionInfo>()->hasCFIAdjustCfa()) {
    DwarfFrameBase FrameBase;
    FrameBase.Kind = DwarfFrameBase::CFA;
    FrameBase.Location.Offset =
        -MF.getFrameInfo().getStackSize() - getInitialCFAOffset(MF);
    return FrameBase;
  }

  return DwarfFrameBase{DwarfFrameBase::Register, {FrameRegister}};
}

namespace {
// Struct used by orderFrameObjects to help sort the stack objects.
struct X86FrameSortingObject {
  bool IsValid = false;             // true if we care about this Object.
  unsigned ObjectIndex = 0;         // Index of Object into MFI list.
  unsigned ObjectSize = 0;          // Size of Object in bytes.
  Align ObjectAlignment = Align(1); // Alignment of Object in bytes.
  unsigned ObjectNumUses = 0;       // Object static number of uses.
};

// The comparison function we use for std::sort to order our local
// stack symbols. The current algorithm is to use an estimated
// "density". This takes into consideration the size and number of
// uses each object has in order to roughly minimize code size.
// So, for example, an object of size 16B that is referenced 5 times
// will get higher priority than 4 4B objects referenced 1 time each.
// It's not perfect and we may be able to squeeze a few more bytes out of
// it (for example: 0(esp) requires fewer bytes, symbols allocated at the
// fringe end can have special consideration, given their size is less
// important, etc.), but the algorithmic complexity grows too much to be
// worth the extra gains we get. This gets us pretty close.
// The final order leaves us with objects with highest priority going
// at the end of our list.
struct X86FrameSortingComparator {
  inline bool operator()(const X86FrameSortingObject &A,
                         const X86FrameSortingObject &B) const {
    uint64_t DensityAScaled, DensityBScaled;

    // For consistency in our comparison, all invalid objects are placed
    // at the end. This also allows us to stop walking when we hit the
    // first invalid item after it's all sorted.
    if (!A.IsValid)
      return false;
    if (!B.IsValid)
      return true;

    // The density is calculated by doing:
    //   (double)DensityA = A.ObjectNumUses / A.ObjectSize
    //   (double)DensityB = B.ObjectNumUses / B.ObjectSize
    // Since this approach may cause inconsistencies in
    // the floating point <, >, == comparisons, depending on the floating
    // point model with which the compiler was built, we're going
    // to scale both sides by multiplying by
    // A.ObjectSize * B.ObjectSize. This ends up factoring away
    // the division and, with it, the need for any floating point
    // arithmetic.
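    //
    // A worked example (using the numbers from the comment above the
    // struct): A is 16B with 5 uses, B is 4B with 1 use. Then
    //   DensityAScaled = 5 * 4  = 20
    //   DensityBScaled = 1 * 16 = 16
    // so A compares greater, is sorted toward the end of the list, and
    // therefore gets the higher-priority placement.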
    DensityAScaled = static_cast<uint64_t>(A.ObjectNumUses) *
                     static_cast<uint64_t>(B.ObjectSize);
    DensityBScaled = static_cast<uint64_t>(B.ObjectNumUses) *
                     static_cast<uint64_t>(A.ObjectSize);

    // If the two densities are equal, prioritize highest alignment
    // objects. This allows for similar alignment objects
    // to be packed together (given the same density).
    // There's room for improvement here, also, since we can pack
    // similar alignment (different density) objects next to each
    // other to save padding. This will also require further
    // complexity/iterations, and the overall gain isn't worth it,
    // in general. Something to keep in mind, though.
    if (DensityAScaled == DensityBScaled)
      return A.ObjectAlignment < B.ObjectAlignment;

    return DensityAScaled < DensityBScaled;
  }
};
} // namespace

// Order the symbols in the local stack.
// We want to place the local stack objects in some sort of sensible order.
// The heuristic we use is to try and pack them according to static number
// of uses and size of object in order to minimize code size.
void X86FrameLowering::orderFrameObjects(
    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Don't waste time if there's nothing to do.
  if (ObjectsToAllocate.empty())
    return;

  // Create an array of all MFI objects. We won't need all of these
  // objects, but we're going to create a full array of them to make
  // it easier to index into when we're counting "uses" down below.
  // We want to be able to easily/cheaply access an object by simply
  // indexing into it, instead of having to search for it every time.
  std::vector<X86FrameSortingObject> SortingObjects(MFI.getObjectIndexEnd());

  // Walk the objects we care about and mark them as such in our working
  // struct.
  for (auto &Obj : ObjectsToAllocate) {
    SortingObjects[Obj].IsValid = true;
    SortingObjects[Obj].ObjectIndex = Obj;
    SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlign(Obj);
    // Set the size.
    int ObjectSize = MFI.getObjectSize(Obj);
    if (ObjectSize == 0)
      // Variable size. Just use 4.
      SortingObjects[Obj].ObjectSize = 4;
    else
      SortingObjects[Obj].ObjectSize = ObjectSize;
  }

  // Count the number of uses for each object.
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (MI.isDebugInstr())
        continue;
      for (const MachineOperand &MO : MI.operands()) {
        // Check to see if it's a local stack symbol.
        if (!MO.isFI())
          continue;
        int Index = MO.getIndex();
        // Check to see if it falls within our range, and is tagged
        // to require ordering.
        if (Index >= 0 && Index < MFI.getObjectIndexEnd() &&
            SortingObjects[Index].IsValid)
          SortingObjects[Index].ObjectNumUses++;
      }
    }
  }

  // Sort the objects using X86FrameSortingComparator (see its comment for
  // more info).
  llvm::stable_sort(SortingObjects, X86FrameSortingComparator());

  // Now modify the original list to represent the final order that
  // we want. The order will depend on whether we're going to access them
  // from the stack pointer or the frame pointer. For SP, the list should
  // end up with the END containing objects that we want with smaller offsets.
  // For FP, it should be flipped.
  int i = 0;
  for (auto &Obj : SortingObjects) {
    // All invalid items are sorted at the end, so it's safe to stop.
    if (!Obj.IsValid)
      break;
    ObjectsToAllocate[i++] = Obj.ObjectIndex;
  }

  // Flip it if we're accessing off of the FP.
  if (!TRI->hasStackRealignment(MF) && hasFP(MF))
    std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end());
}

unsigned
X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
  // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue.
  unsigned Offset = 16;
  // RBP is immediately pushed.
  Offset += SlotSize;
  // All callee-saved registers are then pushed.
  Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
  // Every funclet allocates enough stack space for the largest outgoing call.
  Offset += getWinEHFuncletFrameSize(MF);
  return Offset;
}

void X86FrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF, RegScavenger *RS) const {
  // Mark the function as not having WinCFI. We will set it back to true in
  // emitPrologue if it gets called and emits CFI.
  MF.setHasWinCFI(false);

  // If we are using Windows x64 CFI, ensure that the stack is always 8 byte
  // aligned. The format doesn't support misaligned stack adjustments.
  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
    MF.getFrameInfo().ensureMaxAlignment(Align(SlotSize));

  // If this function isn't doing Win64-style C++ EH, we don't need to do
  // anything.
  if (STI.is64Bit() && MF.hasEHFunclets() &&
      classifyEHPersonality(MF.getFunction().getPersonalityFn()) ==
          EHPersonality::MSVC_CXX) {
    adjustFrameForMsvcCxxEh(MF);
  }
}

void X86FrameLowering::adjustFrameForMsvcCxxEh(MachineFunction &MF) const {
  // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset
  // relative to RSP after the prologue. Find the offset of the last fixed
  // object, so that we can allocate a slot immediately following it. If there
  // were no fixed objects, use offset -SlotSize, which is immediately after
  // the return address. Fixed objects have negative frame indices.
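  //
  // For example (illustrative offsets): if the lowest fixed object ends at
  // offset -24, UnwindHelp becomes an 8-byte fixed object at offset -32
  // (after 8-byte alignment), and the store emitted at the end of this
  // function initializes it to -2.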
  MachineFrameInfo &MFI = MF.getFrameInfo();
  WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
  int64_t MinFixedObjOffset = -SlotSize;
  for (int I = MFI.getObjectIndexBegin(); I < 0; ++I)
    MinFixedObjOffset = std::min(MinFixedObjOffset, MFI.getObjectOffset(I));

  for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
    for (WinEHHandlerType &H : TBME.HandlerArray) {
      int FrameIndex = H.CatchObj.FrameIndex;
      if (FrameIndex != INT_MAX) {
        // Ensure alignment.
        unsigned Align = MFI.getObjectAlign(FrameIndex).value();
        MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align;
        MinFixedObjOffset -= MFI.getObjectSize(FrameIndex);
        MFI.setObjectOffset(FrameIndex, MinFixedObjOffset);
      }
    }
  }

  // Ensure alignment.
  MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8;
  int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
  int UnwindHelpFI =
      MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*IsImmutable=*/false);
  EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;

  // Store -2 into UnwindHelp on function entry. We have to scan forwards past
  // other frame setup instructions.
  MachineBasicBlock &MBB = MF.front();
  auto MBBI = MBB.begin();
  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
    ++MBBI;

  DebugLoc DL = MBB.findDebugLoc(MBBI);
  addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)),
                    UnwindHelpFI)
      .addImm(-2);
}

void X86FrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();

  if (STI.is32Bit() && MF.hasEHFunclets())
    restoreWinEHStackPointersInParent(MF);
  // The prologue and epilogue have been emitted; the stack-pointer-saving
  // instruction is no longer needed.
  if (MachineInstr *MI = X86FI->getStackPtrSaveMI()) {
    MI->eraseFromParent();
    X86FI->setStackPtrSaveMI(nullptr);
  }
}

void X86FrameLowering::restoreWinEHStackPointersInParent(
    MachineFunction &MF) const {
  // 32-bit functions have to restore stack pointers when control is
  // transferred back to the parent function. These blocks are identified as
  // eh pads that are not funclet entries.
  bool IsSEH = isAsynchronousEHPersonality(
      classifyEHPersonality(MF.getFunction().getPersonalityFn()));
  for (MachineBasicBlock &MBB : MF) {
    bool NeedsRestore = MBB.isEHPad() && !MBB.isEHFuncletEntry();
    if (NeedsRestore)
      restoreWin32EHStackPointers(MBB, MBB.begin(), DebugLoc(),
                                  /*RestoreSP=*/IsSEH);
  }
}