X86SpeculativeLoadHardening.cpp source code [llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp]

1	//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	/// \file
9	///
10	/// Provide a pass which mitigates speculative execution attacks which operate
11	/// by speculating incorrectly past some predicate (a type check, bounds check,
12	/// or other condition) to reach a load with invalid inputs and leak the data
13	/// accessed by that load using a side channel out of the speculative domain.
14	///
15	/// For details on the attacks, see the first variant in both the Project Zero
16	/// writeup and the Spectre paper:
17	/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
18	/// https://spectreattack.com/spectre.pdf
19	///
20	//===----------------------------------------------------------------------===//
21
22	#include "X86.h"
23	#include "X86InstrBuilder.h"
24	#include "X86InstrInfo.h"
25	#include "X86Subtarget.h"
26	#include "llvm/ADT/ArrayRef.h"
27	#include "llvm/ADT/DenseMap.h"
28	#include "llvm/ADT/STLExtras.h"
29	#include "llvm/ADT/SmallPtrSet.h"
30	#include "llvm/ADT/SmallSet.h"
31	#include "llvm/ADT/SmallVector.h"
32	#include "llvm/ADT/SparseBitVector.h"
33	#include "llvm/ADT/Statistic.h"
34	#include "llvm/CodeGen/MachineBasicBlock.h"
35	#include "llvm/CodeGen/MachineConstantPool.h"
36	#include "llvm/CodeGen/MachineFunction.h"
37	#include "llvm/CodeGen/MachineFunctionPass.h"
38	#include "llvm/CodeGen/MachineInstr.h"
39	#include "llvm/CodeGen/MachineInstrBuilder.h"
40	#include "llvm/CodeGen/MachineModuleInfo.h"
41	#include "llvm/CodeGen/MachineOperand.h"
42	#include "llvm/CodeGen/MachineRegisterInfo.h"
43	#include "llvm/CodeGen/MachineSSAUpdater.h"
44	#include "llvm/CodeGen/TargetInstrInfo.h"
45	#include "llvm/CodeGen/TargetRegisterInfo.h"
46	#include "llvm/CodeGen/TargetSchedule.h"
47	#include "llvm/CodeGen/TargetSubtargetInfo.h"
48	#include "llvm/IR/DebugLoc.h"
49	#include "llvm/MC/MCSchedule.h"
50	#include "llvm/Pass.h"
51	#include "llvm/Support/CommandLine.h"
52	#include "llvm/Support/Debug.h"
53	#include "llvm/Support/raw_ostream.h"
54	#include "llvm/Target/TargetMachine.h"
55	#include <algorithm>
56	#include <cassert>
57	#include <iterator>
58	#include <optional>
59	#include <utility>
60
61	using namespace llvm;
62
63	#define PASS_KEY "x86-slh"
64	#define DEBUG_TYPE PASS_KEY
65
66	STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
67	STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
68	STATISTIC(NumAddrRegsHardened,
69	"Number of address mode used registers hardaned");
70	STATISTIC(NumPostLoadRegsHardened,
71	"Number of post-load register values hardened");
72	STATISTIC(NumCallsOrJumpsHardened,
73	"Number of calls or jumps requiring extra hardening");
74	STATISTIC(NumInstsInserted, "Number of instructions inserted");
75	STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
76
77	static cl::opt<bool> EnableSpeculativeLoadHardening(
78	"x86-speculative-load-hardening",
79	cl::desc ("Force enable speculative load hardening"), cl::init(Val: false),
80	cl::Hidden);
81
82	static cl::opt<bool> HardenEdgesWithLFENCE(
83	PASS_KEY "-lfence",
84	cl::desc (
85	"Use LFENCE along each conditional edge to harden against speculative "
86	"loads rather than conditional movs and poisoned pointers."),
87	cl::init(Val: false), cl::Hidden);
88
89	static cl::opt<bool> EnablePostLoadHardening(
90	PASS_KEY "-post-load",
91	cl::desc ("Harden the value loaded after it is loaded by "
92	"flushing the loaded bits to 1. This is hard to do "
93	"in general but can be done easily for GPRs."),
94	cl::init(Val: true), cl::Hidden);
95
96	static cl::opt<bool> FenceCallAndRet(
97	PASS_KEY "-fence-call-and-ret",
98	cl::desc ("Use a full speculation fence to harden both call and ret edges "
99	"rather than a lighter weight mitigation."),
100	cl::init(Val: false), cl::Hidden);
101
102	static cl::opt<bool> HardenInterprocedurally(
103	PASS_KEY "-ip",
104	cl::desc ("Harden interprocedurally by passing our state in and out of "
105	"functions in the high bits of the stack pointer."),
106	cl::init(Val: true), cl::Hidden);
107
108	static cl::opt<bool>
109	HardenLoads(PASS_KEY "-loads",
110	cl::desc ("Sanitize loads from memory. When disable, no "
111	"significant security is provided."),
112	cl::init(Val: true), cl::Hidden);
113
114	static cl::opt<bool> HardenIndirectCallsAndJumps(
115	PASS_KEY "-indirect",
116	cl::desc ("Harden indirect calls and jumps against using speculatively "
117	"stored attacker controlled addresses. This is designed to "
118	"mitigate Spectre v1.2 style attacks."),
119	cl::init(Val: true), cl::Hidden);
120
121	namespace {
122
123	class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
124	public:
125	X86SpeculativeLoadHardeningPass() : MachineFunctionPass (ID) { }
126
127	StringRef getPassName() const override {
128	return "X86 speculative load hardening";
129	}
130	bool runOnMachineFunction(MachineFunction &MF) override;
131	void getAnalysisUsage(AnalysisUsage &AU) const override;
132
133	/// Pass identification, replacement for typeid.
134	static char ID;
135
136	private:
137	/// The information about a block's conditional terminators needed to trace
138	/// our predicate state through the exiting edges.
139	struct BlockCondInfo {
140	MachineBasicBlock *MBB;
141
142	// We mostly have one conditional branch, and in extremely rare cases have
143	// two. Three and more are so rare as to be unimportant for compile time.
144	SmallVector<MachineInstr *, `2`> CondBrs;
145
146	MachineInstr *UncondBr;
147	};
148
149	/// Manages the predicate state traced through the program.
150	struct PredState {
151	unsigned InitialReg = `0`;
152	unsigned PoisonReg = `0`;
153
154	const TargetRegisterClass *RC;
155	MachineSSAUpdater SSA;
156
157	PredState(MachineFunction &MF, const TargetRegisterClass *RC)
158	: RC(RC), SSA (MF) {}
159	};
160
161	const X86Subtarget Subtarget = nullptr*;
162	MachineRegisterInfo MRI = nullptr*;
163	const X86InstrInfo TII = nullptr*;
164	const TargetRegisterInfo TRI = nullptr*;
165
166	std::optional<PredState> PS;
167
168	void hardenEdgesWithLFENCE(MachineFunction &MF);
169
170	SmallVector<BlockCondInfo, `16`> collectBlockCondInfo(MachineFunction &MF);
171
172	SmallVector<MachineInstr *, `16`>
173	tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);
174
175	void unfoldCallAndJumpLoads(MachineFunction &MF);
176
177	SmallVector<MachineInstr *, `16`>
178	tracePredStateThroughIndirectBranches(MachineFunction &MF);
179
180	void tracePredStateThroughBlocksAndHarden(MachineFunction &MF);
181
182	unsigned saveEFLAGS(MachineBasicBlock &MBB,
183	MachineBasicBlock::iterator InsertPt,
184	const DebugLoc &Loc);
185	void restoreEFLAGS(MachineBasicBlock &MBB,
186	MachineBasicBlock::iterator InsertPt, const DebugLoc &Loc,
187	Register Reg);
188
189	void mergePredStateIntoSP(MachineBasicBlock &MBB,
190	MachineBasicBlock::iterator InsertPt,
191	const DebugLoc &Loc, unsigned PredStateReg);
192	unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
193	MachineBasicBlock::iterator InsertPt,
194	const DebugLoc &Loc);
195
196	void
197	hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
198	MachineOperand &IndexMO,
199	SmallDenseMap<unsigned, unsigned, `32`> &AddrRegToHardenedReg);
200	MachineInstr *
201	sinkPostLoadHardenedInst(MachineInstr &MI,
202	SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
203	bool canHardenRegister(Register Reg);
204	unsigned hardenValueInRegister(Register Reg, MachineBasicBlock &MBB,
205	MachineBasicBlock::iterator InsertPt,
206	const DebugLoc &Loc);
207	unsigned hardenPostLoad(MachineInstr &MI);
208	void hardenReturnInstr(MachineInstr &MI);
209	void tracePredStateThroughCall(MachineInstr &MI);
210	void hardenIndirectCallOrJumpInstr(
211	MachineInstr &MI,
212	SmallDenseMap<unsigned, unsigned, `32`> &AddrRegToHardenedReg);
213	};
214
215	} // end anonymous namespace
216
217	char X86SpeculativeLoadHardeningPass::ID = `0`;
218
219	void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
220	AnalysisUsage &AU) const {
221	MachineFunctionPass::getAnalysisUsage(AU);
222	}
223
224	static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
225	MachineBasicBlock &Succ, int SuccCount,
226	MachineInstr Br, MachineInstr &UncondBr,
227	const X86InstrInfo &TII) {
228	assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
229
230	MachineFunction &MF = *MBB.getParent();
231
232	MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
233
234	// We have to insert the new block immediately after the current one as we
235	// don't know what layout-successor relationships the successor has and we
236	// may not be able to (and generally don't want to) try to fix those up.
237	MF.insert(MBBI: std::next(x: MachineFunction::iterator (&MBB)), MBB: &NewMBB);
238
239	// Update the branch instruction if necessary.
240	if (Br) {
241	assert(Br->getOperand(`0`).getMBB() == &Succ &&
242	"Didn't start with the right target!");
243	Br->getOperand(i: `0`).setMBB(&NewMBB);
244
245	// If this successor was reached through a branch rather than fallthrough,
246	// we might have broken* fallthrough and so need to inject a new*
247	// unconditional branch.
248	if (!UncondBr) {
249	MachineBasicBlock &OldLayoutSucc =
250	*std::next(x: MachineFunction::iterator (&NewMBB));
251	assert(MBB.isSuccessor(&OldLayoutSucc) &&
252	"Without an unconditional branch, the old layout successor should "
253	"be an actual successor!");
254	auto BrBuilder =
255	BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
256	// Update the unconditional branch now that we've added one.
257	UncondBr = &*BrBuilder;
258	}
259
260	// Insert unconditional "jump Succ" instruction in the new block if
261	// necessary.
262	if (!NewMBB.isLayoutSuccessor(MBB: &Succ)) {
263	SmallVector<MachineOperand, `4`> Cond;
264	TII.insertBranch(MBB&: NewMBB, TBB: &Succ, FBB: nullptr, Cond, DL: Br->getDebugLoc());
265	}
266	} else {
267	assert(!UncondBr &&
268	"Cannot have a branchless successor and an unconditional branch!");
269	assert(NewMBB.isLayoutSuccessor(&Succ) &&
270	"A non-branch successor must have been a layout successor before "
271	"and now is a layout successor of the new block.");
272	}
273
274	// If this is the only edge to the successor, we can just replace it in the
275	// CFG. Otherwise we need to add a new entry in the CFG for the new
276	// successor.
277	if (SuccCount == `1`) {
278	MBB.replaceSuccessor(Old: &Succ, New: &NewMBB);
279	} else {
280	MBB.splitSuccessor(Old: &Succ, New: &NewMBB);
281	}
282
283	// Hook up the edge from the new basic block to the old successor in the CFG.
284	NewMBB.addSuccessor(Succ: &Succ);
285
286	// Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
287	for (MachineInstr &MI : Succ) {
288	if (!MI.isPHI())
289	break;
290	for (int OpIdx = `1`, NumOps = MI.getNumOperands(); OpIdx < NumOps;
291	OpIdx += `2`) {
292	MachineOperand &OpV = MI.getOperand(i: OpIdx);
293	MachineOperand &OpMBB = MI.getOperand(i: OpIdx + `1`);
294	assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
295	if (OpMBB.getMBB() != &MBB)
296	continue;
297
298	// If this is the last edge to the succesor, just replace MBB in the PHI
299	if (SuccCount == `1`) {
300	OpMBB.setMBB(&NewMBB);
301	break;
302	}
303
304	// Otherwise, append a new pair of operands for the new incoming edge.
305	MI.addOperand(MF, Op: OpV);
306	MI.addOperand(MF, Op: MachineOperand::CreateMBB(MBB: &NewMBB));
307	break;
308	}
309	}
310
311	// Inherit live-ins from the successor
312	for (auto &LI : Succ.liveins())
313	NewMBB.addLiveIn(RegMaskPair: LI);
314
315	LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
316	<< Succ.getName() << "'.\n");
317	return NewMBB;
318	}
319
320	/// Removing duplicate PHI operands to leave the PHI in a canonical and
321	/// predictable form.
322	///
323	/// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
324	/// isn't what you might expect. We may have multiple entries in PHI nodes for
325	/// a single predecessor. This makes CFG-updating extremely complex, so here we
326	/// simplify all PHI nodes to a model even simpler than the IR's model: exactly
327	/// one entry per predecessor, regardless of how many edges there are.
328	static void canonicalizePHIOperands(MachineFunction &MF) {
329	SmallPtrSet<MachineBasicBlock *, `4`> Preds;
330	SmallVector<int, `4`> DupIndices;
331	for (auto &MBB : MF)
332	for (auto &MI : MBB) {
333	if (!MI.isPHI())
334	break;
335
336	// First we scan the operands of the PHI looking for duplicate entries
337	// a particular predecessor. We retain the operand index of each duplicate
338	// entry found.
339	for (int OpIdx = `1`, NumOps = MI.getNumOperands(); OpIdx < NumOps;
340	OpIdx += `2`)
341	if (!Preds.insert(Ptr: MI.getOperand(i: OpIdx + `1`).getMBB()).second)
342	DupIndices.push_back(Elt: OpIdx);
343
344	// Now walk the duplicate indices, removing both the block and value. Note
345	// that these are stored as a vector making this element-wise removal
346	// :w
347	// potentially quadratic.
348	//
349	// FIXME: It is really frustrating that we have to use a quadratic
350	// removal algorithm here. There should be a better way, but the use-def
351	// updates required make that impossible using the public API.
352	//
353	// Note that we have to process these backwards so that we don't
354	// invalidate other indices with each removal.
355	while (!DupIndices.empty()) {
356	int OpIdx = DupIndices.pop_back_val();
357	// Remove both the block and value operand, again in reverse order to
358	// preserve indices.
359	MI.removeOperand(OpNo: OpIdx + `1`);
360	MI.removeOperand(OpNo: OpIdx);
361	}
362
363	Preds.clear();
364	}
365	}
366
367	/// Helper to scan a function for loads vulnerable to misspeculation that we
368	/// want to harden.
369	///
370	/// We use this to avoid making changes to functions where there is nothing we
371	/// need to do to harden against misspeculation.
372	static bool hasVulnerableLoad(MachineFunction &MF) {
373	for (MachineBasicBlock &MBB : MF) {
374	for (MachineInstr &MI : MBB) {
375	// Loads within this basic block after an LFENCE are not at risk of
376	// speculatively executing with invalid predicates from prior control
377	// flow. So break out of this block but continue scanning the function.
378	if (MI.getOpcode() == X86::LFENCE)
379	break;
380
381	// Looking for loads only.
382	if (!MI.mayLoad())
383	continue;
384
385	// An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
386	if (MI.getOpcode() == X86::MFENCE)
387	continue;
388
389	// We found a load.
390	return true;
391	}
392	}
393
394	// No loads found.
395	return false;
396	}
397
398	bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
399	MachineFunction &MF) {
400	LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
401	<< " **********\n");
402
403	// Only run if this pass is forced enabled or we detect the relevant function
404	// attribute requesting SLH.
405	if (!EnableSpeculativeLoadHardening &&
406	!MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
407	return false;
408
409	Subtarget = &MF.getSubtarget<X86Subtarget>();
410	MRI = &MF.getRegInfo();
411	TII = Subtarget->getInstrInfo();
412	TRI = Subtarget->getRegisterInfo();
413
414	// FIXME: Support for 32-bit.
415	PS.emplace(MF, &X86::GR64_NOSPRegClass);
416
417	if (MF.begin() == MF.end())
418	// Nothing to do for a degenerate empty function...
419	return false;
420
421	// We support an alternative hardening technique based on a debug flag.
422	if (HardenEdgesWithLFENCE) {
423	hardenEdgesWithLFENCE(MF);
424	return true;
425	}
426
427	// Create a dummy debug loc to use for all the generated code here.
428	DebugLoc Loc;
429
430	MachineBasicBlock &Entry = *MF.begin();
431	auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(I: Entry.begin());
432
433	// Do a quick scan to see if we have any checkable loads.
434	bool HasVulnerableLoad = hasVulnerableLoad(MF);
435
436	// See if we have any conditional branching blocks that we will need to trace
437	// predicate state through.
438	SmallVector<BlockCondInfo, `16`> Infos = collectBlockCondInfo(MF);
439
440	// If we have no interesting conditions or loads, nothing to do here.
441	if (!HasVulnerableLoad && Infos.empty())
442	return true;
443
444	// The poison value is required to be an all-ones value for many aspects of
445	// this mitigation.
446	const int PoisonVal = -`1`;
447	PS ->PoisonReg = MRI->createVirtualRegister(RegClass: PS ->RC);
448	BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PS ->PoisonReg)
449	.addImm(PoisonVal);
450	++NumInstsInserted;
451
452	// If we have loads being hardened and we've asked for call and ret edges to
453	// get a full fence-based mitigation, inject that fence.
454	if (HasVulnerableLoad && FenceCallAndRet) {
455	// We need to insert an LFENCE at the start of the function to suspend any
456	// incoming misspeculation from the caller. This helps two-fold: the caller
457	// may not have been protected as this code has been, and this code gets to
458	// not take any specific action to protect across calls.
459	// FIXME: We could skip this for functions which unconditionally return
460	// a constant.
461	BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
462	++NumInstsInserted;
463	++NumLFENCEsInserted;
464	}
465
466	// If we guarded the entry with an LFENCE and have no conditionals to protect
467	// in blocks, then we're done.
468	if (FenceCallAndRet && Infos.empty())
469	// We may have changed the function's code at this point to insert fences.
470	return true;
471
472	// For every basic block in the function which can b
473	if (HardenInterprocedurally && !FenceCallAndRet) {
474	// Set up the predicate state by extracting it from the incoming stack
475	// pointer so we pick up any misspeculation in our caller.
476	PS ->InitialReg = extractPredStateFromSP(MBB&: Entry, InsertPt: EntryInsertPt, Loc);
477	} else {
478	// Otherwise, just build the predicate state itself by zeroing a register
479	// as we don't need any initial state.
480	PS ->InitialReg = MRI->createVirtualRegister(RegClass: PS ->RC);
481	Register PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
482	auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
483	PredStateSubReg);
484	++NumInstsInserted;
485	MachineOperand *ZeroEFLAGSDefOp =
486	ZeroI->findRegisterDefOperand(X86::EFLAGS, /TRI=/nullptr);
487	assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
488	"Must have an implicit def of EFLAGS!");
489	ZeroEFLAGSDefOp->setIsDead(true);
490	BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
491	PS ->InitialReg)
492	.addImm(`0`)
493	.addReg(PredStateSubReg)
494	.addImm(X86::sub_32bit);
495	}
496
497	// We're going to need to trace predicate state throughout the function's
498	// CFG. Prepare for this by setting up our initial state of PHIs with unique
499	// predecessor entries and all the initial predicate state.
500	canonicalizePHIOperands(MF);
501
502	// Track the updated values in an SSA updater to rewrite into SSA form at the
503	// end.
504	PS ->SSA.Initialize(V: PS ->InitialReg);
505	PS ->SSA.AddAvailableValue(BB: &Entry, V: PS ->InitialReg);
506
507	// Trace through the CFG.
508	auto CMovs = tracePredStateThroughCFG(MF, Infos);
509
510	// We may also enter basic blocks in this function via exception handling
511	// control flow. Here, if we are hardening interprocedurally, we need to
512	// re-capture the predicate state from the throwing code. In the Itanium ABI,
513	// the throw will always look like a call to __cxa_throw and will have the
514	// predicate state in the stack pointer, so extract fresh predicate state from
515	// the stack pointer and make it available in SSA.
516	// FIXME: Handle non-itanium ABI EH models.
517	if (HardenInterprocedurally) {
518	for (MachineBasicBlock &MBB : MF) {
519	assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
520	assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
521	assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
522	if (!MBB.isEHPad())
523	continue;
524	PS ->SSA.AddAvailableValue(
525	BB: &MBB,
526	V: extractPredStateFromSP(MBB, InsertPt: MBB.SkipPHIsAndLabels(I: MBB.begin()), Loc));
527	}
528	}
529
530	if (HardenIndirectCallsAndJumps) {
531	// If we are going to harden calls and jumps we need to unfold their memory
532	// operands.
533	unfoldCallAndJumpLoads(MF);
534
535	// Then we trace predicate state through the indirect branches.
536	auto IndirectBrCMovs = tracePredStateThroughIndirectBranches(MF);
537	CMovs.append(in_start: IndirectBrCMovs.begin(), in_end: IndirectBrCMovs.end());
538	}
539
540	// Now that we have the predicate state available at the start of each block
541	// in the CFG, trace it through each block, hardening vulnerable instructions
542	// as we go.
543	tracePredStateThroughBlocksAndHarden(MF);
544
545	// Now rewrite all the uses of the pred state using the SSA updater to insert
546	// PHIs connecting the state between blocks along the CFG edges.
547	for (MachineInstr *CMovI : CMovs)
548	for (MachineOperand &Op : CMovI->operands()) {
549	if (!Op.isReg() \|\| Op.getReg() != PS ->InitialReg)
550	continue;
551
552	PS ->SSA.RewriteUse(U&: Op);
553	}
554
555	LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
556	dbgs() << "\n"; MF.verify(this));
557	return true;
558	}
559
560	/// Implements the naive hardening approach of putting an LFENCE after every
561	/// potentially mis-predicted control flow construct.
562	///
563	/// We include this as an alternative mostly for the purpose of comparison. The
564	/// performance impact of this is expected to be extremely severe and not
565	/// practical for any real-world users.
566	void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
567	MachineFunction &MF) {
568	// First, we scan the function looking for blocks that are reached along edges
569	// that we might want to harden.
570	SmallSetVector<MachineBasicBlock *, `8`> Blocks;
571	for (MachineBasicBlock &MBB : MF) {
572	// If there are no or only one successor, nothing to do here.
573	if (MBB.succ_size() <= `1`)
574	continue;
575
576	// Skip blocks unless their terminators start with a branch. Other
577	// terminators don't seem interesting for guarding against misspeculation.
578	auto TermIt = MBB.getFirstTerminator();
579	if (TermIt == MBB.end() \|\| !TermIt ->isBranch())
580	continue;
581
582	// Add all the non-EH-pad succossors to the blocks we want to harden. We
583	// skip EH pads because there isn't really a condition of interest on
584	// entering.
585	for (MachineBasicBlock *SuccMBB : MBB.successors())
586	if (!SuccMBB->isEHPad())
587	Blocks.insert(X: SuccMBB);
588	}
589
590	for (MachineBasicBlock *MBB : Blocks) {
591	auto InsertPt = MBB->SkipPHIsAndLabels(I: MBB->begin());
592	BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
593	++NumInstsInserted;
594	++NumLFENCEsInserted;
595	}
596	}
597
598	SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, `16`>
599	X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
600	SmallVector<BlockCondInfo, `16`> Infos;
601
602	// Walk the function and build up a summary for each block's conditions that
603	// we need to trace through.
604	for (MachineBasicBlock &MBB : MF) {
605	// If there are no or only one successor, nothing to do here.
606	if (MBB.succ_size() <= `1`)
607	continue;
608
609	// We want to reliably handle any conditional branch terminators in the
610	// MBB, so we manually analyze the branch. We can handle all of the
611	// permutations here, including ones that analyze branch cannot.
612	//
613	// The approach is to walk backwards across the terminators, resetting at
614	// any unconditional non-indirect branch, and track all conditional edges
615	// to basic blocks as well as the fallthrough or unconditional successor
616	// edge. For each conditional edge, we track the target and the opposite
617	// condition code in order to inject a "no-op" cmov into that successor
618	// that will harden the predicate. For the fallthrough/unconditional
619	// edge, we inject a separate cmov for each conditional branch with
620	// matching condition codes. This effectively implements an "and" of the
621	// condition flags, even if there isn't a single condition flag that would
622	// directly implement that. We don't bother trying to optimize either of
623	// these cases because if such an optimization is possible, LLVM should
624	// have optimized the conditional branches* in that way already to reduce*
625	// instruction count. This late, we simply assume the minimal number of
626	// branch instructions is being emitted and use that to guide our cmov
627	// insertion.
628
629	BlockCondInfo Info = {.MBB: &MBB, .CondBrs: {}, .UncondBr: nullptr};
630
631	// Now walk backwards through the terminators and build up successors they
632	// reach and the conditions.
633	for (MachineInstr &MI : llvm::reverse(C&: MBB)) {
634	// Once we've handled all the terminators, we're done.
635	if (!MI.isTerminator())
636	break;
637
638	// If we see a non-branch terminator, we can't handle anything so bail.
639	if (!MI.isBranch()) {
640	Info.CondBrs.clear();
641	break;
642	}
643
644	// If we see an unconditional branch, reset our state, clear any
645	// fallthrough, and set this is the "else" successor.
646	if (MI.getOpcode() == X86::JMP_1) {
647	Info.CondBrs.clear();
648	Info.UncondBr = &MI;
649	continue;
650	}
651
652	// If we get an invalid condition, we have an indirect branch or some
653	// other unanalyzable "fallthrough" case. We model this as a nullptr for
654	// the destination so we can still guard any conditional successors.
655	// Consider code sequences like:
656	// ```
657	// jCC L1
658	// jmpq %rax*
659	// ```
660	// We still want to harden the edge to `L1`.
661	if (X86::getCondFromBranch(MI) == X86::COND_INVALID) {
662	Info.CondBrs.clear();
663	Info.UncondBr = &MI;
664	continue;
665	}
666
667	// We have a vanilla conditional branch, add it to our list.
668	Info.CondBrs.push_back(Elt: &MI);
669	}
670	if (Info.CondBrs.empty()) {
671	++NumBranchesUntraced;
672	LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
673	MBB.dump());
674	continue;
675	}
676
677	Infos.push_back(Elt: Info);
678	}
679
680	return Infos;
681	}
682
683	/// Trace the predicate state through the CFG, instrumenting each conditional
684	/// branch such that misspeculation through an edge will poison the predicate
685	/// state.
686	///
687	/// Returns the list of inserted CMov instructions so that they can have their
688	/// uses of the predicate state rewritten into proper SSA form once it is
689	/// complete.
690	SmallVector<MachineInstr *, `16`>
691	X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
692	MachineFunction &MF, ArrayRef<BlockCondInfo> Infos) {
693	// Collect the inserted cmov instructions so we can rewrite their uses of the
694	// predicate state into SSA form.
695	SmallVector<MachineInstr *, `16`> CMovs;
696
697	// Now walk all of the basic blocks looking for ones that end in conditional
698	// jumps where we need to update this register along each edge.
699	for (const BlockCondInfo &Info : Infos) {
700	MachineBasicBlock &MBB = *Info.MBB;
701	const SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
702	MachineInstr *UncondBr = Info.UncondBr;
703
704	LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
705	<< "\n");
706	++NumCondBranchesTraced;
707
708	// Compute the non-conditional successor as either the target of any
709	// unconditional branch or the layout successor.
710	MachineBasicBlock *UncondSucc =
711	UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
712	? UncondBr->getOperand(i: `0`).getMBB()
713	: nullptr)
714	: &*std::next(x: MachineFunction::iterator (&MBB));
715
716	// Count how many edges there are to any given successor.
717	SmallDenseMap<MachineBasicBlock , int*> SuccCounts;
718	if (UncondSucc)
719	++SuccCounts [UncondSucc];
720	for (auto *CondBr : CondBrs)
721	++SuccCounts [CondBr->getOperand(i: `0`).getMBB()];
722
723	// A lambda to insert cmov instructions into a block checking all of the
724	// condition codes in a sequence.
725	auto BuildCheckingBlockForSuccAndConds =
726	[&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
727	MachineInstr Br, MachineInstr &UncondBr,
728	ArrayRef<X86::CondCode> Conds) {
729	// First, we split the edge to insert the checking block into a safe
730	// location.
731	auto &CheckingMBB =
732	(SuccCount == `1` && Succ.pred_size() == `1`)
733	? Succ
734	: splitEdge(MBB, Succ, SuccCount, Br, UncondBr, TII: *TII);
735
736	bool LiveEFLAGS = Succ.isLiveIn(X86::Reg: EFLAGS);
737	if (!LiveEFLAGS)
738	CheckingMBB.addLiveIn(X86::EFLAGS);
739
740	// Now insert the cmovs to implement the checks.
741	auto InsertPt = CheckingMBB.begin();
742	assert((InsertPt == CheckingMBB.end() \|\| !InsertPt ->isPHI()) &&
743	"Should never have a PHI in the initial checking block as it "
744	"always has a single predecessor!");
745
746	// We will wire each cmov to each other, but need to start with the
747	// incoming pred state.
748	unsigned CurStateReg = PS ->InitialReg;
749
750	for (X86::CondCode Cond : Conds) {
751	int PredStateSizeInBytes = TRI->getRegSizeInBits(RC: *PS ->RC) / `8`;
752	auto CMovOp = X86::getCMovOpcode(RegBytes: PredStateSizeInBytes);
753
754	Register UpdatedStateReg = MRI->createVirtualRegister(RegClass: PS ->RC);
755	// Note that we intentionally use an empty debug location so that
756	// this picks up the preceding location.
757	auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc (),
758	TII->get(CMovOp), UpdatedStateReg)
759	.addReg(CurStateReg)
760	.addReg(PS ->PoisonReg)
761	.addImm(Cond);
762	// If this is the last cmov and the EFLAGS weren't originally
763	// live-in, mark them as killed.
764	if (!LiveEFLAGS && Cond == Conds.back())
765	CMovI->findRegisterUseOperand(X86::EFLAGS, /TRI=/nullptr)
766	->setIsKill(true);
767
768	++NumInstsInserted;
769	LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
770	dbgs() << "\n");
771
772	// The first one of the cmovs will be using the top level
773	// `PredStateReg` and need to get rewritten into SSA form.
774	if (CurStateReg == PS ->InitialReg)
775	CMovs.push_back(Elt: &*CMovI);
776
777	// The next cmov should start from this one's def.
778	CurStateReg = UpdatedStateReg;
779	}
780
781	// And put the last one into the available values for SSA form of our
782	// predicate state.
783	PS ->SSA.AddAvailableValue(BB: &CheckingMBB, V: CurStateReg);
784	};
785
786	std::vector<X86::CondCode> UncondCodeSeq;
787	for (auto *CondBr : CondBrs) {
788	MachineBasicBlock &Succ = *CondBr->getOperand(i: `0`).getMBB();
789	int &SuccCount = SuccCounts [&Succ];
790
791	X86::CondCode Cond = X86::getCondFromBranch(MI: *CondBr);
792	X86::CondCode InvCond = X86::GetOppositeBranchCondition(CC: Cond);
793	UncondCodeSeq.push_back(x: Cond);
794
795	BuildCheckingBlockForSuccAndConds (MBB, Succ, SuccCount, CondBr, UncondBr,
796	{InvCond});
797
798	// Decrement the successor count now that we've split one of the edges.
799	// We need to keep the count of edges to the successor accurate in order
800	// to know above when to replace* the successor in the CFG vs. just*
801	// adding the new successor.
802	--SuccCount;
803	}
804
805	// Since we may have split edges and changed the number of successors,
806	// normalize the probabilities. This avoids doing it each time we split an
807	// edge.
808	MBB.normalizeSuccProbs();
809
810	// Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
811	// need to intersect the other condition codes. We can do this by just
812	// doing a cmov for each one.
813	if (!UncondSucc)
814	// If we have no fallthrough to protect (perhaps it is an indirect jump?)
815	// just skip this and continue.
816	continue;
817
818	assert(SuccCounts[UncondSucc] == `1` &&
819	"We should never have more than one edge to the unconditional "
820	"successor at this point because every other edge must have been "
821	"split above!");
822
823	// Sort and unique the codes to minimize them.
824	llvm::sort(C&: UncondCodeSeq);
825	UncondCodeSeq.erase(first: std::unique(first: UncondCodeSeq.begin(), last: UncondCodeSeq.end()),
826	last: UncondCodeSeq.end());
827
828	// Build a checking version of the successor.
829	BuildCheckingBlockForSuccAndConds (MBB, UncondSucc, /SuccCount/* `1`,
830	UncondBr, UncondBr, UncondCodeSeq);
831	}
832
833	return CMovs;
834	}
835
836	/// Compute the register class for the unfolded load.
837	///
838	/// FIXME: This should probably live in X86InstrInfo, potentially by adding
839	/// a way to unfold into a newly created vreg rather than requiring a register
840	/// input.
841	static const TargetRegisterClass *
842	getRegClassForUnfoldedLoad(MachineFunction &MF, const X86InstrInfo &TII,
843	unsigned Opcode) {
844	unsigned Index;
845	unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold(
846	Opc: Opcode, /UnfoldLoad/ true, /UnfoldStore/ false, LoadRegIndex: &Index);
847	const MCInstrDesc &MCID = TII.get(UnfoldedOpc);
848	return TII.getRegClass(MCID, Index, &TII.getRegisterInfo(), MF);
849	}
850
851	void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
852	MachineFunction &MF) {
853	for (MachineBasicBlock &MBB : MF)
854	// We use make_early_inc_range here so we can remove instructions if needed
855	// without disturbing the iteration.
856	for (MachineInstr &MI : llvm::make_early_inc_range(Range: MBB.instrs())) {
857	// Must either be a call or a branch.
858	if (!MI.isCall() && !MI.isBranch())
859	continue;
860	// We only care about loading variants of these instructions.
861	if (!MI.mayLoad())
862	continue;
863
864	switch (MI.getOpcode()) {
865	default: {
866	LLVM_DEBUG(
867	dbgs() << "ERROR: Found an unexpected loading branch or call "
868	"instruction:\n";
869	MI.dump(); dbgs() << "\n");
870	report_fatal_error(reason: "Unexpected loading branch or call!");
871	}
872
873	case X86::FARCALL16m:
874	case X86::FARCALL32m:
875	case X86::FARCALL64m:
876	case X86::FARJMP16m:
877	case X86::FARJMP32m:
878	case X86::FARJMP64m:
879	// We cannot mitigate far jumps or calls, but we also don't expect them
880	// to be vulnerable to Spectre v1.2 style attacks.
881	continue;
882
883	case X86::CALL16m:
884	case X86::CALL16m_NT:
885	case X86::CALL32m:
886	case X86::CALL32m_NT:
887	case X86::CALL64m:
888	case X86::CALL64m_NT:
889	case X86::JMP16m:
890	case X86::JMP16m_NT:
891	case X86::JMP32m:
892	case X86::JMP32m_NT:
893	case X86::JMP64m:
894	case X86::JMP64m_NT:
895	case X86::TAILJMPm64:
896	case X86::TAILJMPm64_REX:
897	case X86::TAILJMPm:
898	case X86::TCRETURNmi64:
899	case X86::TCRETURNmi: {
900	// Use the generic unfold logic now that we know we're dealing with
901	// expected instructions.
902	// FIXME: We don't have test coverage for all of these!
903	auto UnfoldedRC = getRegClassForUnfoldedLoad(MF, TII: TII, Opcode: MI.getOpcode());
904	if (!UnfoldedRC) {
905	LLVM_DEBUG(dbgs()
906	<< "ERROR: Unable to unfold load from instruction:\n";
907	MI.dump(); dbgs() << "\n");
908	report_fatal_error(reason: "Unable to unfold load!");
909	}
910	Register Reg = MRI->createVirtualRegister(RegClass: UnfoldedRC);
911	SmallVector<MachineInstr *, `2`> NewMIs;
912	// If we were able to compute an unfolded reg class, any failure here
913	// is just a programming error so just assert.
914	bool Unfolded =
915	TII->unfoldMemoryOperand(MF, MI, Reg, /UnfoldLoad/ true,
916	/UnfoldStore/ false, NewMIs);
917	(void)Unfolded;
918	assert(Unfolded &&
919	"Computed unfolded register class but failed to unfold");
920	// Now stitch the new instructions into place and erase the old one.
921	for (auto *NewMI : NewMIs)
922	MBB.insert(I: MI.getIterator(), M: NewMI);
923
924	// Update the call site info.
925	if (MI.isCandidateForCallSiteEntry())
926	MF.eraseCallSiteInfo(MI: &MI);
927
928	MI.eraseFromParent();
929	LLVM_DEBUG({
930	dbgs() << "Unfolded load successfully into:\n";
931	for (auto *NewMI : NewMIs) {
932	NewMI->dump();
933	dbgs() << "\n";
934	}
935	});
936	continue;
937	}
938	}
939	llvm_unreachable("Escaped switch with default!");
940	}
941	}
942
943	/// Trace the predicate state through indirect branches, instrumenting them to
944	/// poison the state if a target is reached that does not match the expected
945	/// target.
946	///
947	/// This is designed to mitigate Spectre variant 1 attacks where an indirect
948	/// branch is trained to predict a particular target and then mispredicts that
949	/// target in a way that can leak data. Despite using an indirect branch, this
950	/// is really a variant 1 style attack: it does not steer execution to an
951	/// arbitrary or attacker controlled address, and it does not require any
952	/// special code executing next to the victim. This attack can also be mitigated
953	/// through retpolines, but those require either replacing indirect branches
954	/// with conditional direct branches or lowering them through a device that
955	/// blocks speculation. This mitigation can replace these retpoline-style
956	/// mitigations for jump tables and other indirect branches within a function
957	/// when variant 2 isn't a risk while allowing limited speculation. Indirect
958	/// calls, however, cannot be mitigated through this technique without changing
959	/// the ABI in a fundamental way.
960	SmallVector<MachineInstr *, `16`>
961	X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
962	MachineFunction &MF) {
963	// We use the SSAUpdater to insert PHI nodes for the target addresses of
964	// indirect branches. We don't actually need the full power of the SSA updater
965	// in this particular case as we always have immediately available values, but
966	// this avoids us having to re-implement the PHI construction logic.
967	MachineSSAUpdater TargetAddrSSA(MF);
968	TargetAddrSSA.Initialize(MRI->createVirtualRegister(&X86::GR64RegClass));
969
970	// Track which blocks were terminated with an indirect branch.
971	SmallPtrSet<MachineBasicBlock *, `4`> IndirectTerminatedMBBs;
972
973	// We need to know what blocks end up reached via indirect branches. We
974	// expect this to be a subset of those whose address is taken and so track it
975	// directly via the CFG.
976	SmallPtrSet<MachineBasicBlock *, `4`> IndirectTargetMBBs;
977
978	// Walk all the blocks which end in an indirect branch and make the
979	// target address available.
980	for (MachineBasicBlock &MBB : MF) {
981	// Find the last terminator.
982	auto MII = MBB.instr_rbegin();
983	while (MII != MBB.instr_rend() && MII ->isDebugInstr())
984	++MII;
985	if (MII == MBB.instr_rend())
986	continue;
987	MachineInstr &TI = *MII;
988	if (!TI.isTerminator() \|\| !TI.isBranch())
989	// No terminator or non-branch terminator.
990	continue;
991
992	unsigned TargetReg;
993
994	switch (TI.getOpcode()) {
995	default:
996	// Direct branch or conditional branch (leading to fallthrough).
997	continue;
998
999	case X86::FARJMP16m:
1000	case X86::FARJMP32m:
1001	case X86::FARJMP64m:
1002	// We cannot mitigate far jumps or calls, but we also don't expect them
1003	// to be vulnerable to Spectre v1.2 or v2 (self trained) style attacks.
1004	continue;
1005
1006	case X86::JMP16m:
1007	case X86::JMP16m_NT:
1008	case X86::JMP32m:
1009	case X86::JMP32m_NT:
1010	case X86::JMP64m:
1011	case X86::JMP64m_NT:
1012	// Mostly as documentation.
1013	report_fatal_error(reason: "Memory operand jumps should have been unfolded!");
1014
1015	case X86::JMP16r:
1016	report_fatal_error(
1017	reason: "Support for 16-bit indirect branches is not implemented.");
1018	case X86::JMP32r:
1019	report_fatal_error(
1020	reason: "Support for 32-bit indirect branches is not implemented.");
1021
1022	case X86::JMP64r:
1023	TargetReg = TI.getOperand(i: `0`).getReg();
1024	}
1025
1026	// We have definitely found an indirect branch. Verify that there are no
1027	// preceding conditional branches as we don't yet support that.
1028	if (llvm::any_of(Range: MBB.terminators(), P: [&](MachineInstr &OtherTI) {
1029	return !OtherTI.isDebugInstr() && &OtherTI != &TI;
1030	})) {
1031	LLVM_DEBUG({
1032	dbgs() << "ERROR: Found other terminators in a block with an indirect "
1033	"branch! This is not yet supported! Terminator sequence:\n";
1034	for (MachineInstr &MI : MBB.terminators()) {
1035	MI.dump();
1036	dbgs() << `'\n'`;
1037	}
1038	});
1039	report_fatal_error(reason: "Unimplemented terminator sequence!");
1040	}
1041
1042	// Make the target register an available value for this block.
1043	TargetAddrSSA.AddAvailableValue(BB: &MBB, V: TargetReg);
1044	IndirectTerminatedMBBs.insert(Ptr: &MBB);
1045
1046	// Add all the successors to our target candidates.
1047	for (MachineBasicBlock *Succ : MBB.successors())
1048	IndirectTargetMBBs.insert(Ptr: Succ);
1049	}
1050
1051	// Keep track of the cmov instructions we insert so we can return them.
1052	SmallVector<MachineInstr *, `16`> CMovs;
1053
1054	// If we didn't find any indirect branches with targets, nothing to do here.
1055	if (IndirectTargetMBBs.empty())
1056	return CMovs;
1057
1058	// We found indirect branches and targets that need to be instrumented to
1059	// harden loads within them. Walk the blocks of the function (to get a stable
1060	// ordering) and instrument each target of an indirect branch.
1061	for (MachineBasicBlock &MBB : MF) {
1062	// Skip the blocks that aren't candidate targets.
1063	if (!IndirectTargetMBBs.count(Ptr: &MBB))
1064	continue;
1065
1066	// We don't expect EH pads to ever be reached via an indirect branch. If
1067	// this is desired for some reason, we could simply skip them here rather
1068	// than asserting.
1069	assert(!MBB.isEHPad() &&
1070	"Unexpected EH pad as target of an indirect branch!");
1071
1072	// We should never end up threading EFLAGS into a block to harden
1073	// conditional jumps as there would be an additional successor via the
1074	// indirect branch. As a consequence, all such edges would be split before
1075	// reaching here, and the inserted block will handle the EFLAGS-based
1076	// hardening.
1077	assert(!MBB.isLiveIn(X86::EFLAGS) &&
1078	"Cannot check within a block that already has live-in EFLAGS!");
1079
1080	// We can't handle having non-indirect edges into this block unless this is
1081	// the only successor and we can synthesize the necessary target address.
1082	for (MachineBasicBlock *Pred : MBB.predecessors()) {
1083	// If we've already handled this by extracting the target directly,
1084	// nothing to do.
1085	if (IndirectTerminatedMBBs.count(Ptr: Pred))
1086	continue;
1087
1088	// Otherwise, we have to be the only successor. We generally expect this
1089	// to be true as conditional branches should have had a critical edge
1090	// split already. We don't however need to worry about EH pad successors
1091	// as they'll happily ignore the target and their hardening strategy is
1092	// resilient to all ways in which they could be reached speculatively.
1093	if (!llvm::all_of(Range: Pred->successors(), P: [&](MachineBasicBlock *Succ) {
1094	return Succ->isEHPad() \|\| Succ == &MBB;
1095	})) {
1096	LLVM_DEBUG({
1097	dbgs() << "ERROR: Found conditional entry to target of indirect "
1098	"branch!\n";
1099	Pred->dump();
1100	MBB.dump();
1101	});
1102	report_fatal_error(reason: "Cannot harden a conditional entry to a target of "
1103	"an indirect branch!");
1104	}
1105
1106	// Now we need to compute the address of this block and install it as a
1107	// synthetic target in the predecessor. We do this at the bottom of the
1108	// predecessor.
1109	auto InsertPt = Pred->getFirstTerminator();
1110	Register TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass);
1111	if (MF.getTarget().getCodeModel() == CodeModel::Small &&
1112	!Subtarget->isPositionIndependent()) {
1113	// Directly materialize it into an immediate.
1114	auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(),
1115	TII->get(X86::MOV64ri32), TargetReg)
1116	.addMBB(&MBB);
1117	++NumInstsInserted;
1118	(void)AddrI;
1119	LLVM_DEBUG(dbgs() << " Inserting mov: "; AddrI->dump();
1120	dbgs() << "\n");
1121	} else {
1122	auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(), TII->get(X86::LEA64r),
1123	TargetReg)
1124	.addReg(/Base/ X86::RIP)
1125	.addImm(/Scale/ `1`)
1126	.addReg(/Index/ `0`)
1127	.addMBB(&MBB)
1128	.addReg(/Segment/ `0`);
1129	++NumInstsInserted;
1130	(void)AddrI;
1131	LLVM_DEBUG(dbgs() << " Inserting lea: "; AddrI->dump();
1132	dbgs() << "\n");
1133	}
1134	// And make this available.
1135	TargetAddrSSA.AddAvailableValue(BB: Pred, V: TargetReg);
1136	}
1137
1138	// Materialize the needed SSA value of the target. Note that we need the
1139	// middle of the block as this block might at the bottom have an indirect
1140	// branch back to itself. We can do this here because at this point, every
1141	// predecessor of this block has an available value. This is basically just
1142	// automating the construction of a PHI node for this target.
1143	Register TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(BB: &MBB);
1144
1145	// Insert a comparison of the incoming target register with this block's
1146	// address. This also requires us to mark the block as having its address
1147	// taken explicitly.
1148	MBB.setMachineBlockAddressTaken();
1149	auto InsertPt = MBB.SkipPHIsLabelsAndDebug(I: MBB.begin());
1150	if (MF.getTarget().getCodeModel() == CodeModel::Small &&
1151	!Subtarget->isPositionIndependent()) {
1152	// Check directly against a relocated immediate when we can.
1153	auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64ri32))
1154	.addReg(TargetReg, RegState::Kill)
1155	.addMBB(&MBB);
1156	++NumInstsInserted;
1157	(void)CheckI;
1158	LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
1159	} else {
1160	// Otherwise compute the address into a register first.
1161	Register AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass);
1162	auto AddrI =
1163	BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::LEA64r), AddrReg)
1164	.addReg(/Base/ X86::RIP)
1165	.addImm(/Scale/ `1`)
1166	.addReg(/Index/ `0`)
1167	.addMBB(&MBB)
1168	.addReg(/Segment/ `0`);
1169	++NumInstsInserted;
1170	(void)AddrI;
1171	LLVM_DEBUG(dbgs() << " Inserting lea: "; AddrI->dump(); dbgs() << "\n");
1172	auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64rr))
1173	.addReg(TargetReg, RegState::Kill)
1174	.addReg(AddrReg, RegState::Kill);
1175	++NumInstsInserted;
1176	(void)CheckI;
1177	LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
1178	}
1179
1180	// Now cmov over the predicate if the comparison wasn't equal.
1181	int PredStateSizeInBytes = TRI->getRegSizeInBits(RC: *PS ->RC) / `8`;
1182	auto CMovOp = X86::getCMovOpcode(RegBytes: PredStateSizeInBytes);
1183	Register UpdatedStateReg = MRI->createVirtualRegister(RegClass: PS ->RC);
1184	auto CMovI =
1185	BuildMI(MBB, InsertPt, DebugLoc (), TII->get(CMovOp), UpdatedStateReg)
1186	.addReg(PS ->InitialReg)
1187	.addReg(PS ->PoisonReg)
1188	.addImm(X86::COND_NE);
1189	CMovI->findRegisterUseOperand(X86::EFLAGS, /TRI=/nullptr)
1190	->setIsKill(true);
1191	++NumInstsInserted;
1192	LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
1193	CMovs.push_back(Elt: &*CMovI);
1194
1195	// And put the new value into the available values for SSA form of our
1196	// predicate state.
1197	PS ->SSA.AddAvailableValue(BB: &MBB, V: UpdatedStateReg);
1198	}
1199
1200	// Return all the newly inserted cmov instructions of the predicate state.
1201	return CMovs;
1202	}
1203
1204	// Returns true if the MI has EFLAGS as a register def operand and it's live,
1205	// otherwise it returns false
1206	static bool isEFLAGSDefLive(const MachineInstr &MI) {
1207	if (const MachineOperand *DefOp =
1208	MI.findRegisterDefOperand(X86::EFLAGS, /TRI=/nullptr)) {
1209	return !DefOp->isDead();
1210	}
1211	return false;
1212	}
1213
1214	static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
1215	const TargetRegisterInfo &TRI) {
1216	// Check if EFLAGS are alive by seeing if there is a def of them or they
1217	// live-in, and then seeing if that def is in turn used.
1218	for (MachineInstr &MI : llvm::reverse(C: llvm::make_range(x: MBB.begin(), y: I))) {
1219	if (MachineOperand *DefOp =
1220	MI.findRegisterDefOperand(X86::EFLAGS, /TRI=/nullptr)) {
1221	// If the def is dead, then EFLAGS is not live.
1222	if (DefOp->isDead())
1223	return false;
1224
1225	// Otherwise we've def'ed it, and it is live.
1226	return true;
1227	}
1228	// While at this instruction, also check if we use and kill EFLAGS
1229	// which means it isn't live.
1230	if (MI.killsRegister(X86::EFLAGS, &TRI))
1231	return false;
1232	}
1233
1234	// If we didn't find anything conclusive (neither definitely alive or
1235	// definitely dead) return whether it lives into the block.
1236	return MBB.isLiveIn(X86::EFLAGS);
1237	}
1238
1239	/// Trace the predicate state through each of the blocks in the function,
1240	/// hardening everything necessary along the way.
1241	///
1242	/// We call this routine once the initial predicate state has been established
1243	/// for each basic block in the function in the SSA updater. This routine traces
1244	/// it through the instructions within each basic block, and for non-returning
1245	/// blocks informs the SSA updater about the final state that lives out of the
1246	/// block. Along the way, it hardens any vulnerable instruction using the
1247	/// currently valid predicate state. We have to do these two things together
1248	/// because the SSA updater only works across blocks. Within a block, we track
1249	/// the current predicate state directly and update it as it changes.
1250	///
1251	/// This operates in two passes over each block. First, we analyze the loads in
1252	/// the block to determine which strategy will be used to harden them: hardening
1253	/// the address or hardening the loaded value when loaded into a register
1254	/// amenable to hardening. We have to process these first because the two
1255	/// strategies may interact -- later hardening may change what strategy we wish
1256	/// to use. We also will analyze data dependencies between loads and avoid
1257	/// hardening those loads that are data dependent on a load with a hardened
1258	/// address. We also skip hardening loads already behind an LFENCE as that is
1259	/// sufficient to harden them against misspeculation.
1260	///
1261	/// Second, we actively trace the predicate state through the block, applying
1262	/// the hardening steps we determined necessary in the first pass as we go.
1263	///
1264	/// These two passes are applied to each basic block. We operate one block at a
1265	/// time to simplify reasoning about reachability and sequencing.
1266	void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden(
1267	MachineFunction &MF) {
1268	SmallPtrSet<MachineInstr *, `16`> HardenPostLoad;
1269	SmallPtrSet<MachineInstr *, `16`> HardenLoadAddr;
1270
1271	SmallSet<unsigned, `16`> HardenedAddrRegs;
1272
1273	SmallDenseMap<unsigned, unsigned, `32`> AddrRegToHardenedReg;
1274
1275	// Track the set of load-dependent registers through the basic block. Because
1276	// the values of these registers have an existing data dependency on a loaded
1277	// value which we would have checked, we can omit any checks on them.
1278	SparseBitVector<> LoadDepRegs;
1279
1280	for (MachineBasicBlock &MBB : MF) {
1281	// The first pass over the block: collect all the loads which can have their
1282	// loaded value hardened and all the loads that instead need their address
1283	// hardened. During this walk we propagate load dependence for address
1284	// hardened loads and also look for LFENCE to stop hardening wherever
1285	// possible. When deciding whether or not to harden the loaded value or not,
1286	// we check to see if any registers used in the address will have been
1287	// hardened at this point and if so, harden any remaining address registers
1288	// as that often successfully re-uses hardened addresses and minimizes
1289	// instructions.
1290	//
1291	// FIXME: We should consider an aggressive mode where we continue to keep as
1292	// many loads value hardened even when some address register hardening would
1293	// be free (due to reuse).
1294	//
1295	// Note that we only need this pass if we are actually hardening loads.
1296	if (HardenLoads)
1297	for (MachineInstr &MI : MBB) {
1298	// We naively assume that all def'ed registers of an instruction have
1299	// a data dependency on all of their operands.
1300	// FIXME: Do a more careful analysis of x86 to build a conservative
1301	// model here.
1302	if (llvm::any_of(Range: MI.uses(), P: [&](MachineOperand &Op) {
1303	return Op.isReg() && LoadDepRegs.test(Idx: Op.getReg());
1304	}))
1305	for (MachineOperand &Def : MI.defs())
1306	if (Def.isReg())
1307	LoadDepRegs.set(Def.getReg());
1308
1309	// Both Intel and AMD are guiding that they will change the semantics of
1310	// LFENCE to be a speculation barrier, so if we see an LFENCE, there is
1311	// no more need to guard things in this block.
1312	if (MI.getOpcode() == X86::LFENCE)
1313	break;
1314
1315	// If this instruction cannot load, nothing to do.
1316	if (!MI.mayLoad())
1317	continue;
1318
1319	// Some instructions which "load" are trivially safe or unimportant.
1320	if (MI.getOpcode() == X86::MFENCE)
1321	continue;
1322
1323	// Extract the memory operand information about this instruction.
1324	const int MemRefBeginIdx = X86::getFirstAddrOperandIdx(MI);
1325	if (MemRefBeginIdx < `0`) {
1326	LLVM_DEBUG(dbgs()
1327	<< "WARNING: unable to harden loading instruction: ";
1328	MI.dump());
1329	continue;
1330	}
1331
1332	MachineOperand &BaseMO =
1333	MI.getOperand(i: MemRefBeginIdx + X86::AddrBaseReg);
1334	MachineOperand &IndexMO =
1335	MI.getOperand(i: MemRefBeginIdx + X86::AddrIndexReg);
1336
1337	// If we have at least one (non-frame-index, non-RIP) register operand,
1338	// and neither operand is load-dependent, we need to check the load.
1339	unsigned BaseReg = `0`, IndexReg = `0`;
1340	if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
1341	BaseMO.getReg() != X86::NoRegister)
1342	BaseReg = BaseMO.getReg();
1343	if (IndexMO.getReg() != X86::NoRegister)
1344	IndexReg = IndexMO.getReg();
1345
1346	if (!BaseReg && !IndexReg)
1347	// No register operands!
1348	continue;
1349
1350	// If any register operand is dependent, this load is dependent and we
1351	// needn't check it.
1352	// FIXME: Is this true in the case where we are hardening loads after
1353	// they complete? Unclear, need to investigate.
1354	if ((BaseReg && LoadDepRegs.test(Idx: BaseReg)) \|\|
1355	(IndexReg && LoadDepRegs.test(Idx: IndexReg)))
1356	continue;
1357
1358	// If post-load hardening is enabled, this load is compatible with
1359	// post-load hardening, and we aren't already going to harden one of the
1360	// address registers, queue it up to be hardened post-load. Notably,
1361	// even once hardened this won't introduce a useful dependency that
1362	// could prune out subsequent loads.
1363	if (EnablePostLoadHardening && X86InstrInfo::isDataInvariantLoad(MI) &&
1364	!isEFLAGSDefLive(MI) && MI.getDesc().getNumDefs() == `1` &&
1365	MI.getOperand(i: `0`).isReg() &&
1366	canHardenRegister(Reg: MI.getOperand(i: `0`).getReg()) &&
1367	!HardenedAddrRegs.count(V: BaseReg) &&
1368	!HardenedAddrRegs.count(V: IndexReg)) {
1369	HardenPostLoad.insert(Ptr: &MI);
1370	HardenedAddrRegs.insert(V: MI.getOperand(i: `0`).getReg());
1371	continue;
1372	}
1373
1374	// Record this instruction for address hardening and record its register
1375	// operands as being address-hardened.
1376	HardenLoadAddr.insert(Ptr: &MI);
1377	if (BaseReg)
1378	HardenedAddrRegs.insert(V: BaseReg);
1379	if (IndexReg)
1380	HardenedAddrRegs.insert(V: IndexReg);
1381
1382	for (MachineOperand &Def : MI.defs())
1383	if (Def.isReg())
1384	LoadDepRegs.set(Def.getReg());
1385	}
1386
1387	// Now re-walk the instructions in the basic block, and apply whichever
1388	// hardening strategy we have elected. Note that we do this in a second
1389	// pass specifically so that we have the complete set of instructions for
1390	// which we will do post-load hardening and can defer it in certain
1391	// circumstances.
1392	for (MachineInstr &MI : MBB) {
1393	if (HardenLoads) {
1394	// We cannot both require hardening the def of a load and its address.
1395	assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
1396	"Requested to harden both the address and def of a load!");
1397
1398	// Check if this is a load whose address needs to be hardened.
1399	if (HardenLoadAddr.erase(Ptr: &MI)) {
1400	const int MemRefBeginIdx = X86::getFirstAddrOperandIdx(MI);
1401	assert(MemRefBeginIdx >= `0` && "Cannot have an invalid index here!");
1402
1403	MachineOperand &BaseMO =
1404	MI.getOperand(i: MemRefBeginIdx + X86::AddrBaseReg);
1405	MachineOperand &IndexMO =
1406	MI.getOperand(i: MemRefBeginIdx + X86::AddrIndexReg);
1407	hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg);
1408	continue;
1409	}
1410
1411	// Test if this instruction is one of our post load instructions (and
1412	// remove it from the set if so).
1413	if (HardenPostLoad.erase(Ptr: &MI)) {
1414	assert(!MI.isCall() && "Must not try to post-load harden a call!");
1415
1416	// If this is a data-invariant load and there is no EFLAGS
1417	// interference, we want to try and sink any hardening as far as
1418	// possible.
1419	if (X86InstrInfo::isDataInvariantLoad(MI) && !isEFLAGSDefLive(MI)) {
1420	// Sink the instruction we'll need to harden as far as we can down
1421	// the graph.
1422	MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenedInstrs&: HardenPostLoad);
1423
1424	// If we managed to sink this instruction, update everything so we
1425	// harden that instruction when we reach it in the instruction
1426	// sequence.
1427	if (SunkMI != &MI) {
1428	// If in sinking there was no instruction needing to be hardened,
1429	// we're done.
1430	if (!SunkMI)
1431	continue;
1432
1433	// Otherwise, add this to the set of defs we harden.
1434	HardenPostLoad.insert(Ptr: SunkMI);
1435	continue;
1436	}
1437	}
1438
1439	unsigned HardenedReg = hardenPostLoad(MI);
1440
1441	// Mark the resulting hardened register as such so we don't re-harden.
1442	AddrRegToHardenedReg [HardenedReg] = HardenedReg;
1443
1444	continue;
1445	}
1446
1447	// Check for an indirect call or branch that may need its input hardened
1448	// even if we couldn't find the specific load used, or were able to
1449	// avoid hardening it for some reason. Note that here we cannot break
1450	// out afterward as we may still need to handle any call aspect of this
1451	// instruction.
1452	if ((MI.isCall() \|\| MI.isBranch()) && HardenIndirectCallsAndJumps)
1453	hardenIndirectCallOrJumpInstr(MI, AddrRegToHardenedReg);
1454	}
1455
1456	// After we finish hardening loads we handle interprocedural hardening if
1457	// enabled and relevant for this instruction.
1458	if (!HardenInterprocedurally)
1459	continue;
1460	if (!MI.isCall() && !MI.isReturn())
1461	continue;
1462
1463	// If this is a direct return (IE, not a tail call) just directly harden
1464	// it.
1465	if (MI.isReturn() && !MI.isCall()) {
1466	hardenReturnInstr(MI);
1467	continue;
1468	}
1469
1470	// Otherwise we have a call. We need to handle transferring the predicate
1471	// state into a call and recovering it after the call returns (unless this
1472	// is a tail call).
1473	assert(MI.isCall() && "Should only reach here for calls!");
1474	tracePredStateThroughCall(MI);
1475	}
1476
1477	HardenPostLoad.clear();
1478	HardenLoadAddr.clear();
1479	HardenedAddrRegs.clear();
1480	AddrRegToHardenedReg.clear();
1481
1482	// Currently, we only track data-dependent loads within a basic block.
1483	// FIXME: We should see if this is necessary or if we could be more
1484	// aggressive here without opening up attack avenues.
1485	LoadDepRegs.clear();
1486	}
1487	}
1488
1489	/// Save EFLAGS into the returned GPR. This can in turn be restored with
1490	/// `restoreEFLAGS`.
1491	///
1492	/// Note that LLVM can only lower very simple patterns of saved and restored
1493	/// EFLAGS registers. The restore should always be within the same basic block
1494	/// as the save so that no PHI nodes are inserted.
1495	unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
1496	MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1497	const DebugLoc &Loc) {
1498	// FIXME: Hard coding this to a 32-bit register class seems weird, but matches
1499	// what instruction selection does.
1500	Register Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
1501	// We directly copy the FLAGS register and rely on later lowering to clean
1502	// this up into the appropriate setCC instructions.
1503	BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
1504	++NumInstsInserted;
1505	return Reg;
1506	}
1507
1508	/// Restore EFLAGS from the provided GPR. This should be produced by
1509	/// `saveEFLAGS`.
1510	///
1511	/// This must be done within the same basic block as the save in order to
1512	/// reliably lower.
1513	void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
1514	MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1515	const DebugLoc &Loc, Register Reg) {
1516	BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
1517	++NumInstsInserted;
1518	}
1519
1520	/// Takes the current predicate state (in a register) and merges it into the
1521	/// stack pointer. The state is essentially a single bit, but we merge this in
1522	/// a way that won't form non-canonical pointers and also will be preserved
1523	/// across normal stack adjustments.
1524	void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
1525	MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1526	const DebugLoc &Loc, unsigned PredStateReg) {
1527	Register TmpReg = MRI->createVirtualRegister(RegClass: PS ->RC);
1528	// FIXME: This hard codes a shift distance based on the number of bits needed
1529	// to stay canonical on 64-bit. We should compute this somehow and support
1530	// 32-bit as part of that.
1531	auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
1532	.addReg(PredStateReg, RegState::Kill)
1533	.addImm(`47`);
1534	ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1535	++NumInstsInserted;
1536	auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
1537	.addReg(X86::RSP)
1538	.addReg(TmpReg, RegState::Kill);
1539	OrI->addRegisterDead(X86::EFLAGS, TRI);
1540	++NumInstsInserted;
1541	}
1542
1543	/// Extracts the predicate state stored in the high bits of the stack pointer.
1544	unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
1545	MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1546	const DebugLoc &Loc) {
1547	Register PredStateReg = MRI->createVirtualRegister(RegClass: PS ->RC);
1548	Register TmpReg = MRI->createVirtualRegister(RegClass: PS ->RC);
1549
1550	// We know that the stack pointer will have any preserved predicate state in
1551	// its high bit. We just want to smear this across the other bits. Turns out,
1552	// this is exactly what an arithmetic right shift does.
1553	BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
1554	.addReg(X86::RSP);
1555	auto ShiftI =
1556	BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
1557	.addReg(TmpReg, RegState::Kill)
1558	.addImm(TRI->getRegSizeInBits(*PS->RC) - `1`);
1559	ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1560	++NumInstsInserted;
1561
1562	return PredStateReg;
1563	}
1564
1565	void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
1566	MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
1567	SmallDenseMap<unsigned, unsigned, `32`> &AddrRegToHardenedReg) {
1568	MachineBasicBlock &MBB = *MI.getParent();
1569	const DebugLoc &Loc = MI.getDebugLoc();
1570
1571	// Check if EFLAGS are alive by seeing if there is a def of them or they
1572	// live-in, and then seeing if that def is in turn used.
1573	bool EFLAGSLive = isEFLAGSLive(MBB, I: MI.getIterator(), TRI: *TRI);
1574
1575	SmallVector<MachineOperand *, `2`> HardenOpRegs;
1576
1577	if (BaseMO.isFI()) {
1578	// A frame index is never a dynamically controllable load, so only
1579	// harden it if we're covering fixed address loads as well.
1580	LLVM_DEBUG(
1581	dbgs() << " Skipping hardening base of explicit stack frame load: ";
1582	MI.dump(); dbgs() << "\n");
1583	} else if (BaseMO.getReg() == X86::RSP) {
1584	// Some idempotent atomic operations are lowered directly to a locked
1585	// OR with 0 to the top of stack(or slightly offset from top) which uses an
1586	// explicit RSP register as the base.
1587	assert(IndexMO.getReg() == X86::NoRegister &&
1588	"Explicit RSP access with dynamic index!");
1589	LLVM_DEBUG(
1590	dbgs() << " Cannot harden base of explicit RSP offset in a load!");
1591	} else if (BaseMO.getReg() == X86::RIP \|\|
1592	BaseMO.getReg() == X86::NoRegister) {
1593	// For both RIP-relative addressed loads or absolute loads, we cannot
1594	// meaningfully harden them because the address being loaded has no
1595	// dynamic component.
1596	//
1597	// FIXME: When using a segment base (like TLS does) we end up with the
1598	// dynamic address being the base plus -1 because we can't mutate the
1599	// segment register here. This allows the signed 32-bit offset to point at
1600	// valid segment-relative addresses and load them successfully.
1601	LLVM_DEBUG(
1602	dbgs() << " Cannot harden base of "
1603	<< (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
1604	<< " address in a load!");
1605	} else {
1606	assert(BaseMO.isReg() &&
1607	"Only allowed to have a frame index or register base.");
1608	HardenOpRegs.push_back(Elt: &BaseMO);
1609	}
1610
1611	if (IndexMO.getReg() != X86::NoRegister &&
1612	(HardenOpRegs.empty() \|\|
1613	HardenOpRegs.front()->getReg() != IndexMO.getReg()))
1614	HardenOpRegs.push_back(Elt: &IndexMO);
1615
1616	assert((HardenOpRegs.size() == `1` \|\| HardenOpRegs.size() == `2`) &&
1617	"Should have exactly one or two registers to harden!");
1618	assert((HardenOpRegs.size() == `1` \|\|
1619	HardenOpRegs[`0`]->getReg() != HardenOpRegs[`1`]->getReg()) &&
1620	"Should not have two of the same registers!");
1621
1622	// Remove any registers that have alreaded been checked.
1623	llvm::erase_if(C&: HardenOpRegs, P: [&](MachineOperand *Op) {
1624	// See if this operand's register has already been checked.
1625	auto It = AddrRegToHardenedReg.find(Val: Op->getReg());
1626	if (It == AddrRegToHardenedReg.end())
1627	// Not checked, so retain this one.
1628	return false;
1629
1630	// Otherwise, we can directly update this operand and remove it.
1631	Op->setReg(It ->second);
1632	return true;
1633	});
1634	// If there are none left, we're done.
1635	if (HardenOpRegs.empty())
1636	return;
1637
1638	// Compute the current predicate state.
1639	Register StateReg = PS ->SSA.GetValueAtEndOfBlock(BB: &MBB);
1640
1641	auto InsertPt = MI.getIterator();
1642
1643	// If EFLAGS are live and we don't have access to instructions that avoid
1644	// clobbering EFLAGS we need to save and restore them. This in turn makes
1645	// the EFLAGS no longer live.
1646	unsigned FlagsReg = `0`;
1647	if (EFLAGSLive && !Subtarget->hasBMI2()) {
1648	EFLAGSLive = false;
1649	FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
1650	}
1651
1652	for (MachineOperand *Op : HardenOpRegs) {
1653	Register OpReg = Op->getReg();
1654	auto *OpRC = MRI->getRegClass(Reg: OpReg);
1655	Register TmpReg = MRI->createVirtualRegister(RegClass: OpRC);
1656
1657	// If this is a vector register, we'll need somewhat custom logic to handle
1658	// hardening it.
1659	if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(&X86::VR128RegClass) \|\|
1660	OpRC->hasSuperClassEq(&X86::VR256RegClass))) {
1661	assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
1662	bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128RegClass);
1663
1664	// Move our state into a vector register.
1665	// FIXME: We could skip this at the cost of longer encodings with AVX-512
1666	// but that doesn't seem likely worth it.
1667	Register VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
1668	auto MovI =
1669	BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
1670	.addReg(StateReg);
1671	(void)MovI;
1672	++NumInstsInserted;
1673	LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");
1674
1675	// Broadcast it across the vector register.
1676	Register VBStateReg = MRI->createVirtualRegister(RegClass: OpRC);
1677	auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
1678	TII->get(Is128Bit ? X86::VPBROADCASTQrr
1679	: X86::VPBROADCASTQYrr),
1680	VBStateReg)
1681	.addReg(VStateReg);
1682	(void)BroadcastI;
1683	++NumInstsInserted;
1684	LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
1685	dbgs() << "\n");
1686
1687	// Merge our potential poison state into the value with a vector or.
1688	auto OrI =
1689	BuildMI(MBB, InsertPt, Loc,
1690	TII->get(Is128Bit ? X86::VPORrr : X86::VPORYrr), TmpReg)
1691	.addReg(VBStateReg)
1692	.addReg(OpReg);
1693	(void)OrI;
1694	++NumInstsInserted;
1695	LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1696	} else if (OpRC->hasSuperClassEq(&X86::VR128XRegClass) \|\|
1697	OpRC->hasSuperClassEq(&X86::VR256XRegClass) \|\|
1698	OpRC->hasSuperClassEq(&X86::VR512RegClass)) {
1699	assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
1700	bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128XRegClass);
1701	bool Is256Bit = OpRC->hasSuperClassEq(&X86::VR256XRegClass);
1702	if (Is128Bit \|\| Is256Bit)
1703	assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");
1704
1705	// Broadcast our state into a vector register.
1706	Register VStateReg = MRI->createVirtualRegister(RegClass: OpRC);
1707	unsigned BroadcastOp = Is128Bit ? X86::VPBROADCASTQrZ128rr
1708	: Is256Bit ? X86::VPBROADCASTQrZ256rr
1709	: X86::VPBROADCASTQrZrr;
1710	auto BroadcastI =
1711	BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
1712	.addReg(StateReg);
1713	(void)BroadcastI;
1714	++NumInstsInserted;
1715	LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
1716	dbgs() << "\n");
1717
1718	// Merge our potential poison state into the value with a vector or.
1719	unsigned OrOp = Is128Bit ? X86::VPORQZ128rr
1720	: Is256Bit ? X86::VPORQZ256rr : X86::VPORQZrr;
1721	auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOp), TmpReg)
1722	.addReg(VStateReg)
1723	.addReg(OpReg);
1724	(void)OrI;
1725	++NumInstsInserted;
1726	LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1727	} else {
1728	// FIXME: Need to support GR32 here for 32-bit code.
1729	assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
1730	"Not a supported register class for address hardening!");
1731
1732	if (!EFLAGSLive) {
1733	// Merge our potential poison state into the value with an or.
1734	auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
1735	.addReg(StateReg)
1736	.addReg(OpReg);
1737	OrI->addRegisterDead(X86::EFLAGS, TRI);
1738	++NumInstsInserted;
1739	LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1740	} else {
1741	// We need to avoid touching EFLAGS so shift out all but the least
1742	// significant bit using the instruction that doesn't update flags.
1743	auto ShiftI =
1744	BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
1745	.addReg(OpReg)
1746	.addReg(StateReg);
1747	(void)ShiftI;
1748	++NumInstsInserted;
1749	LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
1750	dbgs() << "\n");
1751	}
1752	}
1753
1754	// Record this register as checked and update the operand.
1755	assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
1756	"Should not have checked this register yet!");
1757	AddrRegToHardenedReg [Op->getReg()] = TmpReg;
1758	Op->setReg(TmpReg);
1759	++NumAddrRegsHardened;
1760	}
1761
1762	// And restore the flags if needed.
1763	if (FlagsReg)
1764	restoreEFLAGS(MBB, InsertPt, Loc, Reg: FlagsReg);
1765	}
1766
1767	MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
1768	MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
1769	assert(X86InstrInfo::isDataInvariantLoad(InitialMI) &&
1770	"Cannot get here with a non-invariant load!");
1771	assert(!isEFLAGSDefLive(InitialMI) &&
1772	"Cannot get here with a data invariant load "
1773	"that interferes with EFLAGS!");
1774
1775	// See if we can sink hardening the loaded value.
1776	auto SinkCheckToSingleUse =
1777	[&](MachineInstr &MI) -> std::optional<MachineInstr *> {
1778	Register DefReg = MI.getOperand(i: `0`).getReg();
1779
1780	// We need to find a single use which we can sink the check. We can
1781	// primarily do this because many uses may already end up checked on their
1782	// own.
1783	MachineInstr SingleUseMI = nullptr*;
1784	for (MachineInstr &UseMI : MRI->use_instructions(Reg: DefReg)) {
1785	// If we're already going to harden this use, it is data invariant, it
1786	// does not interfere with EFLAGS, and within our block.
1787	if (HardenedInstrs.count(Ptr: &UseMI)) {
1788	if (!X86InstrInfo::isDataInvariantLoad(MI&: UseMI) \|\| isEFLAGSDefLive(MI: UseMI)) {
1789	// If we've already decided to harden a non-load, we must have sunk
1790	// some other post-load hardened instruction to it and it must itself
1791	// be data-invariant.
1792	assert(X86InstrInfo::isDataInvariant(UseMI) &&
1793	"Data variant instruction being hardened!");
1794	continue;
1795	}
1796
1797	// Otherwise, this is a load and the load component can't be data
1798	// invariant so check how this register is being used.
1799	const int MemRefBeginIdx = X86::getFirstAddrOperandIdx(MI: UseMI);
1800	assert(MemRefBeginIdx >= `0` &&
1801	"Should always have mem references here!");
1802
1803	MachineOperand &BaseMO =
1804	UseMI.getOperand(i: MemRefBeginIdx + X86::AddrBaseReg);
1805	MachineOperand &IndexMO =
1806	UseMI.getOperand(i: MemRefBeginIdx + X86::AddrIndexReg);
1807	if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) \|\|
1808	(IndexMO.isReg() && IndexMO.getReg() == DefReg))
1809	// The load uses the register as part of its address making it not
1810	// invariant.
1811	return {};
1812
1813	continue;
1814	}
1815
1816	if (SingleUseMI)
1817	// We already have a single use, this would make two. Bail.
1818	return {};
1819
1820	// If this single use isn't data invariant, isn't in this block, or has
1821	// interfering EFLAGS, we can't sink the hardening to it.
1822	if (!X86InstrInfo::isDataInvariant(MI&: UseMI) \|\| UseMI.getParent() != MI.getParent() \|\|
1823	isEFLAGSDefLive(MI: UseMI))
1824	return {};
1825
1826	// If this instruction defines multiple registers bail as we won't harden
1827	// all of them.
1828	if (UseMI.getDesc().getNumDefs() > `1`)
1829	return {};
1830
1831	// If this register isn't a virtual register we can't walk uses of sanely,
1832	// just bail. Also check that its register class is one of the ones we
1833	// can harden.
1834	Register UseDefReg = UseMI.getOperand(i: `0`).getReg();
1835	if (!canHardenRegister(Reg: UseDefReg))
1836	return {};
1837
1838	SingleUseMI = &UseMI;
1839	}
1840
1841	// If SingleUseMI is still null, there is no use that needs its own
1842	// checking. Otherwise, it is the single use that needs checking.
1843	return {SingleUseMI};
1844	};
1845
1846	MachineInstr *MI = &InitialMI;
1847	while (std::optional<MachineInstr > SingleUse = SinkCheckToSingleUse (MI)) {
1848	// Update which MI we're checking now.
1849	MI = *SingleUse;
1850	if (!MI)
1851	break;
1852	}
1853
1854	return MI;
1855	}
1856
1857	bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) {
1858	// We only support hardening virtual registers.
1859	if (!Reg.isVirtual())
1860	return false;
1861
1862	auto *RC = MRI->getRegClass(Reg);
1863	int RegBytes = TRI->getRegSizeInBits(RC: *RC) / `8`;
1864	if (RegBytes > `8`)
1865	// We don't support post-load hardening of vectors.
1866	return false;
1867
1868	unsigned RegIdx = Log2_32(Value: RegBytes);
1869	assert(RegIdx < `4` && "Unsupported register size");
1870
1871	// If this register class is explicitly constrained to a class that doesn't
1872	// require REX prefix, we may not be able to satisfy that constraint when
1873	// emitting the hardening instructions, so bail out here.
1874	// FIXME: This seems like a pretty lame hack. The way this comes up is when we
1875	// end up both with a NOREX and REX-only register as operands to the hardening
1876	// instructions. It would be better to fix that code to handle this situation
1877	// rather than hack around it in this way.
1878	const TargetRegisterClass *NOREXRegClasses[] = {
1879	&X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
1880	&X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
1881	if (RC == NOREXRegClasses[RegIdx])
1882	return false;
1883
1884	const TargetRegisterClass *GPRRegClasses[] = {
1885	&X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
1886	&X86::GR64RegClass};
1887	return RC->hasSuperClassEq(RC: GPRRegClasses[RegIdx]);
1888	}
1889
1890	/// Harden a value in a register.
1891	///
1892	/// This is the low-level logic to fully harden a value sitting in a register
1893	/// against leaking during speculative execution.
1894	///
1895	/// Unlike hardening an address that is used by a load, this routine is required
1896	/// to hide all* incoming bits in the register.*
1897	///
1898	/// `Reg` must be a virtual register. Currently, it is required to be a GPR no
1899	/// larger than the predicate state register. FIXME: We should support vector
1900	/// registers here by broadcasting the predicate state.
1901	///
1902	/// The new, hardened virtual register is returned. It will have the same
1903	/// register class as `Reg`.
1904	unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
1905	Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1906	const DebugLoc &Loc) {
1907	assert(canHardenRegister(Reg) && "Cannot harden this register!");
1908
1909	auto *RC = MRI->getRegClass(Reg);
1910	int Bytes = TRI->getRegSizeInBits(RC: *RC) / `8`;
1911	Register StateReg = PS ->SSA.GetValueAtEndOfBlock(BB: &MBB);
1912	assert((Bytes == `1` \|\| Bytes == `2` \|\| Bytes == `4` \|\| Bytes == `8`) &&
1913	"Unknown register size");
1914
1915	// FIXME: Need to teach this about 32-bit mode.
1916	if (Bytes != `8`) {
1917	unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
1918	unsigned SubRegImm = SubRegImms[Log2_32(Value: Bytes)];
1919	Register NarrowStateReg = MRI->createVirtualRegister(RegClass: RC);
1920	BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg)
1921	.addReg(StateReg, `0`, SubRegImm);
1922	StateReg = NarrowStateReg;
1923	}
1924
1925	unsigned FlagsReg = `0`;
1926	if (isEFLAGSLive(MBB, I: InsertPt, TRI: *TRI))
1927	FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
1928
1929	Register NewReg = MRI->createVirtualRegister(RegClass: RC);
1930	unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
1931	unsigned OrOpCode = OrOpCodes[Log2_32(Value: Bytes)];
1932	auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg)
1933	.addReg(StateReg)
1934	.addReg(Reg);
1935	OrI->addRegisterDead(X86::EFLAGS, TRI);
1936	++NumInstsInserted;
1937	LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1938
1939	if (FlagsReg)
1940	restoreEFLAGS(MBB, InsertPt, Loc, Reg: FlagsReg);
1941
1942	return NewReg;
1943	}
1944
1945	/// Harden a load by hardening the loaded value in the defined register.
1946	///
1947	/// We can harden a non-leaking load into a register without touching the
1948	/// address by just hiding all of the loaded bits during misspeculation. We use
1949	/// an `or` instruction to do this because we set up our poison value as all
1950	/// ones. And the goal is just for the loaded bits to not be exposed to
1951	/// execution and coercing them to one is sufficient.
1952	///
1953	/// Returns the newly hardened register.
1954	unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
1955	MachineBasicBlock &MBB = *MI.getParent();
1956	const DebugLoc &Loc = MI.getDebugLoc();
1957
1958	auto &DefOp = MI.getOperand(i: `0`);
1959	Register OldDefReg = DefOp.getReg();
1960	auto *DefRC = MRI->getRegClass(Reg: OldDefReg);
1961
1962	// Because we want to completely replace the uses of this def'ed value with
1963	// the hardened value, create a dedicated new register that will only be used
1964	// to communicate the unhardened value to the hardening.
1965	Register UnhardenedReg = MRI->createVirtualRegister(RegClass: DefRC);
1966	DefOp.setReg(UnhardenedReg);
1967
1968	// Now harden this register's value, getting a hardened reg that is safe to
1969	// use. Note that we insert the instructions to compute this after* the*
1970	// defining instruction, not before it.
1971	unsigned HardenedReg = hardenValueInRegister(
1972	Reg: UnhardenedReg, MBB, InsertPt: std::next(x: MI.getIterator()), Loc);
1973
1974	// Finally, replace the old register (which now only has the uses of the
1975	// original def) with the hardened register.
1976	MRI->replaceRegWith(/FromReg/ OldDefReg, /ToReg/ HardenedReg);
1977
1978	++NumPostLoadRegsHardened;
1979	return HardenedReg;
1980	}
1981
1982	/// Harden a return instruction.
1983	///
1984	/// Returns implicitly perform a load which we need to harden. Without hardening
1985	/// this load, an attacker my speculatively write over the return address to
1986	/// steer speculation of the return to an attacker controlled address. This is
1987	/// called Spectre v1.1 or Bounds Check Bypass Store (BCBS) and is described in
1988	/// this paper:
1989	/// https://people.csail.mit.edu/vlk/spectre11.pdf
1990	///
1991	/// We can harden this by introducing an LFENCE that will delay any load of the
1992	/// return address until prior instructions have retired (and thus are not being
1993	/// speculated), or we can harden the address used by the implicit load: the
1994	/// stack pointer.
1995	///
1996	/// If we are not using an LFENCE, hardening the stack pointer has an additional
1997	/// benefit: it allows us to pass the predicate state accumulated in this
1998	/// function back to the caller. In the absence of a BCBS attack on the return,
1999	/// the caller will typically be resumed and speculatively executed due to the
2000	/// Return Stack Buffer (RSB) prediction which is very accurate and has a high
2001	/// priority. It is possible that some code from the caller will be executed
2002	/// speculatively even during a BCBS-attacked return until the steering takes
2003	/// effect. Whenever this happens, the caller can recover the (poisoned)
2004	/// predicate state from the stack pointer and continue to harden loads.
2005	void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
2006	MachineBasicBlock &MBB = *MI.getParent();
2007	const DebugLoc &Loc = MI.getDebugLoc();
2008	auto InsertPt = MI.getIterator();
2009
2010	if (FenceCallAndRet)
2011	// No need to fence here as we'll fence at the return site itself. That
2012	// handles more cases than we can handle here.
2013	return;
2014
2015	// Take our predicate state, shift it to the high 17 bits (so that we keep
2016	// pointers canonical) and merge it into RSP. This will allow the caller to
2017	// extract it when we return (speculatively).
2018	mergePredStateIntoSP(MBB, InsertPt, Loc, PredStateReg: PS ->SSA.GetValueAtEndOfBlock(BB: &MBB));
2019	}
2020
2021	/// Trace the predicate state through a call.
2022	///
2023	/// There are several layers of this needed to handle the full complexity of
2024	/// calls.
2025	///
2026	/// First, we need to send the predicate state into the called function. We do
2027	/// this by merging it into the high bits of the stack pointer.
2028	///
2029	/// For tail calls, this is all we need to do.
2030	///
2031	/// For calls where we might return and resume the control flow, we need to
2032	/// extract the predicate state from the high bits of the stack pointer after
2033	/// control returns from the called function.
2034	///
2035	/// We also need to verify that we intended to return to this location in the
2036	/// code. An attacker might arrange for the processor to mispredict the return
2037	/// to this valid but incorrect return address in the program rather than the
2038	/// correct one. See the paper on this attack, called "ret2spec" by the
2039	/// researchers, here:
2040	/// https://christian-rossow.de/publications/ret2spec-ccs2018.pdf
2041	///
2042	/// The way we verify that we returned to the correct location is by preserving
2043	/// the expected return address across the call. One technique involves taking
2044	/// advantage of the red-zone to load the return address from `8(%rsp)` where it
2045	/// was left by the RET instruction when it popped `%rsp`. Alternatively, we can
2046	/// directly save the address into a register that will be preserved across the
2047	/// call. We compare this intended return address against the address
2048	/// immediately following the call (the observed return address). If these
2049	/// mismatch, we have detected misspeculation and can poison our predicate
2050	/// state.
2051	void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
2052	MachineInstr &MI) {
2053	MachineBasicBlock &MBB = *MI.getParent();
2054	MachineFunction &MF = *MBB.getParent();
2055	auto InsertPt = MI.getIterator();
2056	const DebugLoc &Loc = MI.getDebugLoc();
2057
2058	if (FenceCallAndRet) {
2059	if (MI.isReturn())
2060	// Tail call, we don't return to this function.
2061	// FIXME: We should also handle noreturn calls.
2062	return;
2063
2064	// We don't need to fence before the call because the function should fence
2065	// in its entry. However, we do need to fence after the call returns.
2066	// Fencing before the return doesn't correctly handle cases where the return
2067	// itself is mispredicted.
2068	BuildMI(MBB, std::next(InsertPt), Loc, TII->get(X86::LFENCE));
2069	++NumInstsInserted;
2070	++NumLFENCEsInserted;
2071	return;
2072	}
2073
2074	// First, we transfer the predicate state into the called function by merging
2075	// it into the stack pointer. This will kill the current def of the state.
2076	Register StateReg = PS ->SSA.GetValueAtEndOfBlock(BB: &MBB);
2077	mergePredStateIntoSP(MBB, InsertPt, Loc, PredStateReg: StateReg);
2078
2079	// If this call is also a return, it is a tail call and we don't need anything
2080	// else to handle it so just return. Also, if there are no further
2081	// instructions and no successors, this call does not return so we can also
2082	// bail.
2083	if (MI.isReturn() \|\| (std::next(x: InsertPt) == MBB.end() && MBB.succ_empty()))
2084	return;
2085
2086	// Create a symbol to track the return address and attach it to the call
2087	// machine instruction. We will lower extra symbols attached to call
2088	// instructions as label immediately following the call.
2089	MCSymbol *RetSymbol =
2090	MF.getContext().createTempSymbol(Name: "slh_ret_addr",
2091	/AlwaysAddSuffix/ true);
2092	MI.setPostInstrSymbol(MF, Symbol: RetSymbol);
2093
2094	const TargetRegisterClass *AddrRC = &X86::GR64RegClass;
2095	unsigned ExpectedRetAddrReg = `0`;
2096
2097	// If we have no red zones or if the function returns twice (possibly without
2098	// using the `ret` instruction) like setjmp, we need to save the expected
2099	// return address prior to the call.
2100	if (!Subtarget->getFrameLowering()->has128ByteRedZone(MF) \|\|
2101	MF.exposesReturnsTwice()) {
2102	// If we don't have red zones, we need to compute the expected return
2103	// address prior to the call and store it in a register that lives across
2104	// the call.
2105	//
2106	// In some ways, this is doubly satisfying as a mitigation because it will
2107	// also successfully detect stack smashing bugs in some cases (typically,
2108	// when a callee-saved register is used and the callee doesn't push it onto
2109	// the stack). But that isn't our primary goal, so we only use it as
2110	// a fallback.
2111	//
2112	// FIXME: It isn't clear that this is reliable in the face of
2113	// rematerialization in the register allocator. We somehow need to force
2114	// that to not occur for this particular instruction, and instead to spill
2115	// or otherwise preserve the value computed prior* to the call.*
2116	//
2117	// FIXME: It is even less clear why MachineCSE can't just fold this when we
2118	// end up having to use identical instructions both before and after the
2119	// call to feed the comparison.
2120	ExpectedRetAddrReg = MRI->createVirtualRegister(RegClass: AddrRC);
2121	if (MF.getTarget().getCodeModel() == CodeModel::Small &&
2122	!Subtarget->isPositionIndependent()) {
2123	BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV64ri32), ExpectedRetAddrReg)
2124	.addSym(RetSymbol);
2125	} else {
2126	BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ExpectedRetAddrReg)
2127	.addReg(/Base/ X86::RIP)
2128	.addImm(/Scale/ `1`)
2129	.addReg(/Index/ `0`)
2130	.addSym(RetSymbol)
2131	.addReg(/Segment/ `0`);
2132	}
2133	}
2134
2135	// Step past the call to handle when it returns.
2136	++InsertPt;
2137
2138	// If we didn't pre-compute the expected return address into a register, then
2139	// red zones are enabled and the return address is still available on the
2140	// stack immediately after the call. As the very first instruction, we load it
2141	// into a register.
2142	if (!ExpectedRetAddrReg) {
2143	ExpectedRetAddrReg = MRI->createVirtualRegister(RegClass: AddrRC);
2144	BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV64rm), ExpectedRetAddrReg)
2145	.addReg(/Base/ X86::RSP)
2146	.addImm(/Scale/ `1`)
2147	.addReg(/Index/ `0`)
2148	.addImm(/Displacement/ -`8`) // The stack pointer has been popped, so
2149	// the return address is 8-bytes past it.
2150	.addReg(/Segment/ `0`);
2151	}
2152
2153	// Now we extract the callee's predicate state from the stack pointer.
2154	unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
2155
2156	// Test the expected return address against our actual address. If we can
2157	// form this basic block's address as an immediate, this is easy. Otherwise
2158	// we compute it.
2159	if (MF.getTarget().getCodeModel() == CodeModel::Small &&
2160	!Subtarget->isPositionIndependent()) {
2161	// FIXME: Could we fold this with the load? It would require careful EFLAGS
2162	// management.
2163	BuildMI(MBB, InsertPt, Loc, TII->get(X86::CMP64ri32))
2164	.addReg(ExpectedRetAddrReg, RegState::Kill)
2165	.addSym(RetSymbol);
2166	} else {
2167	Register ActualRetAddrReg = MRI->createVirtualRegister(RegClass: AddrRC);
2168	BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ActualRetAddrReg)
2169	.addReg(/Base/ X86::RIP)
2170	.addImm(/Scale/ `1`)
2171	.addReg(/Index/ `0`)
2172	.addSym(RetSymbol)
2173	.addReg(/Segment/ `0`);
2174	BuildMI(MBB, InsertPt, Loc, TII->get(X86::CMP64rr))
2175	.addReg(ExpectedRetAddrReg, RegState::Kill)
2176	.addReg(ActualRetAddrReg, RegState::Kill);
2177	}
2178
2179	// Now conditionally update the predicate state we just extracted if we ended
2180	// up at a different return address than expected.
2181	int PredStateSizeInBytes = TRI->getRegSizeInBits(RC: *PS ->RC) / `8`;
2182	auto CMovOp = X86::getCMovOpcode(RegBytes: PredStateSizeInBytes);
2183
2184	Register UpdatedStateReg = MRI->createVirtualRegister(RegClass: PS ->RC);
2185	auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg)
2186	.addReg(NewStateReg, RegState::Kill)
2187	.addReg(PS ->PoisonReg)
2188	.addImm(X86::COND_NE);
2189	CMovI->findRegisterUseOperand(X86::EFLAGS, /TRI=/nullptr)->setIsKill(true);
2190	++NumInstsInserted;
2191	LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
2192
2193	PS ->SSA.AddAvailableValue(BB: &MBB, V: UpdatedStateReg);
2194	}
2195
2196	/// An attacker may speculatively store over a value that is then speculatively
2197	/// loaded and used as the target of an indirect call or jump instruction. This
2198	/// is called Spectre v1.2 or Bounds Check Bypass Store (BCBS) and is described
2199	/// in this paper:
2200	/// https://people.csail.mit.edu/vlk/spectre11.pdf
2201	///
2202	/// When this happens, the speculative execution of the call or jump will end up
2203	/// being steered to this attacker controlled address. While most such loads
2204	/// will be adequately hardened already, we want to ensure that they are
2205	/// definitively treated as needing post-load hardening. While address hardening
2206	/// is sufficient to prevent secret data from leaking to the attacker, it may
2207	/// not be sufficient to prevent an attacker from steering speculative
2208	/// execution. We forcibly unfolded all relevant loads above and so will always
2209	/// have an opportunity to post-load harden here, we just need to scan for cases
2210	/// not already flagged and add them.
2211	void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
2212	MachineInstr &MI,
2213	SmallDenseMap<unsigned, unsigned, `32`> &AddrRegToHardenedReg) {
2214	switch (MI.getOpcode()) {
2215	case X86::FARCALL16m:
2216	case X86::FARCALL32m:
2217	case X86::FARCALL64m:
2218	case X86::FARJMP16m:
2219	case X86::FARJMP32m:
2220	case X86::FARJMP64m:
2221	// We don't need to harden either far calls or far jumps as they are
2222	// safe from Spectre.
2223	return;
2224
2225	default:
2226	break;
2227	}
2228
2229	// We should never see a loading instruction at this point, as those should
2230	// have been unfolded.
2231	assert(!MI.mayLoad() && "Found a lingering loading instruction!");
2232
2233	// If the first operand isn't a register, this is a branch or call
2234	// instruction with an immediate operand which doesn't need to be hardened.
2235	if (!MI.getOperand(i: `0`).isReg())
2236	return;
2237
2238	// For all of these, the target register is the first operand of the
2239	// instruction.
2240	auto &TargetOp = MI.getOperand(i: `0`);
2241	Register OldTargetReg = TargetOp.getReg();
2242
2243	// Try to lookup a hardened version of this register. We retain a reference
2244	// here as we want to update the map to track any newly computed hardened
2245	// register.
2246	unsigned &HardenedTargetReg = AddrRegToHardenedReg [OldTargetReg];
2247
2248	// If we don't have a hardened register yet, compute one. Otherwise, just use
2249	// the already hardened register.
2250	//
2251	// FIXME: It is a little suspect that we use partially hardened registers that
2252	// only feed addresses. The complexity of partial hardening with SHRX
2253	// continues to pile up. Should definitively measure its value and consider
2254	// eliminating it.
2255	if (!HardenedTargetReg)
2256	HardenedTargetReg = hardenValueInRegister(
2257	Reg: OldTargetReg, MBB&: *MI.getParent(), InsertPt: MI.getIterator(), Loc: MI.getDebugLoc());
2258
2259	// Set the target operand to the hardened register.
2260	TargetOp.setReg(HardenedTargetReg);
2261
2262	++NumCallsOrJumpsHardened;
2263	}
2264
// Pass registration for X86SpeculativeLoadHardeningPass under PASS_KEY with
// the description "X86 speculative load hardener". Nothing is listed between
// the BEGIN and END macros, so no pass dependencies are declared here.
// NOTE(review): the trailing `false, false` are presumably the "CFG only" and
// "is analysis" flags of LLVM's pass-registration macros - confirm against
// PassSupport.h.
INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, PASS_KEY,
                      "X86 speculative load hardener", false, false)
INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, PASS_KEY,
                    "X86 speculative load hardener", false, false)
2269
/// Factory for the X86 speculative load hardening pass.
///
/// Returns a newly heap-allocated X86SpeculativeLoadHardeningPass as a
/// FunctionPass pointer.
/// NOTE(review): ownership presumably transfers to the caller (normally the
/// pass manager the pass is added to) - confirm against the call sites.
FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
  return new X86SpeculativeLoadHardeningPass ();
}
2273

source code of llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp