X86_64.cpp source code [lld/ELF/Arch/X86_64.cpp]

1	//===- X86_64.cpp ---------------------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "OutputSections.h"
10	#include "Relocations.h"
11	#include "Symbols.h"
12	#include "SyntheticSections.h"
13	#include "Target.h"
14	#include "lld/Common/ErrorHandler.h"
15	#include "llvm/BinaryFormat/ELF.h"
16	#include "llvm/Support/Endian.h"
17	#include "llvm/Support/MathExtras.h"
18
19	using namespace llvm;
20	using namespace llvm::object;
21	using namespace llvm::support::endian;
22	using namespace llvm::ELF;
23	using namespace lld;
24	using namespace lld::elf;
25
26	namespace {
27	class X86_64 : public TargetInfo {
28	public:
29	X86_64();
30	int getTlsGdRelaxSkip(RelType type) const override;
31	RelExpr getRelExpr(RelType type, const Symbol &s,
32	const uint8_t loc) const* override;
33	RelType getDynRel(RelType type) const override;
34	void writeGotPltHeader(uint8_t buf) const* override;
35	void writeGotPlt(uint8_t buf, const* Symbol &s) const override;
36	void writeIgotPlt(uint8_t buf, const* Symbol &s) const override;
37	void writePltHeader(uint8_t buf) const* override;
38	void writePlt(uint8_t buf, const* Symbol &sym,
39	uint64_t pltEntryAddr) const override;
40	void relocate(uint8_t loc, const* Relocation &rel,
41	uint64_t val) const override;
42	int64_t getImplicitAddend(const uint8_t buf, RelType type) const* override;
43	void applyJumpInstrMod(uint8_t *loc, JumpModType type,
44	unsigned size) const override;
45	RelExpr adjustGotPcExpr(RelType type, int64_t addend,
46	const uint8_t loc) const* override;
47	void relocateAlloc(InputSectionBase &sec, uint8_t buf) const* override;
48	bool adjustPrologueForCrossSplitStack(uint8_t loc, uint8_t end,
49	uint8_t stOther) const override;
50	bool deleteFallThruJmpInsn(InputSection &is, InputFile *file,
51	InputSection nextIS) const* override;
52	bool relaxOnce(int pass) const override;
53	};
54	} // namespace
55
// Table of NOP encodings of sizes from 1 to 9 bytes: entry i holds an
// encoding that is exactly i + 1 bytes long. The appropriately sized
// instructions are used to fill the gaps between sections which are executed
// during fall through.
static const std::vector<std::vector<uint8_t>> nopInstructions = {
    {0x90},
    {0x66, 0x90},
    {0x0f, 0x1f, 0x00},
    {0x0f, 0x1f, 0x40, 0x00},
    {0x0f, 0x1f, 0x44, 0x00, 0x00},
    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
    {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
    {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}};
69
70	X86_64::X86_64() {
71	copyRel = R_X86_64_COPY;
72	gotRel = R_X86_64_GLOB_DAT;
73	pltRel = R_X86_64_JUMP_SLOT;
74	relativeRel = R_X86_64_RELATIVE;
75	iRelativeRel = R_X86_64_IRELATIVE;
76	symbolicRel = R_X86_64_64;
77	tlsDescRel = R_X86_64_TLSDESC;
78	tlsGotRel = R_X86_64_TPOFF64;
79	tlsModuleIndexRel = R_X86_64_DTPMOD64;
80	tlsOffsetRel = R_X86_64_DTPOFF64;
81	gotBaseSymInGotPlt = true;
82	gotEntrySize = `8`;
83	pltHeaderSize = `16`;
84	pltEntrySize = `16`;
85	ipltEntrySize = `16`;
86	trapInstr = {`0xcc`, `0xcc`, `0xcc`, `0xcc`}; // 0xcc = INT3
87	nopInstrs = nopInstructions;
88
89	// Align to the large page size (known as a superpage or huge page).
90	// FreeBSD automatically promotes large, superpage-aligned allocations.
91	defaultImageBase = `0x200000`;
92	}
93
94	int X86_64::getTlsGdRelaxSkip(RelType type) const {
95	// TLSDESC relocations are processed separately. See relaxTlsGdToLe below.
96	return type == R_X86_64_GOTPC32_TLSDESC \|\| type == R_X86_64_TLSDESC_CALL ? `1`
97	: `2`;
98	}
99
// Opcodes for the different X86_64 jmp instructions. The enumerators are
// symbolic identifiers (numbered from 0), not the machine encodings;
// J_UNKNOWN marks an instruction that is not one of the recognized jumps.
enum JmpInsnOpcode : uint32_t {
  J_JMP_32,
  J_JNE_32,
  J_JE_32,
  J_JG_32,
  J_JGE_32,
  J_JB_32,
  J_JBE_32,
  J_JL_32,
  J_JLE_32,
  J_JA_32,
  J_JAE_32,
  J_UNKNOWN,
};
115
116	// Given the first (optional) and second byte of the insn's opcode, this
117	// returns the corresponding enum value.
118	static JmpInsnOpcode getJmpInsnType(const uint8_t *first,
119	const uint8_t *second) {
120	if (*second == `0xe9`)
121	return J_JMP_32;
122
123	if (first == nullptr)
124	return J_UNKNOWN;
125
126	if (*first == `0x0f`) {
127	switch (*second) {
128	case `0x84`:
129	return J_JE_32;
130	case `0x85`:
131	return J_JNE_32;
132	case `0x8f`:
133	return J_JG_32;
134	case `0x8d`:
135	return J_JGE_32;
136	case `0x82`:
137	return J_JB_32;
138	case `0x86`:
139	return J_JBE_32;
140	case `0x8c`:
141	return J_JL_32;
142	case `0x8e`:
143	return J_JLE_32;
144	case `0x87`:
145	return J_JA_32;
146	case `0x83`:
147	return J_JAE_32;
148	}
149	}
150	return J_UNKNOWN;
151	}
152
153	// Return the relocation index for input section IS with a specific Offset.
154	// Returns the maximum size of the vector if no such relocation is found.
155	static unsigned getRelocationWithOffset(const InputSection &is,
156	uint64_t offset) {
157	unsigned size = is.relocs().size();
158	for (unsigned i = size - `1`; i + `1` > `0`; --i) {
159	if (is.relocs()[i].offset == offset && is.relocs()[i].expr != R_NONE)
160	return i;
161	}
162	return size;
163	}
164
165	// Returns true if R corresponds to a relocation used for a jump instruction.
166	// TODO: Once special relocations for relaxable jump instructions are available,
167	// this should be modified to use those relocations.
168	static bool isRelocationForJmpInsn(Relocation &R) {
169	return R.type == R_X86_64_PLT32 \|\| R.type == R_X86_64_PC32 \|\|
170	R.type == R_X86_64_PC8;
171	}
172
173	// Return true if Relocation R points to the first instruction in the
174	// next section.
175	// TODO: Delete this once psABI reserves a new relocation type for fall thru
176	// jumps.
177	static bool isFallThruRelocation(InputSection &is, InputFile *file,
178	InputSection *nextIS, Relocation &r) {
179	if (!isRelocationForJmpInsn(R&: r))
180	return false;
181
182	uint64_t addrLoc = is.getOutputSection()->addr + is.outSecOff + r.offset;
183	uint64_t targetOffset = InputSectionBase::getRelocTargetVA(
184	File: file, Type: r.type, A: r.addend, P: addrLoc, Sym: *r.sym, Expr: r.expr);
185
186	// If this jmp is a fall thru, the target offset is the beginning of the
187	// next section.
188	uint64_t nextSectionOffset =
189	nextIS->getOutputSection()->addr + nextIS->outSecOff;
190	return (addrLoc + `4` + targetOffset) == nextSectionOffset;
191	}
192
193	// Return the jmp instruction opcode that is the inverse of the given
194	// opcode. For example, JE inverted is JNE.
195	static JmpInsnOpcode invertJmpOpcode(const JmpInsnOpcode opcode) {
196	switch (opcode) {
197	case J_JE_32:
198	return J_JNE_32;
199	case J_JNE_32:
200	return J_JE_32;
201	case J_JG_32:
202	return J_JLE_32;
203	case J_JGE_32:
204	return J_JL_32;
205	case J_JB_32:
206	return J_JAE_32;
207	case J_JBE_32:
208	return J_JA_32;
209	case J_JL_32:
210	return J_JGE_32;
211	case J_JLE_32:
212	return J_JG_32;
213	case J_JA_32:
214	return J_JBE_32;
215	case J_JAE_32:
216	return J_JB_32;
217	default:
218	return J_UNKNOWN;
219	}
220	}
221
222	// Deletes direct jump instruction in input sections that jumps to the
223	// following section as it is not required. If there are two consecutive jump
224	// instructions, it checks if they can be flipped and one can be deleted.
225	// For example:
226	// .section .text
227	// a.BB.foo:
228	// ...
229	// 10: jne aa.BB.foo
230	// 16: jmp bar
231	// aa.BB.foo:
232	// ...
233	//
234	// can be converted to:
235	// a.BB.foo:
236	// ...
237	// 10: je bar #jne flipped to je and the jmp is deleted.
238	// aa.BB.foo:
239	// ...
240	bool X86_64::deleteFallThruJmpInsn(InputSection &is, InputFile *file,
241	InputSection nextIS) const* {
242	const unsigned sizeOfDirectJmpInsn = `5`;
243
244	if (nextIS == nullptr)
245	return false;
246
247	if (is.getSize() < sizeOfDirectJmpInsn)
248	return false;
249
250	// If this jmp insn can be removed, it is the last insn and the
251	// relocation is 4 bytes before the end.
252	unsigned rIndex = getRelocationWithOffset(is, offset: is.getSize() - `4`);
253	if (rIndex == is.relocs().size())
254	return false;
255
256	Relocation &r = is.relocs()[rIndex];
257
258	// Check if the relocation corresponds to a direct jmp.
259	const uint8_t *secContents = is.content().data();
260	// If it is not a direct jmp instruction, there is nothing to do here.
261	if (*(secContents + r.offset - `1`) != `0xe9`)
262	return false;
263
264	if (isFallThruRelocation(is, file, nextIS, r)) {
265	// This is a fall thru and can be deleted.
266	r.expr = R_NONE;
267	r.offset = `0`;
268	is.drop_back(num: sizeOfDirectJmpInsn);
269	is.nopFiller = true;
270	return true;
271	}
272
273	// Now, check if flip and delete is possible.
274	const unsigned sizeOfJmpCCInsn = `6`;
275	// To flip, there must be at least one JmpCC and one direct jmp.
276	if (is.getSize() < sizeOfDirectJmpInsn + sizeOfJmpCCInsn)
277	return false;
278
279	unsigned rbIndex =
280	getRelocationWithOffset(is, offset: (is.getSize() - sizeOfDirectJmpInsn - `4`));
281	if (rbIndex == is.relocs().size())
282	return false;
283
284	Relocation &rB = is.relocs()[rbIndex];
285
286	const uint8_t *jmpInsnB = secContents + rB.offset - `1`;
287	JmpInsnOpcode jmpOpcodeB = getJmpInsnType(first: jmpInsnB - `1`, second: jmpInsnB);
288	if (jmpOpcodeB == J_UNKNOWN)
289	return false;
290
291	if (!isFallThruRelocation(is, file, nextIS, r&: rB))
292	return false;
293
294	// jmpCC jumps to the fall thru block, the branch can be flipped and the
295	// jmp can be deleted.
296	JmpInsnOpcode jInvert = invertJmpOpcode(opcode: jmpOpcodeB);
297	if (jInvert == J_UNKNOWN)
298	return false;
299	is.jumpInstrMod = make<JumpInstrMod>();
300	*is.jumpInstrMod = {.offset: rB.offset - `1`, .original: jInvert, .size: `4`};
301	// Move R's values to rB except the offset.
302	rB = {.expr: r.expr, .type: r.type, .offset: rB.offset, .addend: r.addend, .sym: r.sym};
303	// Cancel R
304	r.expr = R_NONE;
305	r.offset = `0`;
306	is.drop_back(num: sizeOfDirectJmpInsn);
307	is.nopFiller = true;
308	return true;
309	}
310
311	bool X86_64::relaxOnce(int pass) const {
312	uint64_t minVA = UINT64_MAX, maxVA = `0`;
313	for (OutputSection *osec : outputSections) {
314	minVA = std::min(a: minVA, b: osec->addr);
315	maxVA = std::max(a: maxVA, b: osec->addr + osec->size);
316	}
317	// If the max VA difference is under 2^31, GOT-generating relocations with a 32-bit range cannot overflow.
318	if (isUInt<`31`>(x: maxVA - minVA))
319	return false;
320
321	SmallVector<InputSection *, `0`> storage;
322	bool changed = false;
323	for (OutputSection *osec : outputSections) {
324	if (!(osec->flags & SHF_EXECINSTR))
325	continue;
326	for (InputSection sec : getInputSections(os: osec, storage)) {
327	for (Relocation &rel : sec->relocs()) {
328	if (rel.expr != R_RELAX_GOT_PC)
329	continue;
330
331	uint64_t v = sec->getRelocTargetVA(File: sec->file, Type: rel.type, A: rel.addend,
332	P: sec->getOutputSection()->addr +
333	sec->outSecOff + rel.offset,
334	Sym: *rel.sym, Expr: rel.expr);
335	if (isInt<`32`>(x: v))
336	continue;
337	if (rel.sym->auxIdx == `0`) {
338	rel.sym->allocateAux();
339	addGotEntry(sym&: *rel.sym);
340	changed = true;
341	}
342	rel.expr = R_GOT_PC;
343	}
344	}
345	}
346	return changed;
347	}
348
349	RelExpr X86_64::getRelExpr(RelType type, const Symbol &s,
350	const uint8_t loc) const* {
351	switch (type) {
352	case R_X86_64_8:
353	case R_X86_64_16:
354	case R_X86_64_32:
355	case R_X86_64_32S:
356	case R_X86_64_64:
357	return R_ABS;
358	case R_X86_64_DTPOFF32:
359	case R_X86_64_DTPOFF64:
360	return R_DTPREL;
361	case R_X86_64_TPOFF32:
362	case R_X86_64_TPOFF64:
363	return R_TPREL;
364	case R_X86_64_TLSDESC_CALL:
365	return R_TLSDESC_CALL;
366	case R_X86_64_TLSLD:
367	return R_TLSLD_PC;
368	case R_X86_64_TLSGD:
369	return R_TLSGD_PC;
370	case R_X86_64_SIZE32:
371	case R_X86_64_SIZE64:
372	return R_SIZE;
373	case R_X86_64_PLT32:
374	return R_PLT_PC;
375	case R_X86_64_PC8:
376	case R_X86_64_PC16:
377	case R_X86_64_PC32:
378	case R_X86_64_PC64:
379	return R_PC;
380	case R_X86_64_GOT32:
381	case R_X86_64_GOT64:
382	return R_GOTPLT;
383	case R_X86_64_GOTPC32_TLSDESC:
384	return R_TLSDESC_PC;
385	case R_X86_64_GOTPCREL:
386	case R_X86_64_GOTPCRELX:
387	case R_X86_64_REX_GOTPCRELX:
388	case R_X86_64_GOTTPOFF:
389	return R_GOT_PC;
390	case R_X86_64_GOTOFF64:
391	return R_GOTPLTREL;
392	case R_X86_64_PLTOFF64:
393	return R_PLT_GOTPLT;
394	case R_X86_64_GOTPC32:
395	case R_X86_64_GOTPC64:
396	return R_GOTPLTONLY_PC;
397	case R_X86_64_NONE:
398	return R_NONE;
399	default:
400	error(msg: getErrorLocation(loc) + "unknown relocation (" + Twine (type) +
401	") against symbol " + toString(s));
402	return R_NONE;
403	}
404	}
405
406	void X86_64::writeGotPltHeader(uint8_t buf) const* {
407	// The first entry holds the link-time address of _DYNAMIC. It is documented
408	// in the psABI and glibc before Aug 2021 used the entry to compute run-time
409	// load address of the shared object (note that this is relevant for linking
410	// ld.so, not any other program).
411	write64le(P: buf, V: mainPart->dynamic ->getVA());
412	}
413
414	void X86_64::writeGotPlt(uint8_t buf, const* Symbol &s) const {
415	// See comments in X86::writeGotPlt.
416	write64le(P: buf, V: s.getPltVA() + `6`);
417	}
418
419	void X86_64::writeIgotPlt(uint8_t buf, const* Symbol &s) const {
420	// An x86 entry is the address of the ifunc resolver function (for -z rel).
421	if (config ->writeAddends)
422	write64le(P: buf, V: s.getVA());
423	}
424
425	void X86_64::writePltHeader(uint8_t buf) const* {
426	const uint8_t pltData[] = {
427	`0xff`, `0x35`, `0`, `0`, `0`, `0`, // pushq GOTPLT+8(%rip)
428	`0xff`, `0x25`, `0`, `0`, `0`, `0`, // jmp GOTPLT+16(%rip)*
429	`0x0f`, `0x1f`, `0x40`, `0x00`, // nop
430	};
431	memcpy(dest: buf, src: pltData, n: sizeof(pltData));
432	uint64_t gotPlt = in.gotPlt ->getVA();
433	uint64_t plt = in.ibtPlt ? in.ibtPlt ->getVA() : in.plt ->getVA();
434	write32le(P: buf + `2`, V: gotPlt - plt + `2`); // GOTPLT+8
435	write32le(P: buf + `8`, V: gotPlt - plt + `4`); // GOTPLT+16
436	}
437
438	void X86_64::writePlt(uint8_t buf, const* Symbol &sym,
439	uint64_t pltEntryAddr) const {
440	const uint8_t inst[] = {
441	`0xff`, `0x25`, `0`, `0`, `0`, `0`, // jmpq got(%rip)*
442	`0x68`, `0`, `0`, `0`, `0`, // pushq <relocation index>
443	`0xe9`, `0`, `0`, `0`, `0`, // jmpq plt[0]
444	};
445	memcpy(dest: buf, src: inst, n: sizeof(inst));
446
447	write32le(P: buf + `2`, V: sym.getGotPltVA() - pltEntryAddr - `6`);
448	write32le(P: buf + `7`, V: sym.getPltIdx());
449	write32le(P: buf + `12`, V: in.plt ->getVA() - pltEntryAddr - `16`);
450	}
451
452	RelType X86_64::getDynRel(RelType type) const {
453	if (type == R_X86_64_64 \|\| type == R_X86_64_PC64 \|\| type == R_X86_64_SIZE32 \|\|
454	type == R_X86_64_SIZE64)
455	return type;
456	return R_X86_64_NONE;
457	}
458
459	static void relaxTlsGdToLe(uint8_t loc, const* Relocation &rel, uint64_t val) {
460	if (rel.type == R_X86_64_TLSGD) {
461	// Convert
462	// .byte 0x66
463	// leaq x@tlsgd(%rip), %rdi
464	// .word 0x6666
465	// rex64
466	// call __tls_get_addr@plt
467	// to the following two instructions.
468	const uint8_t inst[] = {
469	`0x64`, `0x48`, `0x8b`, `0x04`, `0x25`, `0x00`, `0x00`,
470	`0x00`, `0x00`, // mov %fs:0x0,%rax
471	`0x48`, `0x8d`, `0x80`, `0`, `0`, `0`, `0`, // lea x@tpoff,%rax
472	};
473	memcpy(dest: loc - `4`, src: inst, n: sizeof(inst));
474
475	// The original code used a pc relative relocation and so we have to
476	// compensate for the -4 in had in the addend.
477	write32le(P: loc + `8`, V: val + `4`);
478	} else if (rel.type == R_X86_64_GOTPC32_TLSDESC) {
479	// Convert leaq x@tlsdesc(%rip), %REG to movq $x@tpoff, %REG.
480	if ((loc[-`3`] & `0xfb`) != `0x48` \|\| loc[-`2`] != `0x8d` \|\|
481	(loc[-`1`] & `0xc7`) != `0x05`) {
482	errorOrWarn(msg: getErrorLocation(loc: loc - `3`) +
483	"R_X86_64_GOTPC32_TLSDESC must be used "
484	"in leaq x@tlsdesc(%rip), %REG");
485	return;
486	}
487	loc[-`3`] = `0x48` \| ((loc[-`3`] >> `2`) & `1`);
488	loc[-`2`] = `0xc7`;
489	loc[-`1`] = `0xc0` \| ((loc[-`1`] >> `3`) & `7`);
490	write32le(P: loc, V: val + `4`);
491	} else {
492	// Convert call x@tlsdesc(%REG) to xchg ax, ax.*
493	assert(rel.type == R_X86_64_TLSDESC_CALL);
494	loc[`0`] = `0x66`;
495	loc[`1`] = `0x90`;
496	}
497	}
498
499	static void relaxTlsGdToIe(uint8_t loc, const* Relocation &rel, uint64_t val) {
500	if (rel.type == R_X86_64_TLSGD) {
501	// Convert
502	// .byte 0x66
503	// leaq x@tlsgd(%rip), %rdi
504	// .word 0x6666
505	// rex64
506	// call __tls_get_addr@plt
507	// to the following two instructions.
508	const uint8_t inst[] = {
509	`0x64`, `0x48`, `0x8b`, `0x04`, `0x25`, `0x00`, `0x00`,
510	`0x00`, `0x00`, // mov %fs:0x0,%rax
511	`0x48`, `0x03`, `0x05`, `0`, `0`, `0`, `0`, // addq x@gottpoff(%rip),%rax
512	};
513	memcpy(dest: loc - `4`, src: inst, n: sizeof(inst));
514
515	// Both code sequences are PC relatives, but since we are moving the
516	// constant forward by 8 bytes we have to subtract the value by 8.
517	write32le(P: loc + `8`, V: val - `8`);
518	} else if (rel.type == R_X86_64_GOTPC32_TLSDESC) {
519	// Convert leaq x@tlsdesc(%rip), %REG to movq x@gottpoff(%rip), %REG.
520	assert(rel.type == R_X86_64_GOTPC32_TLSDESC);
521	if ((loc[-`3`] & `0xfb`) != `0x48` \|\| loc[-`2`] != `0x8d` \|\|
522	(loc[-`1`] & `0xc7`) != `0x05`) {
523	errorOrWarn(msg: getErrorLocation(loc: loc - `3`) +
524	"R_X86_64_GOTPC32_TLSDESC must be used "
525	"in leaq x@tlsdesc(%rip), %REG");
526	return;
527	}
528	loc[-`2`] = `0x8b`;
529	write32le(P: loc, V: val);
530	} else {
531	// Convert call x@tlsdesc(%rax) to xchg ax, ax.*
532	assert(rel.type == R_X86_64_TLSDESC_CALL);
533	loc[`0`] = `0x66`;
534	loc[`1`] = `0x90`;
535	}
536	}
537
538	// In some conditions, R_X86_64_GOTTPOFF relocation can be optimized to
539	// R_X86_64_TPOFF32 so that it does not use GOT.
540	static void relaxTlsIeToLe(uint8_t loc, const* Relocation &, uint64_t val) {
541	uint8_t *inst = loc - `3`;
542	uint8_t reg = loc[-`1`] >> `3`;
543	uint8_t *regSlot = loc - `1`;
544
545	// Note that ADD with RSP or R12 is converted to ADD instead of LEA
546	// because LEA with these registers needs 4 bytes to encode and thus
547	// wouldn't fit the space.
548
549	if (memcmp(s1: inst, s2: "\x48\x03\x25", n: `3`) == `0`) {
550	// "addq foo@gottpoff(%rip),%rsp" -> "addq $foo,%rsp"
551	memcpy(dest: inst, src: "\x48\x81\xc4", n: `3`);
552	} else if (memcmp(s1: inst, s2: "\x4c\x03\x25", n: `3`) == `0`) {
553	// "addq foo@gottpoff(%rip),%r12" -> "addq $foo,%r12"
554	memcpy(dest: inst, src: "\x49\x81\xc4", n: `3`);
555	} else if (memcmp(s1: inst, s2: "\x4c\x03", n: `2`) == `0`) {
556	// "addq foo@gottpoff(%rip),%r[8-15]" -> "leaq foo(%r[8-15]),%r[8-15]"
557	memcpy(dest: inst, src: "\x4d\x8d", n: `2`);
558	*regSlot = `0x80` \| (reg << `3`) \| reg;
559	} else if (memcmp(s1: inst, s2: "\x48\x03", n: `2`) == `0`) {
560	// "addq foo@gottpoff(%rip),%reg -> "leaq foo(%reg),%reg"
561	memcpy(dest: inst, src: "\x48\x8d", n: `2`);
562	*regSlot = `0x80` \| (reg << `3`) \| reg;
563	} else if (memcmp(s1: inst, s2: "\x4c\x8b", n: `2`) == `0`) {
564	// "movq foo@gottpoff(%rip),%r[8-15]" -> "movq $foo,%r[8-15]"
565	memcpy(dest: inst, src: "\x49\xc7", n: `2`);
566	*regSlot = `0xc0` \| reg;
567	} else if (memcmp(s1: inst, s2: "\x48\x8b", n: `2`) == `0`) {
568	// "movq foo@gottpoff(%rip),%reg" -> "movq $foo,%reg"
569	memcpy(dest: inst, src: "\x48\xc7", n: `2`);
570	*regSlot = `0xc0` \| reg;
571	} else {
572	error(msg: getErrorLocation(loc: loc - `3`) +
573	"R_X86_64_GOTTPOFF must be used in MOVQ or ADDQ instructions only");
574	}
575
576	// The original code used a PC relative relocation.
577	// Need to compensate for the -4 it had in the addend.
578	write32le(P: loc, V: val + `4`);
579	}
580
581	static void relaxTlsLdToLe(uint8_t loc, const* Relocation &rel, uint64_t val) {
582	const uint8_t inst[] = {
583	`0x66`, `0x66`, // .word 0x6666
584	`0x66`, // .byte 0x66
585	`0x64`, `0x48`, `0x8b`, `0x04`, `0x25`, `0x00`, `0x00`, `0x00`, `0x00`, // mov %fs:0,%rax
586	};
587
588	if (loc[`4`] == `0xe8`) {
589	// Convert
590	// leaq bar@tlsld(%rip), %rdi # 48 8d 3d <Loc>
591	// callq __tls_get_addr@PLT # e8 <disp32>
592	// leaq bar@dtpoff(%rax), %rcx
593	// to
594	// .word 0x6666
595	// .byte 0x66
596	// mov %fs:0,%rax
597	// leaq bar@tpoff(%rax), %rcx
598	memcpy(dest: loc - `3`, src: inst, n: sizeof(inst));
599	return;
600	}
601
602	if (loc[`4`] == `0xff` && loc[`5`] == `0x15`) {
603	// Convert
604	// leaq x@tlsld(%rip),%rdi # 48 8d 3d <Loc>
605	// call __tls_get_addr@GOTPCREL(%rip) # ff 15 <disp32>*
606	// to
607	// .long 0x66666666
608	// movq %fs:0,%rax
609	// See "Table 11.9: LD -> LE Code Transition (LP64)" in
610	// https://raw.githubusercontent.com/wiki/hjl-tools/x86-psABI/x86-64-psABI-1.0.pdf
611	loc[-`3`] = `0x66`;
612	memcpy(dest: loc - `2`, src: inst, n: sizeof(inst));
613	return;
614	}
615
616	error(msg: getErrorLocation(loc: loc - `3`) +
617	"expected R_X86_64_PLT32 or R_X86_64_GOTPCRELX after R_X86_64_TLSLD");
618	}
619
620	// A JumpInstrMod at a specific offset indicates that the jump instruction
621	// opcode at that offset must be modified. This is specifically used to relax
622	// jump instructions with basic block sections. This function looks at the
623	// JumpMod and effects the change.
624	void X86_64::applyJumpInstrMod(uint8_t *loc, JumpModType type,
625	unsigned size) const {
626	switch (type) {
627	case J_JMP_32:
628	if (size == `4`)
629	*loc = `0xe9`;
630	else
631	*loc = `0xeb`;
632	break;
633	case J_JE_32:
634	if (size == `4`) {
635	loc[-`1`] = `0x0f`;
636	*loc = `0x84`;
637	} else
638	*loc = `0x74`;
639	break;
640	case J_JNE_32:
641	if (size == `4`) {
642	loc[-`1`] = `0x0f`;
643	*loc = `0x85`;
644	} else
645	*loc = `0x75`;
646	break;
647	case J_JG_32:
648	if (size == `4`) {
649	loc[-`1`] = `0x0f`;
650	*loc = `0x8f`;
651	} else
652	*loc = `0x7f`;
653	break;
654	case J_JGE_32:
655	if (size == `4`) {
656	loc[-`1`] = `0x0f`;
657	*loc = `0x8d`;
658	} else
659	*loc = `0x7d`;
660	break;
661	case J_JB_32:
662	if (size == `4`) {
663	loc[-`1`] = `0x0f`;
664	*loc = `0x82`;
665	} else
666	*loc = `0x72`;
667	break;
668	case J_JBE_32:
669	if (size == `4`) {
670	loc[-`1`] = `0x0f`;
671	*loc = `0x86`;
672	} else
673	*loc = `0x76`;
674	break;
675	case J_JL_32:
676	if (size == `4`) {
677	loc[-`1`] = `0x0f`;
678	*loc = `0x8c`;
679	} else
680	*loc = `0x7c`;
681	break;
682	case J_JLE_32:
683	if (size == `4`) {
684	loc[-`1`] = `0x0f`;
685	*loc = `0x8e`;
686	} else
687	*loc = `0x7e`;
688	break;
689	case J_JA_32:
690	if (size == `4`) {
691	loc[-`1`] = `0x0f`;
692	*loc = `0x87`;
693	} else
694	*loc = `0x77`;
695	break;
696	case J_JAE_32:
697	if (size == `4`) {
698	loc[-`1`] = `0x0f`;
699	*loc = `0x83`;
700	} else
701	*loc = `0x73`;
702	break;
703	case J_UNKNOWN:
704	llvm_unreachable("Unknown Jump Relocation");
705	}
706	}
707
708	int64_t X86_64::getImplicitAddend(const uint8_t buf, RelType type) const* {
709	switch (type) {
710	case R_X86_64_8:
711	case R_X86_64_PC8:
712	return SignExtend64<`8`>(x: *buf);
713	case R_X86_64_16:
714	case R_X86_64_PC16:
715	return SignExtend64<`16`>(x: read16le(P: buf));
716	case R_X86_64_32:
717	case R_X86_64_32S:
718	case R_X86_64_TPOFF32:
719	case R_X86_64_GOT32:
720	case R_X86_64_GOTPC32:
721	case R_X86_64_GOTPC32_TLSDESC:
722	case R_X86_64_GOTPCREL:
723	case R_X86_64_GOTPCRELX:
724	case R_X86_64_REX_GOTPCRELX:
725	case R_X86_64_PC32:
726	case R_X86_64_GOTTPOFF:
727	case R_X86_64_PLT32:
728	case R_X86_64_TLSGD:
729	case R_X86_64_TLSLD:
730	case R_X86_64_DTPOFF32:
731	case R_X86_64_SIZE32:
732	return SignExtend64<`32`>(x: read32le(P: buf));
733	case R_X86_64_64:
734	case R_X86_64_TPOFF64:
735	case R_X86_64_DTPOFF64:
736	case R_X86_64_DTPMOD64:
737	case R_X86_64_PC64:
738	case R_X86_64_SIZE64:
739	case R_X86_64_GLOB_DAT:
740	case R_X86_64_GOT64:
741	case R_X86_64_GOTOFF64:
742	case R_X86_64_GOTPC64:
743	case R_X86_64_PLTOFF64:
744	case R_X86_64_IRELATIVE:
745	case R_X86_64_RELATIVE:
746	return read64le(P: buf);
747	case R_X86_64_TLSDESC:
748	return read64le(P: buf + `8`);
749	case R_X86_64_JUMP_SLOT:
750	case R_X86_64_NONE:
751	// These relocations are defined as not having an implicit addend.
752	return `0`;
753	default:
754	internalLinkerError(loc: getErrorLocation(loc: buf),
755	msg: "cannot read addend for relocation " + toString(type));
756	return `0`;
757	}
758	}
759
760	static void relaxGot(uint8_t loc, const* Relocation &rel, uint64_t val);
761
762	void X86_64::relocate(uint8_t loc, const* Relocation &rel, uint64_t val) const {
763	switch (rel.type) {
764	case R_X86_64_8:
765	checkIntUInt(loc, v: val, n: `8`, rel);
766	*loc = val;
767	break;
768	case R_X86_64_PC8:
769	checkInt(loc, v: val, n: `8`, rel);
770	*loc = val;
771	break;
772	case R_X86_64_16:
773	checkIntUInt(loc, v: val, n: `16`, rel);
774	write16le(P: loc, V: val);
775	break;
776	case R_X86_64_PC16:
777	checkInt(loc, v: val, n: `16`, rel);
778	write16le(P: loc, V: val);
779	break;
780	case R_X86_64_32:
781	checkUInt(loc, v: val, n: `32`, rel);
782	write32le(P: loc, V: val);
783	break;
784	case R_X86_64_32S:
785	case R_X86_64_GOT32:
786	case R_X86_64_GOTPC32:
787	case R_X86_64_GOTPCREL:
788	case R_X86_64_PC32:
789	case R_X86_64_PLT32:
790	case R_X86_64_DTPOFF32:
791	case R_X86_64_SIZE32:
792	checkInt(loc, v: val, n: `32`, rel);
793	write32le(P: loc, V: val);
794	break;
795	case R_X86_64_64:
796	case R_X86_64_TPOFF64:
797	case R_X86_64_DTPOFF64:
798	case R_X86_64_PC64:
799	case R_X86_64_SIZE64:
800	case R_X86_64_GOT64:
801	case R_X86_64_GOTOFF64:
802	case R_X86_64_GOTPC64:
803	case R_X86_64_PLTOFF64:
804	write64le(P: loc, V: val);
805	break;
806	case R_X86_64_GOTPCRELX:
807	case R_X86_64_REX_GOTPCRELX:
808	if (rel.expr != R_GOT_PC) {
809	relaxGot(loc, rel, val);
810	} else {
811	checkInt(loc, v: val, n: `32`, rel);
812	write32le(P: loc, V: val);
813	}
814	break;
815	case R_X86_64_GOTPC32_TLSDESC:
816	case R_X86_64_TLSDESC_CALL:
817	case R_X86_64_TLSGD:
818	if (rel.expr == R_RELAX_TLS_GD_TO_LE) {
819	relaxTlsGdToLe(loc, rel, val);
820	} else if (rel.expr == R_RELAX_TLS_GD_TO_IE) {
821	relaxTlsGdToIe(loc, rel, val);
822	} else {
823	checkInt(loc, v: val, n: `32`, rel);
824	write32le(P: loc, V: val);
825	}
826	break;
827	case R_X86_64_TLSLD:
828	if (rel.expr == R_RELAX_TLS_LD_TO_LE) {
829	relaxTlsLdToLe(loc, rel, val);
830	} else {
831	checkInt(loc, v: val, n: `32`, rel);
832	write32le(P: loc, V: val);
833	}
834	break;
835	case R_X86_64_GOTTPOFF:
836	if (rel.expr == R_RELAX_TLS_IE_TO_LE) {
837	relaxTlsIeToLe(loc, rel, val);
838	} else {
839	checkInt(loc, v: val, n: `32`, rel);
840	write32le(P: loc, V: val);
841	}
842	break;
843	case R_X86_64_TPOFF32:
844	checkInt(loc, v: val, n: `32`, rel);
845	write32le(P: loc, V: val);
846	break;
847
848	case R_X86_64_TLSDESC:
849	// The addend is stored in the second 64-bit word.
850	write64le(P: loc + `8`, V: val);
851	break;
852	default:
853	llvm_unreachable("unknown relocation");
854	}
855	}
856
857	RelExpr X86_64::adjustGotPcExpr(RelType type, int64_t addend,
858	const uint8_t loc) const* {
859	// Only R_X86_64_[REX_]GOTPCRELX can be relaxed. GNU as may emit GOTPCRELX
860	// with addend != -4. Such an instruction does not load the full GOT entry, so
861	// we cannot relax the relocation. E.g. movl x@GOTPCREL+4(%rip), %rax
862	// (addend=0) loads the high 32 bits of the GOT entry.
863	if (!config ->relax \|\| addend != -`4` \|\|
864	(type != R_X86_64_GOTPCRELX && type != R_X86_64_REX_GOTPCRELX))
865	return R_GOT_PC;
866	const uint8_t op = loc[-`2`];
867	const uint8_t modRm = loc[-`1`];
868
869	// FIXME: When PIC is disabled and foo is defined locally in the
870	// lower 32 bit address space, memory operand in mov can be converted into
871	// immediate operand. Otherwise, mov must be changed to lea. We support only
872	// latter relaxation at this moment.
873	if (op == `0x8b`)
874	return R_RELAX_GOT_PC;
875
876	// Relax call and jmp.
877	if (op == `0xff` && (modRm == `0x15` \|\| modRm == `0x25`))
878	return R_RELAX_GOT_PC;
879
880	// We don't support test/binop instructions without a REX prefix.
881	if (type == R_X86_64_GOTPCRELX)
882	return R_GOT_PC;
883
884	// Relaxation of test, adc, add, and, cmp, or, sbb, sub, xor.
885	// If PIC then no relaxation is available.
886	return config ->isPic ? R_GOT_PC : R_RELAX_GOT_PC_NOPIC;
887	}
888
889	// A subset of relaxations can only be applied for no-PIC. This method
890	// handles such relaxations. Instructions encoding information was taken from:
891	// "Intel 64 and IA-32 Architectures Software Developer's Manual V2"
892	// (http://www.intel.com/content/dam/www/public/us/en/documents/manuals/
893	// 64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf)
894	static void relaxGotNoPic(uint8_t *loc, uint64_t val, uint8_t op,
895	uint8_t modRm) {
896	const uint8_t rex = loc[-`3`];
897	// Convert "test %reg, foo@GOTPCREL(%rip)" to "test $foo, %reg".
898	if (op == `0x85`) {
899	// See "TEST-Logical Compare" (4-428 Vol. 2B),
900	// TEST r/m64, r64 uses "full" ModR / M byte (no opcode extension).
901
902	// ModR/M byte has form XX YYY ZZZ, where
903	// YYY is MODRM.reg(register 2), ZZZ is MODRM.rm(register 1).
904	// XX has different meanings:
905	// 00: The operand's memory address is in reg1.
906	// 01: The operand's memory address is reg1 + a byte-sized displacement.
907	// 10: The operand's memory address is reg1 + a word-sized displacement.
908	// 11: The operand is reg1 itself.
909	// If an instruction requires only one operand, the unused reg2 field
910	// holds extra opcode bits rather than a register code
911	// 0xC0 == 11 000 000 binary.
912	// 0x38 == 00 111 000 binary.
913	// We transfer reg2 to reg1 here as operand.
914	// See "2.1.3 ModR/M and SIB Bytes" (Vol. 2A 2-3).
915	loc[-`1`] = `0xc0` \| (modRm & `0x38`) >> `3`; // ModR/M byte.
916
917	// Change opcode from TEST r/m64, r64 to TEST r/m64, imm32
918	// See "TEST-Logical Compare" (4-428 Vol. 2B).
919	loc[-`2`] = `0xf7`;
920
921	// Move R bit to the B bit in REX byte.
922	// REX byte is encoded as 0100WRXB, where
923	// 0100 is 4bit fixed pattern.
924	// REX.W When 1, a 64-bit operand size is used. Otherwise, when 0, the
925	// default operand size is used (which is 32-bit for most but not all
926	// instructions).
927	// REX.R This 1-bit value is an extension to the MODRM.reg field.
928	// REX.X This 1-bit value is an extension to the SIB.index field.
929	// REX.B This 1-bit value is an extension to the MODRM.rm field or the
930	// SIB.base field.
931	// See "2.2.1.2 More on REX Prefix Fields " (2-8 Vol. 2A).
932	loc[-`3`] = (rex & ~`0x4`) \| (rex & `0x4`) >> `2`;
933	write32le(P: loc, V: val);
934	return;
935	}
936
937	// If we are here then we need to relax the adc, add, and, cmp, or, sbb, sub
938	// or xor operations.
939
940	// Convert "binop foo@GOTPCREL(%rip), %reg" to "binop $foo, %reg".
941	// Logic is close to one for test instruction above, but we also
942	// write opcode extension here, see below for details.
943	loc[-`1`] = `0xc0` \| (modRm & `0x38`) >> `3` \| (op & `0x3c`); // ModR/M byte.
944
945	// Primary opcode is 0x81, opcode extension is one of:
946	// 000b = ADD, 001b is OR, 010b is ADC, 011b is SBB,
947	// 100b is AND, 101b is SUB, 110b is XOR, 111b is CMP.
948	// This value was wrote to MODRM.reg in a line above.
949	// See "3.2 INSTRUCTIONS (A-M)" (Vol. 2A 3-15),
950	// "INSTRUCTION SET REFERENCE, N-Z" (Vol. 2B 4-1) for
951	// descriptions about each operation.
952	loc[-`2`] = `0x81`;
953	loc[-`3`] = (rex & ~`0x4`) \| (rex & `0x4`) >> `2`;
954	write32le(P: loc, V: val);
955	}
956
957	static void relaxGot(uint8_t loc, const* Relocation &rel, uint64_t val) {
958	assert(isInt<`32`>(val) &&
959	"GOTPCRELX should not have been relaxed if it overflows");
960	const uint8_t op = loc[-`2`];
961	const uint8_t modRm = loc[-`1`];
962
963	// Convert "mov foo@GOTPCREL(%rip),%reg" to "lea foo(%rip),%reg".
964	if (op == `0x8b`) {
965	loc[-`2`] = `0x8d`;
966	write32le(P: loc, V: val);
967	return;
968	}
969
970	if (op != `0xff`) {
971	// We are relaxing a rip relative to an absolute, so compensate
972	// for the old -4 addend.
973	assert(!config ->isPic);
974	relaxGotNoPic(loc, val: val + `4`, op, modRm);
975	return;
976	}
977
978	// Convert call/jmp instructions.
979	if (modRm == `0x15`) {
980	// ABI says we can convert "call foo@GOTPCREL(%rip)" to "nop; call foo".*
981	// Instead we convert to "addr32 call foo" where addr32 is an instruction
982	// prefix. That makes result expression to be a single instruction.
983	loc[-`2`] = `0x67`; // addr32 prefix
984	loc[-`1`] = `0xe8`; // call
985	write32le(P: loc, V: val);
986	return;
987	}
988
989	// Convert "jmp foo@GOTPCREL(%rip)" to "jmp foo; nop".*
990	// jmp doesn't return, so it is fine to use nop here, it is just a stub.
991	assert(modRm == `0x25`);
992	loc[-`2`] = `0xe9`; // jmp
993	loc[`3`] = `0x90`; // nop
994	write32le(P: loc - `1`, V: val + `1`);
995	}
996
997	// A split-stack prologue starts by checking the amount of stack remaining
998	// in one of two ways:
999	// A) Comparing of the stack pointer to a field in the tcb.
1000	// B) Or a load of a stack pointer offset with an lea to r10 or r11.
1001	bool X86_64::adjustPrologueForCrossSplitStack(uint8_t loc, uint8_t end,
1002	uint8_t stOther) const {
1003	if (!config ->is64) {
1004	error(msg: "target doesn't support split stacks");
1005	return false;
1006	}
1007
1008	if (loc + `8` >= end)
1009	return false;
1010
1011	// Replace "cmp %fs:0x70,%rsp" and subsequent branch
1012	// with "stc, nopl 0x0(%rax,%rax,1)"
1013	if (memcmp(s1: loc, s2: "\x64\x48\x3b\x24\x25", n: `5`) == `0`) {
1014	memcpy(dest: loc, src: "\xf9\x0f\x1f\x84\x00\x00\x00\x00", n: `8`);
1015	return true;
1016	}
1017
1018	// Adjust "lea X(%rsp),%rYY" to lea "(X - 0x4000)(%rsp),%rYY" where rYY could
1019	// be r10 or r11. The lea instruction feeds a subsequent compare which checks
1020	// if there is X available stack space. Making X larger effectively reserves
1021	// that much additional space. The stack grows downward so subtract the value.
1022	if (memcmp(s1: loc, s2: "\x4c\x8d\x94\x24", n: `4`) == `0` \|\|
1023	memcmp(s1: loc, s2: "\x4c\x8d\x9c\x24", n: `4`) == `0`) {
1024	// The offset bytes are encoded four bytes after the start of the
1025	// instruction.
1026	write32le(P: loc + `4`, V: read32le(P: loc + `4`) - `0x4000`);
1027	return true;
1028	}
1029	return false;
1030	}
1031
1032	void X86_64::relocateAlloc(InputSectionBase &sec, uint8_t buf) const* {
1033	uint64_t secAddr = sec.getOutputSection()->addr;
1034	if (auto *s = dyn_cast<InputSection>(Val: &sec))
1035	secAddr += s->outSecOff;
1036	else if (auto *ehIn = dyn_cast<EhInputSection>(Val: &sec))
1037	secAddr += ehIn->getParent()->outSecOff;
1038	for (const Relocation &rel : sec.relocs()) {
1039	if (rel.expr == R_NONE) // See deleteFallThruJmpInsn
1040	continue;
1041	uint8_t *loc = buf + rel.offset;
1042	const uint64_t val =
1043	sec.getRelocTargetVA(File: sec.file, Type: rel.type, A: rel.addend,
1044	P: secAddr + rel.offset, Sym: *rel.sym, Expr: rel.expr);
1045	relocate(loc, rel, val);
1046	}
1047	if (sec.jumpInstrMod) {
1048	applyJumpInstrMod(loc: buf + sec.jumpInstrMod->offset,
1049	type: sec.jumpInstrMod->original, size: sec.jumpInstrMod->size);
1050	}
1051	}
1052
1053	// If Intel Indirect Branch Tracking is enabled, we have to emit special PLT
1054	// entries containing endbr64 instructions. A PLT entry will be split into two
1055	// parts, one in .plt.sec (writePlt), and the other in .plt (writeIBTPlt).
1056	namespace {
1057	class IntelIBT : public X86_64 {
1058	public:
1059	IntelIBT();
1060	void writeGotPlt(uint8_t buf, const* Symbol &s) const override;
1061	void writePlt(uint8_t buf, const* Symbol &sym,
1062	uint64_t pltEntryAddr) const override;
1063	void writeIBTPlt(uint8_t buf, size_t numEntries) const* override;
1064
1065	static const unsigned IBTPltHeaderSize = `16`;
1066	};
1067	} // namespace
1068
1069	IntelIBT::IntelIBT() { pltHeaderSize = `0`; }
1070
1071	void IntelIBT::writeGotPlt(uint8_t buf, const* Symbol &s) const {
1072	uint64_t va =
1073	in.ibtPlt ->getVA() + IBTPltHeaderSize + s.getPltIdx() * pltEntrySize;
1074	write64le(P: buf, V: va);
1075	}
1076
1077	void IntelIBT::writePlt(uint8_t buf, const* Symbol &sym,
1078	uint64_t pltEntryAddr) const {
1079	const uint8_t Inst[] = {
1080	`0xf3`, `0x0f`, `0x1e`, `0xfa`, // endbr64
1081	`0xff`, `0x25`, `0`, `0`, `0`, `0`, // jmpq got(%rip)*
1082	`0x66`, `0x0f`, `0x1f`, `0x44`, `0`, `0`, // nop
1083	};
1084	memcpy(dest: buf, src: Inst, n: sizeof(Inst));
1085	write32le(P: buf + `6`, V: sym.getGotPltVA() - pltEntryAddr - `10`);
1086	}
1087
1088	void IntelIBT::writeIBTPlt(uint8_t buf, size_t numEntries) const* {
1089	writePltHeader(buf);
1090	buf += IBTPltHeaderSize;
1091
1092	const uint8_t inst[] = {
1093	`0xf3`, `0x0f`, `0x1e`, `0xfa`, // endbr64
1094	`0x68`, `0`, `0`, `0`, `0`, // pushq <relocation index>
1095	`0xe9`, `0`, `0`, `0`, `0`, // jmpq plt[0]
1096	`0x66`, `0x90`, // nop
1097	};
1098
1099	for (size_t i = `0`; i < numEntries; ++i) {
1100	memcpy(dest: buf, src: inst, n: sizeof(inst));
1101	write32le(P: buf + `5`, V: i);
1102	write32le(P: buf + `10`, V: -pltHeaderSize - sizeof(inst) * i - `30`);
1103	buf += sizeof(inst);
1104	}
1105	}
1106
// These nonstandard PLT entries are to mitigate the Spectre v2 security
// vulnerability. In order to mitigate Spectre v2, we want to avoid indirect
// branch instructions such as `jmp *GOTPLT(%rip)`. So, in the following PLT
// entries, we use a CALL followed by MOV and RET to do the same thing as an
// indirect jump. That instruction sequence is the so-called "retpoline".
//
// We have two types of retpoline PLTs as a size optimization. If `-z now`
// is specified, all dynamic symbols are resolved at load-time. Thus, when
// that option is given, we can omit the code for lazy symbol resolution.
1116	namespace {
1117	class Retpoline : public X86_64 {
1118	public:
1119	Retpoline();
1120	void writeGotPlt(uint8_t buf, const* Symbol &s) const override;
1121	void writePltHeader(uint8_t buf) const* override;
1122	void writePlt(uint8_t buf, const* Symbol &sym,
1123	uint64_t pltEntryAddr) const override;
1124	};
1125
1126	class RetpolineZNow : public X86_64 {
1127	public:
1128	RetpolineZNow();
1129	void writeGotPlt(uint8_t buf, const* Symbol &s) const override {}
1130	void writePltHeader(uint8_t buf) const* override;
1131	void writePlt(uint8_t buf, const* Symbol &sym,
1132	uint64_t pltEntryAddr) const override;
1133	};
1134	} // namespace
1135
1136	Retpoline::Retpoline() {
1137	pltHeaderSize = `48`;
1138	pltEntrySize = `32`;
1139	ipltEntrySize = `32`;
1140	}
1141
1142	void Retpoline::writeGotPlt(uint8_t buf, const* Symbol &s) const {
1143	write64le(P: buf, V: s.getPltVA() + `17`);
1144	}
1145
1146	void Retpoline::writePltHeader(uint8_t buf) const* {
1147	const uint8_t insn[] = {
1148	`0xff`, `0x35`, `0`, `0`, `0`, `0`, // 0: pushq GOTPLT+8(%rip)
1149	`0x4c`, `0x8b`, `0x1d`, `0`, `0`, `0`, `0`, // 6: mov GOTPLT+16(%rip), %r11
1150	`0xe8`, `0x0e`, `0x00`, `0x00`, `0x00`, // d: callq next
1151	`0xf3`, `0x90`, // 12: loop: pause
1152	`0x0f`, `0xae`, `0xe8`, // 14: lfence
1153	`0xeb`, `0xf9`, // 17: jmp loop
1154	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 19: int3; .align 16
1155	`0x4c`, `0x89`, `0x1c`, `0x24`, // 20: next: mov %r11, (%rsp)
1156	`0xc3`, // 24: ret
1157	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 25: int3; padding
1158	`0xcc`, `0xcc`, `0xcc`, `0xcc`, // 2c: int3; padding
1159	};
1160	memcpy(dest: buf, src: insn, n: sizeof(insn));
1161
1162	uint64_t gotPlt = in.gotPlt ->getVA();
1163	uint64_t plt = in.plt ->getVA();
1164	write32le(P: buf + `2`, V: gotPlt - plt - `6` + `8`);
1165	write32le(P: buf + `9`, V: gotPlt - plt - `13` + `16`);
1166	}
1167
1168	void Retpoline::writePlt(uint8_t buf, const* Symbol &sym,
1169	uint64_t pltEntryAddr) const {
1170	const uint8_t insn[] = {
1171	`0x4c`, `0x8b`, `0x1d`, `0`, `0`, `0`, `0`, // 0: mov foo@GOTPLT(%rip), %r11
1172	`0xe8`, `0`, `0`, `0`, `0`, // 7: callq plt+0x20
1173	`0xe9`, `0`, `0`, `0`, `0`, // c: jmp plt+0x12
1174	`0x68`, `0`, `0`, `0`, `0`, // 11: pushq <relocation index>
1175	`0xe9`, `0`, `0`, `0`, `0`, // 16: jmp plt+0
1176	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 1b: int3; padding
1177	};
1178	memcpy(dest: buf, src: insn, n: sizeof(insn));
1179
1180	uint64_t off = pltEntryAddr - in.plt ->getVA();
1181
1182	write32le(P: buf + `3`, V: sym.getGotPltVA() - pltEntryAddr - `7`);
1183	write32le(P: buf + `8`, V: -off - `12` + `32`);
1184	write32le(P: buf + `13`, V: -off - `17` + `18`);
1185	write32le(P: buf + `18`, V: sym.getPltIdx());
1186	write32le(P: buf + `23`, V: -off - `27`);
1187	}
1188
1189	RetpolineZNow::RetpolineZNow() {
1190	pltHeaderSize = `32`;
1191	pltEntrySize = `16`;
1192	ipltEntrySize = `16`;
1193	}
1194
1195	void RetpolineZNow::writePltHeader(uint8_t buf) const* {
1196	const uint8_t insn[] = {
1197	`0xe8`, `0x0b`, `0x00`, `0x00`, `0x00`, // 0: call next
1198	`0xf3`, `0x90`, // 5: loop: pause
1199	`0x0f`, `0xae`, `0xe8`, // 7: lfence
1200	`0xeb`, `0xf9`, // a: jmp loop
1201	`0xcc`, `0xcc`, `0xcc`, `0xcc`, // c: int3; .align 16
1202	`0x4c`, `0x89`, `0x1c`, `0x24`, // 10: next: mov %r11, (%rsp)
1203	`0xc3`, // 14: ret
1204	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 15: int3; padding
1205	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 1a: int3; padding
1206	`0xcc`, // 1f: int3; padding
1207	};
1208	memcpy(dest: buf, src: insn, n: sizeof(insn));
1209	}
1210
1211	void RetpolineZNow::writePlt(uint8_t buf, const* Symbol &sym,
1212	uint64_t pltEntryAddr) const {
1213	const uint8_t insn[] = {
1214	`0x4c`, `0x8b`, `0x1d`, `0`, `0`, `0`, `0`, // mov foo@GOTPLT(%rip), %r11
1215	`0xe9`, `0`, `0`, `0`, `0`, // jmp plt+0
1216	`0xcc`, `0xcc`, `0xcc`, `0xcc`, // int3; padding
1217	};
1218	memcpy(dest: buf, src: insn, n: sizeof(insn));
1219
1220	write32le(P: buf + `3`, V: sym.getGotPltVA() - pltEntryAddr - `7`);
1221	write32le(P: buf + `8`, V: in.plt ->getVA() - pltEntryAddr - `12`);
1222	}
1223
1224	static TargetInfo *getTargetInfo() {
1225	if (config ->zRetpolineplt) {
1226	if (config ->zNow) {
1227	static RetpolineZNow t;
1228	return &t;
1229	}
1230	static Retpoline t;
1231	return &t;
1232	}
1233
1234	if (config ->andFeatures & GNU_PROPERTY_X86_FEATURE_1_IBT) {
1235	static IntelIBT t;
1236	return &t;
1237	}
1238
1239	static X86_64 t;
1240	return &t;
1241	}
1242
1243	TargetInfo elf::getX86_64TargetInfo() { return* getTargetInfo(); }
1244

source code of lld/ELF/Arch/X86_64.cpp