1//===- X86_64.cpp ---------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "OutputSections.h"
10#include "Relocations.h"
11#include "Symbols.h"
12#include "SyntheticSections.h"
13#include "Target.h"
14#include "llvm/BinaryFormat/ELF.h"
15#include "llvm/Support/Endian.h"
16#include "llvm/Support/MathExtras.h"
17
18using namespace llvm;
19using namespace llvm::object;
20using namespace llvm::support::endian;
21using namespace llvm::ELF;
22using namespace lld;
23using namespace lld::elf;
24
25namespace {
26class X86_64 : public TargetInfo {
27public:
28 X86_64(Ctx &);
29 int getTlsGdRelaxSkip(RelType type) const override;
30 RelExpr getRelExpr(RelType type, const Symbol &s,
31 const uint8_t *loc) const override;
32 RelType getDynRel(RelType type) const override;
33 void writeGotPltHeader(uint8_t *buf) const override;
34 void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
35 void writeIgotPlt(uint8_t *buf, const Symbol &s) const override;
36 void writePltHeader(uint8_t *buf) const override;
37 void writePlt(uint8_t *buf, const Symbol &sym,
38 uint64_t pltEntryAddr) const override;
39 void relocate(uint8_t *loc, const Relocation &rel,
40 uint64_t val) const override;
41 int64_t getImplicitAddend(const uint8_t *buf, RelType type) const override;
42 void applyJumpInstrMod(uint8_t *loc, JumpModType type,
43 unsigned size) const override;
44 RelExpr adjustGotPcExpr(RelType type, int64_t addend,
45 const uint8_t *loc) const override;
46 void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override;
47 bool adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
48 uint8_t stOther) const override;
49 bool deleteFallThruJmpInsn(InputSection &is, InputFile *file,
50 InputSection *nextIS) const override;
51 bool relaxOnce(int pass) const override;
52
53private:
54 void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
55 void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
56 void relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
57 void relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
58};
59} // namespace
60
// This is a vector of NOP instructions of sizes from 1 to 9 bytes. The
// appropriately sized instructions are used to fill the gaps between sections
// which are executed during fall through.
64static const std::vector<std::vector<uint8_t>> nopInstructions = {
65 {0x90},
66 {0x66, 0x90},
67 {0x0f, 0x1f, 0x00},
68 {0x0f, 0x1f, 0x40, 0x00},
69 {0x0f, 0x1f, 0x44, 0x00, 0x00},
70 {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
71 {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
72 {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
73 {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}};
74
75X86_64::X86_64(Ctx &ctx) : TargetInfo(ctx) {
76 copyRel = R_X86_64_COPY;
77 gotRel = R_X86_64_GLOB_DAT;
78 pltRel = R_X86_64_JUMP_SLOT;
79 relativeRel = R_X86_64_RELATIVE;
80 iRelativeRel = R_X86_64_IRELATIVE;
81 symbolicRel = R_X86_64_64;
82 tlsDescRel = R_X86_64_TLSDESC;
83 tlsGotRel = R_X86_64_TPOFF64;
84 tlsModuleIndexRel = R_X86_64_DTPMOD64;
85 tlsOffsetRel = R_X86_64_DTPOFF64;
86 gotBaseSymInGotPlt = true;
87 gotEntrySize = 8;
88 pltHeaderSize = 16;
89 pltEntrySize = 16;
90 ipltEntrySize = 16;
91 trapInstr = {0xcc, 0xcc, 0xcc, 0xcc}; // 0xcc = INT3
92 nopInstrs = nopInstructions;
93
94 // Align to the large page size (known as a superpage or huge page).
95 // FreeBSD automatically promotes large, superpage-aligned allocations.
96 defaultImageBase = 0x200000;
97}
98
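// Relaxing a general-dynamic or local-dynamic TLS sequence rewrites both the
// leaq and the following __tls_get_addr call, so two relocations are consumed
// per sequence; TLSDESC relocations are rewritten one at a time.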
99int X86_64::getTlsGdRelaxSkip(RelType type) const {
100 // TLSDESC relocations are processed separately. See relaxTlsGdToLe below.
101 return type == R_X86_64_GOTPC32_TLSDESC ||
102 type == R_X86_64_CODE_4_GOTPC32_TLSDESC ||
103 type == R_X86_64_TLSDESC_CALL
104 ? 1
105 : 2;
106}
107
108// Opcodes for the different X86_64 jmp instructions.
109enum JmpInsnOpcode : uint32_t {
110 J_JMP_32,
111 J_JNE_32,
112 J_JE_32,
113 J_JG_32,
114 J_JGE_32,
115 J_JB_32,
116 J_JBE_32,
117 J_JL_32,
118 J_JLE_32,
119 J_JA_32,
120 J_JAE_32,
121 J_UNKNOWN,
122};
123
124// Given the first (optional) and second byte of the insn's opcode, this
125// returns the corresponding enum value.
126static JmpInsnOpcode getJmpInsnType(const uint8_t *first,
127 const uint8_t *second) {
128 if (*second == 0xe9)
129 return J_JMP_32;
130
131 if (first == nullptr)
132 return J_UNKNOWN;
133
134 if (*first == 0x0f) {
135 switch (*second) {
136 case 0x84:
137 return J_JE_32;
138 case 0x85:
139 return J_JNE_32;
140 case 0x8f:
141 return J_JG_32;
142 case 0x8d:
143 return J_JGE_32;
144 case 0x82:
145 return J_JB_32;
146 case 0x86:
147 return J_JBE_32;
148 case 0x8c:
149 return J_JL_32;
150 case 0x8e:
151 return J_JLE_32;
152 case 0x87:
153 return J_JA_32;
154 case 0x83:
155 return J_JAE_32;
156 }
157 }
158 return J_UNKNOWN;
159}
160
// Returns the index of the relocation in input section `is` at the given
// offset. Returns the size of the relocation vector if no such relocation is
// found.
163static unsigned getRelocationWithOffset(const InputSection &is,
164 uint64_t offset) {
165 unsigned size = is.relocs().size();
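  // Scan in reverse. The unsigned counter wraps around after reaching 0,
  // making the condition i + 1 > 0 false and terminating the loop.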
166 for (unsigned i = size - 1; i + 1 > 0; --i) {
167 if (is.relocs()[i].offset == offset && is.relocs()[i].expr != R_NONE)
168 return i;
169 }
170 return size;
171}
172
173// Returns true if R corresponds to a relocation used for a jump instruction.
174// TODO: Once special relocations for relaxable jump instructions are available,
175// this should be modified to use those relocations.
176static bool isRelocationForJmpInsn(Relocation &R) {
177 return R.type == R_X86_64_PLT32 || R.type == R_X86_64_PC32 ||
178 R.type == R_X86_64_PC8;
179}
180
181// Return true if Relocation R points to the first instruction in the
182// next section.
183// TODO: Delete this once psABI reserves a new relocation type for fall thru
184// jumps.
185static bool isFallThruRelocation(InputSection &is, InputFile *file,
186 InputSection *nextIS, Relocation &r) {
  if (!isRelocationForJmpInsn(r))
188 return false;
189
190 uint64_t addrLoc = is.getOutputSection()->addr + is.outSecOff + r.offset;
  uint64_t targetOffset = is.getRelocTargetVA(is.getCtx(), r, addrLoc);
192
193 // If this jmp is a fall thru, the target offset is the beginning of the
194 // next section.
195 uint64_t nextSectionOffset =
196 nextIS->getOutputSection()->addr + nextIS->outSecOff;
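  // addrLoc is the address of the 4-byte displacement field; the jump target
  // is relative to the end of the instruction, hence the +4.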
197 return (addrLoc + 4 + targetOffset) == nextSectionOffset;
198}
199
200// Return the jmp instruction opcode that is the inverse of the given
201// opcode. For example, JE inverted is JNE.
202static JmpInsnOpcode invertJmpOpcode(const JmpInsnOpcode opcode) {
203 switch (opcode) {
204 case J_JE_32:
205 return J_JNE_32;
206 case J_JNE_32:
207 return J_JE_32;
208 case J_JG_32:
209 return J_JLE_32;
210 case J_JGE_32:
211 return J_JL_32;
212 case J_JB_32:
213 return J_JAE_32;
214 case J_JBE_32:
215 return J_JA_32;
216 case J_JL_32:
217 return J_JGE_32;
218 case J_JLE_32:
219 return J_JG_32;
220 case J_JA_32:
221 return J_JBE_32;
222 case J_JAE_32:
223 return J_JB_32;
224 default:
225 return J_UNKNOWN;
226 }
227}
228
// Deletes a direct jump instruction in an input section when it jumps to the
// following section, as it is not required. If there are two consecutive jump
// instructions, it checks if they can be flipped and one can be deleted.
232// For example:
233// .section .text
234// a.BB.foo:
235// ...
236// 10: jne aa.BB.foo
237// 16: jmp bar
238// aa.BB.foo:
239// ...
240//
241// can be converted to:
242// a.BB.foo:
243// ...
244// 10: je bar #jne flipped to je and the jmp is deleted.
245// aa.BB.foo:
246// ...
247bool X86_64::deleteFallThruJmpInsn(InputSection &is, InputFile *file,
248 InputSection *nextIS) const {
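  // A direct jmp rel32 is five bytes: the 0xe9 opcode followed by a 32-bit
  // displacement.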
249 const unsigned sizeOfDirectJmpInsn = 5;
250
251 if (nextIS == nullptr)
252 return false;
253
254 if (is.getSize() < sizeOfDirectJmpInsn)
255 return false;
256
257 // If this jmp insn can be removed, it is the last insn and the
258 // relocation is 4 bytes before the end.
  unsigned rIndex = getRelocationWithOffset(is, is.getSize() - 4);
260 if (rIndex == is.relocs().size())
261 return false;
262
263 Relocation &r = is.relocs()[rIndex];
264
265 // Check if the relocation corresponds to a direct jmp.
266 const uint8_t *secContents = is.content().data();
267 // If it is not a direct jmp instruction, there is nothing to do here.
268 if (*(secContents + r.offset - 1) != 0xe9)
269 return false;
270
271 if (isFallThruRelocation(is, file, nextIS, r)) {
272 // This is a fall thru and can be deleted.
273 r.expr = R_NONE;
274 r.offset = 0;
    is.drop_back(sizeOfDirectJmpInsn);
276 is.nopFiller = true;
277 return true;
278 }
279
280 // Now, check if flip and delete is possible.
281 const unsigned sizeOfJmpCCInsn = 6;
282 // To flip, there must be at least one JmpCC and one direct jmp.
283 if (is.getSize() < sizeOfDirectJmpInsn + sizeOfJmpCCInsn)
284 return false;
285
286 unsigned rbIndex =
      getRelocationWithOffset(is, (is.getSize() - sizeOfDirectJmpInsn - 4));
288 if (rbIndex == is.relocs().size())
289 return false;
290
291 Relocation &rB = is.relocs()[rbIndex];
292
293 const uint8_t *jmpInsnB = secContents + rB.offset - 1;
  JmpInsnOpcode jmpOpcodeB = getJmpInsnType(jmpInsnB - 1, jmpInsnB);
295 if (jmpOpcodeB == J_UNKNOWN)
296 return false;
297
  if (!isFallThruRelocation(is, file, nextIS, rB))
299 return false;
300
301 // jmpCC jumps to the fall thru block, the branch can be flipped and the
302 // jmp can be deleted.
  JmpInsnOpcode jInvert = invertJmpOpcode(jmpOpcodeB);
304 if (jInvert == J_UNKNOWN)
305 return false;
  is.jumpInstrMod = make<JumpInstrMod>();
  *is.jumpInstrMod = {rB.offset - 1, jInvert, 4};
  // Move R's values to rB except the offset.
  rB = {r.expr, r.type, rB.offset, r.addend, r.sym};
  // Cancel R.
  r.expr = R_NONE;
  r.offset = 0;
  is.drop_back(sizeOfDirectJmpInsn);
314 is.nopFiller = true;
315 return true;
316}
317
318bool X86_64::relaxOnce(int pass) const {
319 uint64_t minVA = UINT64_MAX, maxVA = 0;
320 for (OutputSection *osec : ctx.outputSections) {
321 minVA = std::min(a: minVA, b: osec->addr);
322 maxVA = std::max(a: maxVA, b: osec->addr + osec->size);
323 }
  // If the max VA is under 2^31, GOTPCRELX relocations cannot overflow. In
  // -pie/-shared, the condition can be relaxed to test the max VA difference as
  // there is no R_RELAX_GOT_PC_NOPIC.
327 if (isUInt<31>(x: maxVA) || (isUInt<31>(x: maxVA - minVA) && ctx.arg.isPic))
328 return false;
329
330 SmallVector<InputSection *, 0> storage;
331 bool changed = false;
332 for (OutputSection *osec : ctx.outputSections) {
333 if (!(osec->flags & SHF_EXECINSTR))
334 continue;
335 for (InputSection *sec : getInputSections(os: *osec, storage)) {
336 for (Relocation &rel : sec->relocs()) {
337 if (rel.expr != R_RELAX_GOT_PC && rel.expr != R_RELAX_GOT_PC_NOPIC)
338 continue;
339 assert(rel.addend == -4);
340
341 Relocation rel1 = rel;
342 rel1.addend = rel.expr == R_RELAX_GOT_PC_NOPIC ? 0 : -4;
343 uint64_t v = sec->getRelocTargetVA(ctx, r: rel1,
344 p: sec->getOutputSection()->addr +
345 sec->outSecOff + rel.offset);
346 if (isInt<32>(x: v))
347 continue;
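        // The relaxed, GOT-bypassing form would not fit in a 32-bit
        // displacement, so keep the GOT indirection and make sure the symbol
        // has a GOT entry.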
348 if (rel.sym->auxIdx == 0) {
349 rel.sym->allocateAux(ctx);
        addGotEntry(ctx, *rel.sym);
351 changed = true;
352 }
353 rel.expr = R_GOT_PC;
354 }
355 }
356 }
357 return changed;
358}
359
360RelExpr X86_64::getRelExpr(RelType type, const Symbol &s,
361 const uint8_t *loc) const {
362 switch (type) {
363 case R_X86_64_8:
364 case R_X86_64_16:
365 case R_X86_64_32:
366 case R_X86_64_32S:
367 case R_X86_64_64:
368 return R_ABS;
369 case R_X86_64_DTPOFF32:
370 case R_X86_64_DTPOFF64:
371 return R_DTPREL;
372 case R_X86_64_TPOFF32:
373 case R_X86_64_TPOFF64:
374 return R_TPREL;
375 case R_X86_64_TLSDESC_CALL:
376 return R_TLSDESC_CALL;
377 case R_X86_64_TLSLD:
378 return R_TLSLD_PC;
379 case R_X86_64_TLSGD:
380 return R_TLSGD_PC;
381 case R_X86_64_SIZE32:
382 case R_X86_64_SIZE64:
383 return R_SIZE;
384 case R_X86_64_PLT32:
385 return R_PLT_PC;
386 case R_X86_64_PC8:
387 case R_X86_64_PC16:
388 case R_X86_64_PC32:
389 case R_X86_64_PC64:
390 return R_PC;
391 case R_X86_64_GOT32:
392 case R_X86_64_GOT64:
393 return R_GOTPLT;
394 case R_X86_64_GOTPC32_TLSDESC:
395 case R_X86_64_CODE_4_GOTPC32_TLSDESC:
396 return R_TLSDESC_PC;
397 case R_X86_64_GOTPCREL:
398 case R_X86_64_GOTPCRELX:
399 case R_X86_64_REX_GOTPCRELX:
400 case R_X86_64_CODE_4_GOTPCRELX:
401 case R_X86_64_GOTTPOFF:
402 case R_X86_64_CODE_4_GOTTPOFF:
403 case R_X86_64_CODE_6_GOTTPOFF:
404 return R_GOT_PC;
405 case R_X86_64_GOTOFF64:
406 return R_GOTPLTREL;
407 case R_X86_64_PLTOFF64:
408 return R_PLT_GOTPLT;
409 case R_X86_64_GOTPC32:
410 case R_X86_64_GOTPC64:
411 return R_GOTPLTONLY_PC;
412 case R_X86_64_NONE:
413 return R_NONE;
414 default:
415 Err(ctx) << getErrorLoc(ctx, loc) << "unknown relocation (" << type.v
416 << ") against symbol " << &s;
417 return R_NONE;
418 }
419}
420
421void X86_64::writeGotPltHeader(uint8_t *buf) const {
  // The first entry holds the link-time address of _DYNAMIC. It is documented
  // in the psABI, and glibc before Aug 2021 used the entry to compute the
  // run-time load address of the shared object (note that this is relevant for
  // linking ld.so, not any other program).
426 write64le(P: buf, V: ctx.mainPart->dynamic->getVA());
427}
428
429void X86_64::writeGotPlt(uint8_t *buf, const Symbol &s) const {
430 // See comments in X86::writeGotPlt.
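  // The entry initially holds the address of the pushq in the corresponding
  // PLT entry (6 bytes past its start); with lazy binding, the first call
  // falls through to the push/jmp pair that enters the resolver.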
431 write64le(P: buf, V: s.getPltVA(ctx) + 6);
432}
433
434void X86_64::writeIgotPlt(uint8_t *buf, const Symbol &s) const {
435 // An x86 entry is the address of the ifunc resolver function (for -z rel).
436 if (ctx.arg.writeAddends)
437 write64le(P: buf, V: s.getVA(ctx));
438}
439
440void X86_64::writePltHeader(uint8_t *buf) const {
441 const uint8_t pltData[] = {
442 0xff, 0x35, 0, 0, 0, 0, // pushq GOTPLT+8(%rip)
443 0xff, 0x25, 0, 0, 0, 0, // jmp *GOTPLT+16(%rip)
444 0x0f, 0x1f, 0x40, 0x00, // nop
445 };
446 memcpy(dest: buf, src: pltData, n: sizeof(pltData));
447 uint64_t gotPlt = ctx.in.gotPlt->getVA();
448 uint64_t plt = ctx.in.ibtPlt ? ctx.in.ibtPlt->getVA() : ctx.in.plt->getVA();
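  // The displacements are relative to the end of each 6-byte instruction:
  // (GOTPLT + 8) - (PLT + 6) = gotPlt - plt + 2 and
  // (GOTPLT + 16) - (PLT + 12) = gotPlt - plt + 4.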
449 write32le(P: buf + 2, V: gotPlt - plt + 2); // GOTPLT+8
450 write32le(P: buf + 8, V: gotPlt - plt + 4); // GOTPLT+16
451}
452
453void X86_64::writePlt(uint8_t *buf, const Symbol &sym,
454 uint64_t pltEntryAddr) const {
455 const uint8_t inst[] = {
456 0xff, 0x25, 0, 0, 0, 0, // jmpq *got(%rip)
457 0x68, 0, 0, 0, 0, // pushq <relocation index>
458 0xe9, 0, 0, 0, 0, // jmpq plt[0]
459 };
460 memcpy(dest: buf, src: inst, n: sizeof(inst));
461
462 write32le(P: buf + 2, V: sym.getGotPltVA(ctx) - pltEntryAddr - 6);
463 write32le(P: buf + 7, V: sym.getPltIdx(ctx));
464 write32le(P: buf + 12, V: ctx.in.plt->getVA() - pltEntryAddr - 16);
465}
466
467RelType X86_64::getDynRel(RelType type) const {
468 if (type == R_X86_64_64 || type == R_X86_64_PC64 || type == R_X86_64_SIZE32 ||
469 type == R_X86_64_SIZE64)
470 return type;
471 return R_X86_64_NONE;
472}
473
474void X86_64::relaxTlsGdToLe(uint8_t *loc, const Relocation &rel,
475 uint64_t val) const {
476 if (rel.type == R_X86_64_TLSGD) {
477 // Convert
478 // .byte 0x66
479 // leaq x@tlsgd(%rip), %rdi
480 // .word 0x6666
481 // rex64
482 // call __tls_get_addr@plt
483 // to the following two instructions.
484 const uint8_t inst[] = {
485 0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00,
486 0x00, 0x00, // mov %fs:0x0,%rax
487 0x48, 0x8d, 0x80, 0, 0, 0, 0, // lea x@tpoff,%rax
488 };
489 memcpy(dest: loc - 4, src: inst, n: sizeof(inst));
490
    // The original code used a PC-relative relocation, so we have to
    // compensate for the -4 it had in the addend.
493 write32le(P: loc + 8, V: val + 4);
494 } else if (rel.type == R_X86_64_GOTPC32_TLSDESC ||
495 rel.type == R_X86_64_CODE_4_GOTPC32_TLSDESC) {
496 // Convert leaq x@tlsdesc(%rip), %REG to movq $x@tpoff, %REG.
497 if ((loc[-3] & 0xfb) != 0x48 || loc[-2] != 0x8d ||
498 (loc[-1] & 0xc7) != 0x05) {
499 Err(ctx) << getErrorLoc(ctx, loc: (rel.type == R_X86_64_GOTPC32_TLSDESC)
500 ? loc - 3
501 : loc - 4)
502 << "R_X86_64_GOTPC32_TLSDESC/R_X86_64_CODE_4_GOTPC32_TLSDESC "
503 "must be used in leaq x@tlsdesc(%rip), %REG";
504 return;
505 }
506 if (rel.type == R_X86_64_GOTPC32_TLSDESC) {
507 loc[-3] = 0x48 | ((loc[-3] >> 2) & 1);
508 } else {
509 loc[-3] = (loc[-3] & ~0x44) | ((loc[-3] & 0x44) >> 2);
510 }
511 loc[-2] = 0xc7;
512 loc[-1] = 0xc0 | ((loc[-1] >> 3) & 7);
513
514 write32le(P: loc, V: val + 4);
515 } else {
516 // Convert call *x@tlsdesc(%REG) to xchg ax, ax.
517 assert(rel.type == R_X86_64_TLSDESC_CALL);
518 loc[0] = 0x66;
519 loc[1] = 0x90;
520 }
521}
522
523void X86_64::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel,
524 uint64_t val) const {
525 if (rel.type == R_X86_64_TLSGD) {
526 // Convert
527 // .byte 0x66
528 // leaq x@tlsgd(%rip), %rdi
529 // .word 0x6666
530 // rex64
531 // call __tls_get_addr@plt
532 // to the following two instructions.
533 const uint8_t inst[] = {
534 0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00,
535 0x00, 0x00, // mov %fs:0x0,%rax
536 0x48, 0x03, 0x05, 0, 0, 0, 0, // addq x@gottpoff(%rip),%rax
537 };
538 memcpy(dest: loc - 4, src: inst, n: sizeof(inst));
539
    // Both code sequences are PC-relative, but since we are moving the
    // constant forward by 8 bytes we have to subtract 8 from the value.
542 write32le(P: loc + 8, V: val - 8);
543 } else if (rel.type == R_X86_64_GOTPC32_TLSDESC ||
544 rel.type == R_X86_64_CODE_4_GOTPC32_TLSDESC) {
545 // Convert leaq x@tlsdesc(%rip), %REG to movq x@gottpoff(%rip), %REG.
546 if ((loc[-3] & 0xfb) != 0x48 || loc[-2] != 0x8d ||
547 (loc[-1] & 0xc7) != 0x05) {
548 Err(ctx) << getErrorLoc(ctx, loc: (rel.type == R_X86_64_GOTPC32_TLSDESC)
549 ? loc - 3
550 : loc - 4)
551 << "R_X86_64_GOTPC32_TLSDESC/R_X86_64_CODE_4_GOTPC32_TLSDESC "
552 "must be used in leaq x@tlsdesc(%rip), %REG";
553 return;
554 }
555 loc[-2] = 0x8b;
556 write32le(P: loc, V: val);
557 } else {
558 // Convert call *x@tlsdesc(%rax) to xchg ax, ax.
559 assert(rel.type == R_X86_64_TLSDESC_CALL);
560 loc[0] = 0x66;
561 loc[1] = 0x90;
562 }
563}
564
// In some conditions, an
// R_X86_64_GOTTPOFF/R_X86_64_CODE_4_GOTTPOFF/R_X86_64_CODE_6_GOTTPOFF
// relocation can be optimized to R_X86_64_TPOFF32 so that it does not use the
// GOT.
568void X86_64::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel,
569 uint64_t val) const {
570 uint8_t *inst = loc - 3;
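  // loc points at the 32-bit displacement, so loc[-1] is the ModR/M byte of
  // the memory form (mod=00, rm=101); its reg field is the destination
  // register.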
571 uint8_t reg = loc[-1] >> 3;
572 uint8_t *regSlot = loc - 1;
573
574 if (rel.type == R_X86_64_GOTTPOFF) {
575 // Note that ADD with RSP or R12 is converted to ADD instead of LEA
576 // because LEA with these registers needs 4 bytes to encode and thus
577 // wouldn't fit the space.
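    // (Using RSP or R12 as a base register requires a SIB byte, making the LEA
    // form one byte longer than the original instruction.)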
578
579 if (memcmp(s1: inst, s2: "\x48\x03\x25", n: 3) == 0) {
580 // "addq foo@gottpoff(%rip),%rsp" -> "addq $foo,%rsp"
581 memcpy(dest: inst, src: "\x48\x81\xc4", n: 3);
582 } else if (memcmp(s1: inst, s2: "\x4c\x03\x25", n: 3) == 0) {
583 // "addq foo@gottpoff(%rip),%r12" -> "addq $foo,%r12"
584 memcpy(dest: inst, src: "\x49\x81\xc4", n: 3);
585 } else if (memcmp(s1: inst, s2: "\x4c\x03", n: 2) == 0) {
586 // "addq foo@gottpoff(%rip),%r[8-15]" -> "leaq foo(%r[8-15]),%r[8-15]"
587 memcpy(dest: inst, src: "\x4d\x8d", n: 2);
588 *regSlot = 0x80 | (reg << 3) | reg;
589 } else if (memcmp(s1: inst, s2: "\x48\x03", n: 2) == 0) {
590 // "addq foo@gottpoff(%rip),%reg -> "leaq foo(%reg),%reg"
591 memcpy(dest: inst, src: "\x48\x8d", n: 2);
592 *regSlot = 0x80 | (reg << 3) | reg;
593 } else if (memcmp(s1: inst, s2: "\x4c\x8b", n: 2) == 0) {
594 // "movq foo@gottpoff(%rip),%r[8-15]" -> "movq $foo,%r[8-15]"
595 memcpy(dest: inst, src: "\x49\xc7", n: 2);
596 *regSlot = 0xc0 | reg;
597 } else if (memcmp(s1: inst, s2: "\x48\x8b", n: 2) == 0) {
598 // "movq foo@gottpoff(%rip),%reg" -> "movq $foo,%reg"
599 memcpy(dest: inst, src: "\x48\xc7", n: 2);
600 *regSlot = 0xc0 | reg;
601 } else {
602 Err(ctx)
603 << getErrorLoc(ctx, loc: loc - 3)
604 << "R_X86_64_GOTTPOFF must be used in MOVQ or ADDQ instructions only";
605 }
606 } else if (rel.type == R_X86_64_CODE_4_GOTTPOFF) {
607 if (loc[-4] != 0xd5) {
608 Err(ctx) << getErrorLoc(ctx, loc: loc - 4)
609 << "invalid prefix with R_X86_64_CODE_4_GOTTPOFF!";
610 return;
611 }
612 const uint8_t rex = loc[-3];
613 loc[-3] = (rex & ~0x44) | (rex & 0x44) >> 2;
614 *regSlot = 0xc0 | reg;
615
616 if (loc[-2] == 0x8b) {
617 // "movq foo@gottpoff(%rip),%r[16-31]" -> "movq $foo,%r[16-31]"
618 loc[-2] = 0xc7;
619 } else if (loc[-2] == 0x03) {
620 // "addq foo@gottpoff(%rip),%r[16-31]" -> "addq $foo,%r[16-31]"
621 loc[-2] = 0x81;
622 } else {
623 Err(ctx) << getErrorLoc(ctx, loc: loc - 4)
624 << "R_X86_64_CODE_4_GOTTPOFF must be used in MOVQ or ADDQ "
625 "instructions only";
626 }
627 } else if (rel.type == R_X86_64_CODE_6_GOTTPOFF) {
628 if (loc[-6] != 0x62) {
629 Err(ctx) << getErrorLoc(ctx, loc: loc - 6)
630 << "invalid prefix with R_X86_64_CODE_6_GOTTPOFF!";
631 return;
632 }
633 // Check bits are satisfied:
634 // loc[-5]: X==1 (inverted polarity), (loc[-5] & 0x7) == 0x4
635 // loc[-4]: W==1, X2==1 (inverted polarity), pp==0b00(NP)
636 // loc[-3]: NF==1 or ND==1
637 // loc[-2]: opcode==0x1 or opcode==0x3
638 // loc[-1]: Mod==0b00, RM==0b101
639 if (((loc[-5] & 0x47) == 0x44) && ((loc[-4] & 0x87) == 0x84) &&
640 ((loc[-3] & 0x14) != 0) && (loc[-2] == 0x1 || loc[-2] == 0x3) &&
641 ((loc[-1] & 0xc7) == 0x5)) {
642 // "addq %reg1, foo@GOTTPOFF(%rip), %reg2" -> "addq $foo, %reg1, %reg2"
643 // "addq foo@GOTTPOFF(%rip), %reg1, %reg2" -> "addq $foo, %reg1, %reg2"
644 // "{nf} addq %reg1, foo@GOTTPOFF(%rip), %reg2"
645 // -> "{nf} addq $foo, %reg1, %reg2"
646 // "{nf} addq name@GOTTPOFF(%rip), %reg1, %reg2"
647 // -> "{nf} addq $foo, %reg1, %reg2"
648 // "{nf} addq name@GOTTPOFF(%rip), %reg" -> "{nf} addq $foo, %reg"
649 loc[-2] = 0x81;
650 // Move R bits to B bits in EVEX payloads and ModRM byte.
651 const uint8_t evexPayload0 = loc[-5];
652 if ((evexPayload0 & (1 << 7)) == 0)
653 loc[-5] = (evexPayload0 | (1 << 7)) & ~(1 << 5);
654 if ((evexPayload0 & (1 << 4)) == 0)
655 loc[-5] = evexPayload0 | (1 << 4) | (1 << 3);
656 *regSlot = 0xc0 | reg;
657 } else {
658 Err(ctx) << getErrorLoc(ctx, loc: loc - 6)
659 << "R_X86_64_CODE_6_GOTTPOFF must be used in ADDQ instructions "
660 "with NDD/NF/NDD+NF only";
661 }
662 } else {
663 llvm_unreachable("Unsupported relocation type!");
664 }
665
666 // The original code used a PC relative relocation.
667 // Need to compensate for the -4 it had in the addend.
668 write32le(P: loc, V: val + 4);
669}
670
671void X86_64::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel,
672 uint64_t val) const {
673 const uint8_t inst[] = {
674 0x66, 0x66, // .word 0x6666
675 0x66, // .byte 0x66
676 0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00, // mov %fs:0,%rax
677 };
678
679 if (loc[4] == 0xe8) {
680 // Convert
681 // leaq bar@tlsld(%rip), %rdi # 48 8d 3d <Loc>
682 // callq __tls_get_addr@PLT # e8 <disp32>
683 // leaq bar@dtpoff(%rax), %rcx
684 // to
685 // .word 0x6666
686 // .byte 0x66
687 // mov %fs:0,%rax
688 // leaq bar@tpoff(%rax), %rcx
689 memcpy(dest: loc - 3, src: inst, n: sizeof(inst));
690 return;
691 }
692
693 if (loc[4] == 0xff && loc[5] == 0x15) {
694 // Convert
695 // leaq x@tlsld(%rip),%rdi # 48 8d 3d <Loc>
696 // call *__tls_get_addr@GOTPCREL(%rip) # ff 15 <disp32>
697 // to
698 // .long 0x66666666
699 // movq %fs:0,%rax
700 // See "Table 11.9: LD -> LE Code Transition (LP64)" in
701 // https://raw.githubusercontent.com/wiki/hjl-tools/x86-psABI/x86-64-psABI-1.0.pdf
702 loc[-3] = 0x66;
703 memcpy(dest: loc - 2, src: inst, n: sizeof(inst));
704 return;
705 }
706
707 ErrAlways(ctx)
708 << getErrorLoc(ctx, loc: loc - 3)
709 << "expected R_X86_64_PLT32 or R_X86_64_GOTPCRELX after R_X86_64_TLSLD";
710}
711
712// A JumpInstrMod at a specific offset indicates that the jump instruction
713// opcode at that offset must be modified. This is specifically used to relax
714// jump instructions with basic block sections. This function looks at the
// JumpInstrMod and effects the change.
716void X86_64::applyJumpInstrMod(uint8_t *loc, JumpModType type,
717 unsigned size) const {
718 switch (type) {
719 case J_JMP_32:
720 if (size == 4)
721 *loc = 0xe9;
722 else
723 *loc = 0xeb;
724 break;
725 case J_JE_32:
726 if (size == 4) {
727 loc[-1] = 0x0f;
728 *loc = 0x84;
729 } else
730 *loc = 0x74;
731 break;
732 case J_JNE_32:
733 if (size == 4) {
734 loc[-1] = 0x0f;
735 *loc = 0x85;
736 } else
737 *loc = 0x75;
738 break;
739 case J_JG_32:
740 if (size == 4) {
741 loc[-1] = 0x0f;
742 *loc = 0x8f;
743 } else
744 *loc = 0x7f;
745 break;
746 case J_JGE_32:
747 if (size == 4) {
748 loc[-1] = 0x0f;
749 *loc = 0x8d;
750 } else
751 *loc = 0x7d;
752 break;
753 case J_JB_32:
754 if (size == 4) {
755 loc[-1] = 0x0f;
756 *loc = 0x82;
757 } else
758 *loc = 0x72;
759 break;
760 case J_JBE_32:
761 if (size == 4) {
762 loc[-1] = 0x0f;
763 *loc = 0x86;
764 } else
765 *loc = 0x76;
766 break;
767 case J_JL_32:
768 if (size == 4) {
769 loc[-1] = 0x0f;
770 *loc = 0x8c;
771 } else
772 *loc = 0x7c;
773 break;
774 case J_JLE_32:
775 if (size == 4) {
776 loc[-1] = 0x0f;
777 *loc = 0x8e;
778 } else
779 *loc = 0x7e;
780 break;
781 case J_JA_32:
782 if (size == 4) {
783 loc[-1] = 0x0f;
784 *loc = 0x87;
785 } else
786 *loc = 0x77;
787 break;
788 case J_JAE_32:
789 if (size == 4) {
790 loc[-1] = 0x0f;
791 *loc = 0x83;
792 } else
793 *loc = 0x73;
794 break;
795 case J_UNKNOWN:
796 llvm_unreachable("Unknown Jump Relocation");
797 }
798}
799
800int64_t X86_64::getImplicitAddend(const uint8_t *buf, RelType type) const {
801 switch (type) {
802 case R_X86_64_8:
803 case R_X86_64_PC8:
804 return SignExtend64<8>(x: *buf);
805 case R_X86_64_16:
806 case R_X86_64_PC16:
807 return SignExtend64<16>(x: read16le(P: buf));
808 case R_X86_64_32:
809 case R_X86_64_32S:
810 case R_X86_64_TPOFF32:
811 case R_X86_64_GOT32:
812 case R_X86_64_GOTPC32:
813 case R_X86_64_GOTPC32_TLSDESC:
814 case R_X86_64_GOTPCREL:
815 case R_X86_64_GOTPCRELX:
816 case R_X86_64_REX_GOTPCRELX:
817 case R_X86_64_CODE_4_GOTPCRELX:
818 case R_X86_64_PC32:
819 case R_X86_64_GOTTPOFF:
820 case R_X86_64_CODE_4_GOTTPOFF:
821 case R_X86_64_CODE_6_GOTTPOFF:
822 case R_X86_64_PLT32:
823 case R_X86_64_TLSGD:
824 case R_X86_64_TLSLD:
825 case R_X86_64_DTPOFF32:
826 case R_X86_64_SIZE32:
827 return SignExtend64<32>(x: read32le(P: buf));
828 case R_X86_64_64:
829 case R_X86_64_TPOFF64:
830 case R_X86_64_DTPOFF64:
831 case R_X86_64_DTPMOD64:
832 case R_X86_64_PC64:
833 case R_X86_64_SIZE64:
834 case R_X86_64_GLOB_DAT:
835 case R_X86_64_GOT64:
836 case R_X86_64_GOTOFF64:
837 case R_X86_64_GOTPC64:
838 case R_X86_64_PLTOFF64:
839 case R_X86_64_IRELATIVE:
840 case R_X86_64_RELATIVE:
841 return read64le(P: buf);
842 case R_X86_64_TLSDESC:
843 return read64le(P: buf + 8);
844 case R_X86_64_JUMP_SLOT:
845 case R_X86_64_NONE:
846 // These relocations are defined as not having an implicit addend.
847 return 0;
848 default:
849 InternalErr(ctx, buf) << "cannot read addend for relocation " << type;
850 return 0;
851 }
852}
853
854static void relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val);
855
856void X86_64::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
857 switch (rel.type) {
858 case R_X86_64_8:
859 checkIntUInt(ctx, loc, v: val, n: 8, rel);
860 *loc = val;
861 break;
862 case R_X86_64_PC8:
863 checkInt(ctx, loc, v: val, n: 8, rel);
864 *loc = val;
865 break;
866 case R_X86_64_16:
867 checkIntUInt(ctx, loc, v: val, n: 16, rel);
868 write16le(P: loc, V: val);
869 break;
870 case R_X86_64_PC16:
871 checkInt(ctx, loc, v: val, n: 16, rel);
872 write16le(P: loc, V: val);
873 break;
874 case R_X86_64_32:
875 checkUInt(ctx, loc, v: val, n: 32, rel);
876 write32le(P: loc, V: val);
877 break;
878 case R_X86_64_32S:
879 case R_X86_64_GOT32:
880 case R_X86_64_GOTPC32:
881 case R_X86_64_GOTPCREL:
882 case R_X86_64_PC32:
883 case R_X86_64_PLT32:
884 case R_X86_64_DTPOFF32:
885 case R_X86_64_SIZE32:
886 checkInt(ctx, loc, v: val, n: 32, rel);
887 write32le(P: loc, V: val);
888 break;
889 case R_X86_64_64:
890 case R_X86_64_TPOFF64:
891 case R_X86_64_DTPOFF64:
892 case R_X86_64_PC64:
893 case R_X86_64_SIZE64:
894 case R_X86_64_GOT64:
895 case R_X86_64_GOTOFF64:
896 case R_X86_64_GOTPC64:
897 case R_X86_64_PLTOFF64:
898 write64le(P: loc, V: val);
899 break;
900 case R_X86_64_GOTPCRELX:
901 case R_X86_64_REX_GOTPCRELX:
902 case R_X86_64_CODE_4_GOTPCRELX:
903 if (rel.expr != R_GOT_PC) {
904 relaxGot(loc, rel, val);
905 } else {
906 checkInt(ctx, loc, v: val, n: 32, rel);
907 write32le(P: loc, V: val);
908 }
909 break;
910 case R_X86_64_GOTPC32_TLSDESC:
911 case R_X86_64_CODE_4_GOTPC32_TLSDESC:
912 case R_X86_64_TLSDESC_CALL:
913 case R_X86_64_TLSGD:
914 if (rel.expr == R_RELAX_TLS_GD_TO_LE) {
915 relaxTlsGdToLe(loc, rel, val);
916 } else if (rel.expr == R_RELAX_TLS_GD_TO_IE) {
917 relaxTlsGdToIe(loc, rel, val);
918 } else {
919 checkInt(ctx, loc, v: val, n: 32, rel);
920 write32le(P: loc, V: val);
921 }
922 break;
923 case R_X86_64_TLSLD:
924 if (rel.expr == R_RELAX_TLS_LD_TO_LE) {
925 relaxTlsLdToLe(loc, rel, val);
926 } else {
927 checkInt(ctx, loc, v: val, n: 32, rel);
928 write32le(P: loc, V: val);
929 }
930 break;
931 case R_X86_64_GOTTPOFF:
932 case R_X86_64_CODE_4_GOTTPOFF:
933 case R_X86_64_CODE_6_GOTTPOFF:
934 if (rel.expr == R_RELAX_TLS_IE_TO_LE) {
935 relaxTlsIeToLe(loc, rel, val);
936 } else {
937 checkInt(ctx, loc, v: val, n: 32, rel);
938 write32le(P: loc, V: val);
939 }
940 break;
941 case R_X86_64_TPOFF32:
942 checkInt(ctx, loc, v: val, n: 32, rel);
943 write32le(P: loc, V: val);
944 break;
945
946 case R_X86_64_TLSDESC:
947 // The addend is stored in the second 64-bit word.
948 write64le(P: loc + 8, V: val);
949 break;
950 default:
951 llvm_unreachable("unknown relocation");
952 }
953}
954
955RelExpr X86_64::adjustGotPcExpr(RelType type, int64_t addend,
956 const uint8_t *loc) const {
957 // Only R_X86_64_[REX_]|[CODE_4_]GOTPCRELX can be relaxed. GNU as may emit
958 // GOTPCRELX with addend != -4. Such an instruction does not load the full GOT
959 // entry, so we cannot relax the relocation. E.g. movl x@GOTPCREL+4(%rip),
960 // %rax (addend=0) loads the high 32 bits of the GOT entry.
961 if (!ctx.arg.relax || addend != -4 ||
962 (type != R_X86_64_GOTPCRELX && type != R_X86_64_REX_GOTPCRELX &&
963 type != R_X86_64_CODE_4_GOTPCRELX))
964 return R_GOT_PC;
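  // loc points at the 4-byte displacement of the candidate instruction, so
  // loc[-2] is its opcode byte and loc[-1] its ModR/M byte.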
965 const uint8_t op = loc[-2];
966 const uint8_t modRm = loc[-1];
967
  // FIXME: When PIC is disabled and foo is defined locally in the
  // lower 32 bit address space, the memory operand in mov can be converted
  // into an immediate operand. Otherwise, mov must be changed to lea. We
  // support only the latter relaxation at this moment.
972 if (op == 0x8b)
973 return R_RELAX_GOT_PC;
974
975 // Relax call and jmp.
976 if (op == 0xff && (modRm == 0x15 || modRm == 0x25))
977 return R_RELAX_GOT_PC;
978
979 // We don't support test/binop instructions without a REX/REX2 prefix.
980 if (type == R_X86_64_GOTPCRELX)
981 return R_GOT_PC;
982
983 // Relaxation of test, adc, add, and, cmp, or, sbb, sub, xor.
984 // If PIC then no relaxation is available.
985 return ctx.arg.isPic ? R_GOT_PC : R_RELAX_GOT_PC_NOPIC;
986}
987
988// A subset of relaxations can only be applied for no-PIC. This method
// handles such relaxations. Instruction encoding information was taken from:
990// "Intel 64 and IA-32 Architectures Software Developer's Manual V2"
991// (http://www.intel.com/content/dam/www/public/us/en/documents/manuals/
992// 64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf)
993static void relaxGotNoPic(uint8_t *loc, uint64_t val, uint8_t op, uint8_t modRm,
994 bool isRex2) {
995 const uint8_t rex = loc[-3];
996 // Convert "test %reg, foo@GOTPCREL(%rip)" to "test $foo, %reg".
997 if (op == 0x85) {
998 // See "TEST-Logical Compare" (4-428 Vol. 2B),
999 // TEST r/m64, r64 uses "full" ModR / M byte (no opcode extension).
1000
1001 // ModR/M byte has form XX YYY ZZZ, where
1002 // YYY is MODRM.reg(register 2), ZZZ is MODRM.rm(register 1).
1003 // XX has different meanings:
1004 // 00: The operand's memory address is in reg1.
1005 // 01: The operand's memory address is reg1 + a byte-sized displacement.
    // 10: The operand's memory address is reg1 + a four-byte displacement.
1007 // 11: The operand is reg1 itself.
1008 // If an instruction requires only one operand, the unused reg2 field
1009 // holds extra opcode bits rather than a register code
1010 // 0xC0 == 11 000 000 binary.
1011 // 0x38 == 00 111 000 binary.
1012 // We transfer reg2 to reg1 here as operand.
1013 // See "2.1.3 ModR/M and SIB Bytes" (Vol. 2A 2-3).
1014 loc[-1] = 0xc0 | (modRm & 0x38) >> 3; // ModR/M byte.
1015
1016 // Change opcode from TEST r/m64, r64 to TEST r/m64, imm32
1017 // See "TEST-Logical Compare" (4-428 Vol. 2B).
1018 loc[-2] = 0xf7;
1019
1020 // Move R bit to the B bit in REX/REX2 byte.
1021 // REX byte is encoded as 0100WRXB, where
1022 // 0100 is 4bit fixed pattern.
1023 // REX.W When 1, a 64-bit operand size is used. Otherwise, when 0, the
1024 // default operand size is used (which is 32-bit for most but not all
1025 // instructions).
1026 // REX.R This 1-bit value is an extension to the MODRM.reg field.
1027 // REX.X This 1-bit value is an extension to the SIB.index field.
1028 // REX.B This 1-bit value is an extension to the MODRM.rm field or the
1029 // SIB.base field.
1030 // See "2.2.1.2 More on REX Prefix Fields " (2-8 Vol. 2A).
1031 //
1032 // REX2 prefix is encoded as 0xd5|M|R2|X2|B2|WRXB, where
1033 // 0xd5 is 1byte fixed pattern.
1034 // REX2's [W,R,X,B] have the same meanings as REX's.
1035 // REX2.M encodes the map id.
    // R2/X2/B2 provide the fifth and most significant bits of the R/X/B
1037 // register identifiers, each of which can now address all 32 GPRs.
1038 if (isRex2)
1039 loc[-3] = (rex & ~0x44) | (rex & 0x44) >> 2;
1040 else
1041 loc[-3] = (rex & ~0x4) | (rex & 0x4) >> 2;
1042 write32le(P: loc, V: val);
1043 return;
1044 }
1045
1046 // If we are here then we need to relax the adc, add, and, cmp, or, sbb, sub
1047 // or xor operations.
1048
1049 // Convert "binop foo@GOTPCREL(%rip), %reg" to "binop $foo, %reg".
  // The logic is close to that for the test instruction above, but we also
  // write an opcode extension here; see below for details.
1052 loc[-1] = 0xc0 | (modRm & 0x38) >> 3 | (op & 0x3c); // ModR/M byte.
1053
1054 // Primary opcode is 0x81, opcode extension is one of:
1055 // 000b = ADD, 001b is OR, 010b is ADC, 011b is SBB,
1056 // 100b is AND, 101b is SUB, 110b is XOR, 111b is CMP.
  // This value was written to MODRM.reg in a line above.
1058 // See "3.2 INSTRUCTIONS (A-M)" (Vol. 2A 3-15),
1059 // "INSTRUCTION SET REFERENCE, N-Z" (Vol. 2B 4-1) for
1060 // descriptions about each operation.
1061 loc[-2] = 0x81;
1062 if (isRex2)
1063 loc[-3] = (rex & ~0x44) | (rex & 0x44) >> 2;
1064 else
1065 loc[-3] = (rex & ~0x4) | (rex & 0x4) >> 2;
1066 write32le(P: loc, V: val);
1067}
1068
1069static void relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val) {
1070 assert(isInt<32>(val) &&
1071 "GOTPCRELX should not have been relaxed if it overflows");
1072 const uint8_t op = loc[-2];
1073 const uint8_t modRm = loc[-1];
1074
1075 // Convert "mov foo@GOTPCREL(%rip),%reg" to "lea foo(%rip),%reg".
1076 if (op == 0x8b) {
1077 loc[-2] = 0x8d;
1078 write32le(P: loc, V: val);
1079 return;
1080 }
1081
1082 if (op != 0xff) {
1083 // We are relaxing a rip relative to an absolute, so compensate
1084 // for the old -4 addend.
1085 assert(!rel.sym->file->ctx.arg.isPic);
1086 relaxGotNoPic(loc, val: val + 4, op, modRm,
1087 isRex2: rel.type == R_X86_64_CODE_4_GOTPCRELX);
1088 return;
1089 }
1090
1091 // Convert call/jmp instructions.
1092 if (modRm == 0x15) {
1093 // ABI says we can convert "call *foo@GOTPCREL(%rip)" to "nop; call foo".
1094 // Instead we convert to "addr32 call foo" where addr32 is an instruction
    // prefix. That makes the result a single instruction.
1096 loc[-2] = 0x67; // addr32 prefix
1097 loc[-1] = 0xe8; // call
1098 write32le(P: loc, V: val);
1099 return;
1100 }
1101
1102 // Convert "jmp *foo@GOTPCREL(%rip)" to "jmp foo; nop".
  // jmp doesn't return, so it is fine to use a nop here; it is just a stub.
1104 assert(modRm == 0x25);
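  // The 6-byte "jmp *foo(%rip)" becomes a 5-byte "jmp foo" starting one byte
  // earlier, followed by a nop; the displacement moves to loc - 1 and its
  // reference point shifts by one byte, hence val + 1.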
1105 loc[-2] = 0xe9; // jmp
1106 loc[3] = 0x90; // nop
1107 write32le(P: loc - 1, V: val + 1);
1108}
1109
// A split-stack prologue starts by checking the amount of stack remaining
// in one of two ways:
// A) Comparing the stack pointer to a field in the TCB.
// B) Loading a stack pointer offset with an lea into r10 or r11.
1114bool X86_64::adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
1115 uint8_t stOther) const {
1116 if (!ctx.arg.is64) {
1117 ErrAlways(ctx) << "target doesn't support split stacks";
1118 return false;
1119 }
1120
1121 if (loc + 8 >= end)
1122 return false;
1123
1124 // Replace "cmp %fs:0x70,%rsp" and subsequent branch
1125 // with "stc, nopl 0x0(%rax,%rax,1)"
1126 if (memcmp(s1: loc, s2: "\x64\x48\x3b\x24\x25", n: 5) == 0) {
1127 memcpy(dest: loc, src: "\xf9\x0f\x1f\x84\x00\x00\x00\x00", n: 8);
1128 return true;
1129 }
1130
1131 // Adjust "lea X(%rsp),%rYY" to lea "(X - 0x4000)(%rsp),%rYY" where rYY could
1132 // be r10 or r11. The lea instruction feeds a subsequent compare which checks
1133 // if there is X available stack space. Making X larger effectively reserves
1134 // that much additional space. The stack grows downward so subtract the value.
1135 if (memcmp(s1: loc, s2: "\x4c\x8d\x94\x24", n: 4) == 0 ||
1136 memcmp(s1: loc, s2: "\x4c\x8d\x9c\x24", n: 4) == 0) {
1137 // The offset bytes are encoded four bytes after the start of the
1138 // instruction.
1139 write32le(P: loc + 4, V: read32le(P: loc + 4) - 0x4000);
1140 return true;
1141 }
1142 return false;
1143}
1144
1145void X86_64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
1146 uint64_t secAddr = sec.getOutputSection()->addr;
1147 if (auto *s = dyn_cast<InputSection>(Val: &sec))
1148 secAddr += s->outSecOff;
1149 else if (auto *ehIn = dyn_cast<EhInputSection>(Val: &sec))
1150 secAddr += ehIn->getParent()->outSecOff;
1151 for (const Relocation &rel : sec.relocs()) {
1152 if (rel.expr == R_NONE) // See deleteFallThruJmpInsn
1153 continue;
1154 uint8_t *loc = buf + rel.offset;
1155 const uint64_t val = sec.getRelocTargetVA(ctx, r: rel, p: secAddr + rel.offset);
1156 relocate(loc, rel, val);
1157 }
1158 if (sec.jumpInstrMod) {
1159 applyJumpInstrMod(loc: buf + sec.jumpInstrMod->offset,
1160 type: sec.jumpInstrMod->original, size: sec.jumpInstrMod->size);
1161 }
1162}
1163
1164// If Intel Indirect Branch Tracking is enabled, we have to emit special PLT
1165// entries containing endbr64 instructions. A PLT entry will be split into two
1166// parts, one in .plt.sec (writePlt), and the other in .plt (writeIBTPlt).
1167namespace {
1168class IntelIBT : public X86_64 {
1169public:
1170 IntelIBT(Ctx &ctx) : X86_64(ctx) { pltHeaderSize = 0; };
1171 void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
1172 void writePlt(uint8_t *buf, const Symbol &sym,
1173 uint64_t pltEntryAddr) const override;
1174 void writeIBTPlt(uint8_t *buf, size_t numEntries) const override;
1175
1176 static const unsigned IBTPltHeaderSize = 16;
1177};
1178} // namespace
1179
1180void IntelIBT::writeGotPlt(uint8_t *buf, const Symbol &s) const {
1181 uint64_t va = ctx.in.ibtPlt->getVA() + IBTPltHeaderSize +
1182 s.getPltIdx(ctx) * pltEntrySize;
1183 write64le(P: buf, V: va);
1184}
1185
1186void IntelIBT::writePlt(uint8_t *buf, const Symbol &sym,
1187 uint64_t pltEntryAddr) const {
1188 const uint8_t Inst[] = {
1189 0xf3, 0x0f, 0x1e, 0xfa, // endbr64
1190 0xff, 0x25, 0, 0, 0, 0, // jmpq *got(%rip)
1191 0x66, 0x0f, 0x1f, 0x44, 0, 0, // nop
1192 };
1193 memcpy(dest: buf, src: Inst, n: sizeof(Inst));
1194 write32le(P: buf + 6, V: sym.getGotPltVA(ctx) - pltEntryAddr - 10);
1195}
1196
1197void IntelIBT::writeIBTPlt(uint8_t *buf, size_t numEntries) const {
1198 writePltHeader(buf);
1199 buf += IBTPltHeaderSize;
1200
1201 const uint8_t inst[] = {
1202 0xf3, 0x0f, 0x1e, 0xfa, // endbr64
1203 0x68, 0, 0, 0, 0, // pushq <relocation index>
1204 0xe9, 0, 0, 0, 0, // jmpq plt[0]
1205 0x66, 0x90, // nop
1206 };
1207
1208 for (size_t i = 0; i < numEntries; ++i) {
1209 memcpy(dest: buf, src: inst, n: sizeof(inst));
1210 write32le(P: buf + 5, V: i);
1211 write32le(P: buf + 10, V: -pltHeaderSize - sizeof(inst) * i - 30);
1212 buf += sizeof(inst);
1213 }
1214}
1215
// These nonstandard PLT entries are to mitigate the Spectre v2 security
// vulnerability. In order to mitigate Spectre v2, we want to avoid indirect
1218// branch instructions such as `jmp *GOTPLT(%rip)`. So, in the following PLT
1219// entries, we use a CALL followed by MOV and RET to do the same thing as an
1220// indirect jump. That instruction sequence is so-called "retpoline".
1221//
1222// We have two types of retpoline PLTs as a size optimization. If `-z now`
1223// is specified, all dynamic symbols are resolved at load-time. Thus, when
1224// that option is given, we can omit code for symbol lazy resolution.
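// The retpoline thunk never executes an indirect branch: the target is loaded
// into %r11, a direct call pushes a return address, and the landing pad
// overwrites that return address with %r11 before returning, so the indirect
// branch predictor is not consulted. The pause/lfence loop captures any
// speculative execution of the original return path.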
1225namespace {
1226class Retpoline : public X86_64 {
1227public:
1228 Retpoline(Ctx &);
1229 void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
1230 void writePltHeader(uint8_t *buf) const override;
1231 void writePlt(uint8_t *buf, const Symbol &sym,
1232 uint64_t pltEntryAddr) const override;
1233};
1234
1235class RetpolineZNow : public X86_64 {
1236public:
1237 RetpolineZNow(Ctx &);
1238 void writeGotPlt(uint8_t *buf, const Symbol &s) const override {}
1239 void writePltHeader(uint8_t *buf) const override;
1240 void writePlt(uint8_t *buf, const Symbol &sym,
1241 uint64_t pltEntryAddr) const override;
1242};
1243} // namespace
1244
1245Retpoline::Retpoline(Ctx &ctx) : X86_64(ctx) {
1246 pltHeaderSize = 48;
1247 pltEntrySize = 32;
1248 ipltEntrySize = 32;
1249}
1250
1251void Retpoline::writeGotPlt(uint8_t *buf, const Symbol &s) const {
1252 write64le(P: buf, V: s.getPltVA(ctx) + 17);
1253}
1254
1255void Retpoline::writePltHeader(uint8_t *buf) const {
1256 const uint8_t insn[] = {
1257 0xff, 0x35, 0, 0, 0, 0, // 0: pushq GOTPLT+8(%rip)
1258 0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // 6: mov GOTPLT+16(%rip), %r11
1259 0xe8, 0x0e, 0x00, 0x00, 0x00, // d: callq next
1260 0xf3, 0x90, // 12: loop: pause
1261 0x0f, 0xae, 0xe8, // 14: lfence
1262 0xeb, 0xf9, // 17: jmp loop
1263 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 19: int3; .align 16
1264 0x4c, 0x89, 0x1c, 0x24, // 20: next: mov %r11, (%rsp)
1265 0xc3, // 24: ret
1266 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 25: int3; padding
1267 0xcc, 0xcc, 0xcc, 0xcc, // 2c: int3; padding
1268 };
1269 memcpy(dest: buf, src: insn, n: sizeof(insn));
1270
1271 uint64_t gotPlt = ctx.in.gotPlt->getVA();
1272 uint64_t plt = ctx.in.plt->getVA();
1273 write32le(P: buf + 2, V: gotPlt - plt - 6 + 8);
1274 write32le(P: buf + 9, V: gotPlt - plt - 13 + 16);
1275}
1276
1277void Retpoline::writePlt(uint8_t *buf, const Symbol &sym,
1278 uint64_t pltEntryAddr) const {
1279 const uint8_t insn[] = {
1280 0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // 0: mov foo@GOTPLT(%rip), %r11
1281 0xe8, 0, 0, 0, 0, // 7: callq plt+0x20
1282 0xe9, 0, 0, 0, 0, // c: jmp plt+0x12
1283 0x68, 0, 0, 0, 0, // 11: pushq <relocation index>
1284 0xe9, 0, 0, 0, 0, // 16: jmp plt+0
1285 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 1b: int3; padding
1286 };
1287 memcpy(dest: buf, src: insn, n: sizeof(insn));
1288
1289 uint64_t off = pltEntryAddr - ctx.in.plt->getVA();
1290
1291 write32le(P: buf + 3, V: sym.getGotPltVA(ctx) - pltEntryAddr - 7);
1292 write32le(P: buf + 8, V: -off - 12 + 32);
1293 write32le(P: buf + 13, V: -off - 17 + 18);
1294 write32le(P: buf + 18, V: sym.getPltIdx(ctx));
1295 write32le(P: buf + 23, V: -off - 27);
1296}
1297
1298RetpolineZNow::RetpolineZNow(Ctx &ctx) : X86_64(ctx) {
1299 pltHeaderSize = 32;
1300 pltEntrySize = 16;
1301 ipltEntrySize = 16;
1302}
1303
1304void RetpolineZNow::writePltHeader(uint8_t *buf) const {
1305 const uint8_t insn[] = {
1306 0xe8, 0x0b, 0x00, 0x00, 0x00, // 0: call next
1307 0xf3, 0x90, // 5: loop: pause
1308 0x0f, 0xae, 0xe8, // 7: lfence
1309 0xeb, 0xf9, // a: jmp loop
1310 0xcc, 0xcc, 0xcc, 0xcc, // c: int3; .align 16
1311 0x4c, 0x89, 0x1c, 0x24, // 10: next: mov %r11, (%rsp)
1312 0xc3, // 14: ret
1313 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 15: int3; padding
1314 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 1a: int3; padding
1315 0xcc, // 1f: int3; padding
1316 };
1317 memcpy(dest: buf, src: insn, n: sizeof(insn));
1318}
1319
1320void RetpolineZNow::writePlt(uint8_t *buf, const Symbol &sym,
1321 uint64_t pltEntryAddr) const {
1322 const uint8_t insn[] = {
1323 0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // mov foo@GOTPLT(%rip), %r11
1324 0xe9, 0, 0, 0, 0, // jmp plt+0
1325 0xcc, 0xcc, 0xcc, 0xcc, // int3; padding
1326 };
1327 memcpy(dest: buf, src: insn, n: sizeof(insn));
1328
1329 write32le(P: buf + 3, V: sym.getGotPltVA(ctx) - pltEntryAddr - 7);
1330 write32le(P: buf + 8, V: ctx.in.plt->getVA() - pltEntryAddr - 12);
1331}
1332
1333void elf::setX86_64TargetInfo(Ctx &ctx) {
1334 if (ctx.arg.zRetpolineplt) {
1335 if (ctx.arg.zNow)
1336 ctx.target.reset(p: new RetpolineZNow(ctx));
1337 else
1338 ctx.target.reset(p: new Retpoline(ctx));
1339 return;
1340 }
1341
1342 if (ctx.arg.andFeatures & GNU_PROPERTY_X86_FEATURE_1_IBT)
1343 ctx.target.reset(p: new IntelIBT(ctx));
1344 else
1345 ctx.target.reset(p: new X86_64(ctx));
1346}
1347
