1 | //===- ARM64.cpp ----------------------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "Arch/ARM64Common.h" |
10 | #include "InputFiles.h" |
11 | #include "Symbols.h" |
12 | #include "SyntheticSections.h" |
13 | #include "Target.h" |
14 | |
15 | #include "lld/Common/ErrorHandler.h" |
16 | #include "mach-o/compact_unwind_encoding.h" |
17 | #include "llvm/ADT/SmallVector.h" |
18 | #include "llvm/ADT/StringRef.h" |
19 | #include "llvm/BinaryFormat/MachO.h" |
20 | #include "llvm/Support/Endian.h" |
21 | #include "llvm/Support/LEB128.h" |
22 | #include "llvm/Support/MathExtras.h" |
23 | |
24 | using namespace llvm; |
25 | using namespace llvm::MachO; |
26 | using namespace llvm::support::endian; |
27 | using namespace lld; |
28 | using namespace lld::macho; |
29 | |
30 | namespace { |
31 | |
// ARM64 (AArch64) target for 64-bit Mach-O. Generic relocation handling lives
// in ARM64Common; this subclass supplies the concrete instruction sequences
// for lazy-binding stubs, branch-range-extension thunks, and Objective-C
// message-send stubs, and implements linker optimization hints (LOHs).
struct ARM64 : ARM64Common {
  ARM64();
  // Writes the __stubs entry that jumps via the symbol's lazy pointer.
  void writeStub(uint8_t *buf, const Symbol &, uint64_t) const override;
  // Writes the shared stub-helper prologue that tail-calls dyld_stub_binder.
  void writeStubHelperHeader(uint8_t *buf) const override;
  // Writes one per-symbol stub-helper entry that branches to the header.
  void writeStubHelperEntry(uint8_t *buf, const Symbol &,
                            uint64_t entryAddr) const override;

  // Writes an _objc_msgSend stub (fast or small variant, per config).
  void writeObjCMsgSendStub(uint8_t *buf, Symbol *sym, uint64_t stubsAddr,
                            uint64_t &stubOffset, uint64_t selrefVA,
                            Symbol *objcMsgSend) const override;
  // Fills a range-extension thunk with code + relocations targeting funcSym.
  void populateThunk(InputSection *thunk, Symbol *funcSym) override;
  // Applies LC_LINKER_OPTIMIZATION_HINTS transformations to the output buffer.
  void applyOptimizationHints(uint8_t *, const ObjFile &) const override;
};
45 | |
46 | } // namespace |
47 | |
48 | // Random notes on reloc types: |
49 | // ADDEND always pairs with BRANCH26, PAGE21, or PAGEOFF12 |
50 | // POINTER_TO_GOT: ld64 supports a 4-byte pc-relative form as well as an 8-byte |
51 | // absolute version of this relocation. The semantics of the absolute relocation |
52 | // are weird -- it results in the value of the GOT slot being written, instead |
53 | // of the address. Let's not support it unless we find a real-world use case. |
54 | static constexpr std::array<RelocAttrs, 11> relocAttrsArray{._M_elems: { |
55 | #define B(x) RelocAttrBits::x |
56 | {.name: "UNSIGNED" , |
57 | B(UNSIGNED) | B(ABSOLUTE) | B(EXTERN) | B(LOCAL) | B(BYTE4) | B(BYTE8)}, |
58 | {.name: "SUBTRACTOR" , B(SUBTRAHEND) | B(EXTERN) | B(BYTE4) | B(BYTE8)}, |
59 | {.name: "BRANCH26" , B(PCREL) | B(EXTERN) | B(BRANCH) | B(BYTE4)}, |
60 | {.name: "PAGE21" , B(PCREL) | B(EXTERN) | B(BYTE4)}, |
61 | {.name: "PAGEOFF12" , B(ABSOLUTE) | B(EXTERN) | B(BYTE4)}, |
62 | {.name: "GOT_LOAD_PAGE21" , B(PCREL) | B(EXTERN) | B(GOT) | B(BYTE4)}, |
63 | {.name: "GOT_LOAD_PAGEOFF12" , |
64 | B(ABSOLUTE) | B(EXTERN) | B(GOT) | B(LOAD) | B(BYTE4)}, |
65 | {.name: "POINTER_TO_GOT" , B(PCREL) | B(EXTERN) | B(GOT) | B(POINTER) | B(BYTE4)}, |
66 | {.name: "TLVP_LOAD_PAGE21" , B(PCREL) | B(EXTERN) | B(TLV) | B(BYTE4)}, |
67 | {.name: "TLVP_LOAD_PAGEOFF12" , |
68 | B(ABSOLUTE) | B(EXTERN) | B(TLV) | B(LOAD) | B(BYTE4)}, |
69 | {.name: "ADDEND" , B(ADDEND)}, |
70 | #undef B |
71 | }}; |
72 | |
// Code for a __stubs entry: load the symbol's lazy pointer and jump through it.
static constexpr uint32_t stubCode[] = {
    0x90000010, // 00: adrp  x16, __la_symbol_ptr@page
    0xf9400210, // 04: ldr   x16, [x16, __la_symbol_ptr@pageoff]
    0xd61f0200, // 08: br    x16
};
78 | |
// Emits stubCode at buf8, fixing up the adrp/ldr to address the symbol's
// lazy pointer at pointerVA (delegates to the shared ARM64Common helper).
void ARM64::writeStub(uint8_t *buf8, const Symbol &sym,
                      uint64_t pointerVA) const {
  ::writeStub(buf8, stubCode, sym, pointerVA);
}
83 | |
// Code for the __stub_helper prologue: push the binding context and tail-call
// dyld_stub_binder. Note: the identifier was dropped in the garbled source;
// restored from its uses in writeStubHelperHeader() and the constructor.
static constexpr uint32_t stubHelperHeaderCode[] = {
    0x90000011, // 00: adrp  x17, _dyld_private@page
    0x91000231, // 04: add   x17, x17, _dyld_private@pageoff
    0xa9bf47f0, // 08: stp   x16/x17, [sp, #-16]!
    0x90000010, // 0c: adrp  x16, dyld_stub_binder@page
    0xf9400210, // 10: ldr   x16, [x16, dyld_stub_binder@pageoff]
    0xd61f0200, // 14: br    x16
};
92 | |
93 | void ARM64::(uint8_t *buf8) const { |
94 | ::writeStubHelperHeader<LP64>(buf8, stubHelperHeaderCode); |
95 | } |
96 | |
// Code for one per-symbol stub-helper entry: load the symbol's lazy-binding
// info offset (patched into the trailing .long) and branch to the header.
static constexpr uint32_t stubHelperEntryCode[] = {
    0x18000050, // 00: ldr  w16, l0
    0x14000000, // 04: b    stubHelperHeader
    0x00000000, // 08: l0: .long 0
};
102 | |
// Emits stubHelperEntryCode for sym at buf8; entryVA is the entry's own
// address, needed to fix up the PC-relative branch to the header.
void ARM64::writeStubHelperEntry(uint8_t *buf8, const Symbol &sym,
                                 uint64_t entryVA) const {
  ::writeStubHelperEntry(buf8, stubHelperEntryCode, sym, entryVA);
}
107 | |
// "Fast" _objc_msgSend stub: load the selector into x1, then jump to
// _objc_msgSend through its GOT entry. Padded with brk instructions.
static constexpr uint32_t objcStubsFastCode[] = {
    0x90000001, // adrp  x1, __objc_selrefs@page
    0xf9400021, // ldr   x1, [x1, @selector("foo")@pageoff]
    0x90000010, // adrp  x16, _got@page
    0xf9400210, // ldr   x16, [x16, _objc_msgSend@pageoff]
    0xd61f0200, // br    x16
    0xd4200020, // brk   #0x1
    0xd4200020, // brk   #0x1
    0xd4200020, // brk   #0x1
};
118 | |
// "Small" _objc_msgSend stub: load the selector into x1 and branch directly
// to _objc_msgSend (or its stub).
static constexpr uint32_t objcStubsSmallCode[] = {
    0x90000001, // adrp  x1, __objc_selrefs@page
    0xf9400021, // ldr   x1, [x1, @selector("foo")@pageoff]
    0x14000000, // b     _objc_msgSend
};
124 | |
125 | void ARM64::writeObjCMsgSendStub(uint8_t *buf, Symbol *sym, uint64_t stubsAddr, |
126 | uint64_t &stubOffset, uint64_t selrefVA, |
127 | Symbol *objcMsgSend) const { |
128 | uint64_t objcMsgSendAddr; |
129 | uint64_t objcStubSize; |
130 | uint64_t objcMsgSendIndex; |
131 | |
132 | if (config->objcStubsMode == ObjCStubsMode::fast) { |
133 | objcStubSize = target->objcStubsFastSize; |
134 | objcMsgSendAddr = in.got->addr; |
135 | objcMsgSendIndex = objcMsgSend->gotIndex; |
136 | ::writeObjCMsgSendFastStub<LP64>(buf, objcStubsFastCode, sym, stubsAddr, |
137 | stubOffset, selrefVA, gotAddr: objcMsgSendAddr, |
138 | msgSendIndex: objcMsgSendIndex); |
139 | } else { |
140 | assert(config->objcStubsMode == ObjCStubsMode::small); |
141 | objcStubSize = target->objcStubsSmallSize; |
142 | if (auto *d = dyn_cast<Defined>(Val: objcMsgSend)) { |
143 | objcMsgSendAddr = d->getVA(); |
144 | objcMsgSendIndex = 0; |
145 | } else { |
146 | objcMsgSendAddr = in.stubs->addr; |
147 | objcMsgSendIndex = objcMsgSend->stubsIndex; |
148 | } |
149 | ::writeObjCMsgSendSmallStub<LP64>(buf, objcStubsSmallCode, sym, stubsAddr, |
150 | stubOffset, selrefVA, msgSendAddr: objcMsgSendAddr, |
151 | msgSendIndex: objcMsgSendIndex); |
152 | } |
153 | stubOffset += objcStubSize; |
154 | } |
155 | |
// A thunk is the relaxed variation of stubCode. We don't need the
// extra indirection through a lazy pointer because the target address
// is known at link time.
static constexpr uint32_t thunkCode[] = {
    0x90000010, // 00: adrp  x16, <thunk.ptr>@page
    0x91000210, // 04: add   x16, [x16,<thunk.ptr>@pageoff]
    0xd61f0200, // 08: br    x16
};
164 | |
165 | void ARM64::populateThunk(InputSection *thunk, Symbol *funcSym) { |
166 | thunk->align = 4; |
167 | thunk->data = {reinterpret_cast<const uint8_t *>(thunkCode), |
168 | sizeof(thunkCode)}; |
169 | thunk->relocs.emplace_back(/*type=*/args: ARM64_RELOC_PAGEOFF12, |
170 | /*pcrel=*/args: false, /*length=*/args: 2, |
171 | /*offset=*/args: 4, /*addend=*/args: 0, |
172 | /*referent=*/args&: funcSym); |
173 | thunk->relocs.emplace_back(/*type=*/args: ARM64_RELOC_PAGE21, |
174 | /*pcrel=*/args: true, /*length=*/args: 2, |
175 | /*offset=*/args: 0, /*addend=*/args: 0, |
176 | /*referent=*/args&: funcSym); |
177 | } |
178 | |
// Sets up the ARM64-specific TargetInfo parameters: CPU identifiers, synthetic
// code-sequence sizes, branch ranges, and the relocation attribute table.
ARM64::ARM64() : ARM64Common(LP64()) {
  cpuType = CPU_TYPE_ARM64;
  cpuSubtype = CPU_SUBTYPE_ARM64_ALL;

  stubSize = sizeof(stubCode);
  thunkSize = sizeof(thunkCode);

  objcStubsFastSize = sizeof(objcStubsFastCode);
  objcStubsFastAlignment = 32;
  objcStubsSmallSize = sizeof(objcStubsSmallCode);
  objcStubsSmallAlignment = 4;

  // Branch immediate is two's complement 26 bits, which is implicitly
  // multiplied by 4 (since all functions are 4-aligned): the branch range
  // is -4*(2**(26-1))..4*(2**(26-1) - 1).
  backwardBranchRange = 128 * 1024 * 1024;
  forwardBranchRange = backwardBranchRange - 4;

  modeDwarfEncoding = UNWIND_ARM64_MODE_DWARF;
  subtractorRelocType = ARM64_RELOC_SUBTRACTOR;
  unsignedRelocType = ARM64_RELOC_UNSIGNED;

  stubHelperHeaderSize = sizeof(stubHelperHeaderCode);
  stubHelperEntrySize = sizeof(stubHelperEntryCode);

  relocAttrs = {relocAttrsArray.data(), relocAttrsArray.size()};
}
206 | |
namespace {
// Decoded form of an adrp instruction.
struct Adrp {
  uint32_t destRegister;
  int64_t addend; // imm21, sign-extended and scaled by the 4096-byte page size
};

// Decoded form of an add (immediate) instruction.
struct Add {
  uint8_t destRegister;
  uint8_t srcRegister;
  uint32_t addend; // unsigned 12-bit immediate
};

// How an ldr extends the loaded value into the destination register.
enum ExtendType { ZeroExtend = 1, Sign64 = 2, Sign32 = 3 };

// Decoded form of an ldr (immediate) instruction.
struct Ldr {
  uint8_t destRegister;
  uint8_t baseRegister;
  uint8_t p2Size; // log2 of the load size in bytes
  bool isFloat;   // destination is a SIMD&FP register
  ExtendType extendType;
  int64_t offset; // byte offset from the base register
};
} // namespace
230 | |
231 | static bool parseAdrp(uint32_t insn, Adrp &adrp) { |
232 | if ((insn & 0x9f000000) != 0x90000000) |
233 | return false; |
234 | adrp.destRegister = insn & 0x1f; |
235 | uint64_t immHi = (insn >> 5) & 0x7ffff; |
236 | uint64_t immLo = (insn >> 29) & 0x3; |
237 | adrp.addend = SignExtend64<21>(x: immLo | (immHi << 2)) * 4096; |
238 | return true; |
239 | } |
240 | |
241 | static bool parseAdd(uint32_t insn, Add &add) { |
242 | if ((insn & 0xffc00000) != 0x91000000) |
243 | return false; |
244 | add.destRegister = insn & 0x1f; |
245 | add.srcRegister = (insn >> 5) & 0x1f; |
246 | add.addend = (insn >> 10) & 0xfff; |
247 | return true; |
248 | } |
249 | |
// Decodes an ldr (unsigned immediate offset) instruction -- integer,
// sign-extending, or SIMD&FP form -- into `ldr`. Returns false (leaving `ldr`
// partially written) for any other instruction.
static bool parseLdr(uint32_t insn, Ldr &ldr) {
  ldr.destRegister = insn & 0x1f;        // Rt
  ldr.baseRegister = (insn >> 5) & 0x1f; // Rn
  uint8_t size = insn >> 30;             // size field (log2 of access size)
  uint8_t opc = (insn >> 22) & 3;

  if ((insn & 0x3fc00000) == 0x39400000) {
    // LDR (immediate), LDRB (immediate), LDRH (immediate)
    ldr.p2Size = size;
    ldr.extendType = ZeroExtend;
    ldr.isFloat = false;
  } else if ((insn & 0x3f800000) == 0x39800000) {
    // LDRSB (immediate), LDRSH (immediate), LDRSW (immediate)
    ldr.p2Size = size;
    // opc distinguishes the 64-bit (Sign64) and 32-bit (Sign32) destinations.
    ldr.extendType = static_cast<ExtendType>(opc);
    ldr.isFloat = false;
  } else if ((insn & 0x3f400000) == 0x3d400000) {
    // LDR (immediate, SIMD&FP)
    ldr.extendType = ZeroExtend;
    ldr.isFloat = true;
    if (opc == 1)
      ldr.p2Size = size;
    else if (size == 0 && opc == 3)
      ldr.p2Size = 4; // 128-bit (Q register) load
    else
      return false;
  } else {
    return false;
  }
  // The encoded imm12 is scaled by the access size.
  ldr.offset = ((insn >> 10) & 0xfff) << ldr.p2Size;
  return true;
}
282 | |
283 | static bool isValidAdrOffset(int32_t delta) { return isInt<21>(x: delta); } |
284 | |
285 | static void writeAdr(void *loc, uint32_t dest, int32_t delta) { |
286 | assert(isValidAdrOffset(delta)); |
287 | uint32_t opcode = 0x10000000; |
288 | uint32_t immHi = (delta & 0x001ffffc) << 3; |
289 | uint32_t immLo = (delta & 0x00000003) << 29; |
290 | write32le(P: loc, V: opcode | immHi | immLo | dest); |
291 | } |
292 | |
293 | static void writeNop(void *loc) { write32le(P: loc, V: 0xd503201f); } |
294 | |
295 | static bool isLiteralLdrEligible(const Ldr &ldr) { |
296 | return ldr.p2Size > 1 && isShiftedInt<19, 2>(x: ldr.offset); |
297 | } |
298 | |
299 | static void writeLiteralLdr(void *loc, const Ldr &ldr) { |
300 | assert(isLiteralLdrEligible(ldr)); |
301 | uint32_t imm19 = (ldr.offset / 4 & maskTrailingOnes<uint32_t>(N: 19)) << 5; |
302 | uint32_t opcode; |
303 | switch (ldr.p2Size) { |
304 | case 2: |
305 | if (ldr.isFloat) |
306 | opcode = 0x1c000000; |
307 | else |
308 | opcode = ldr.extendType == Sign64 ? 0x98000000 : 0x18000000; |
309 | break; |
310 | case 3: |
311 | opcode = ldr.isFloat ? 0x5c000000 : 0x58000000; |
312 | break; |
313 | case 4: |
314 | opcode = 0x9c000000; |
315 | break; |
316 | default: |
317 | llvm_unreachable("Invalid literal ldr size" ); |
318 | } |
319 | write32le(P: loc, V: opcode | imm19 | ldr.destRegister); |
320 | } |
321 | |
322 | static bool isImmediateLdrEligible(const Ldr &ldr) { |
323 | // Note: We deviate from ld64's behavior, which converts to immediate loads |
324 | // only if ldr.offset < 4096, even though the offset is divided by the load's |
325 | // size in the 12-bit immediate operand. Only the unsigned offset variant is |
326 | // supported. |
327 | |
328 | uint32_t size = 1 << ldr.p2Size; |
329 | return ldr.offset >= 0 && (ldr.offset % size) == 0 && |
330 | isUInt<12>(x: ldr.offset >> ldr.p2Size); |
331 | } |
332 | |
333 | static void writeImmediateLdr(void *loc, const Ldr &ldr) { |
334 | assert(isImmediateLdrEligible(ldr)); |
335 | uint32_t opcode = 0x39000000; |
336 | if (ldr.isFloat) { |
337 | opcode |= 0x04000000; |
338 | assert(ldr.extendType == ZeroExtend); |
339 | } |
340 | opcode |= ldr.destRegister; |
341 | opcode |= ldr.baseRegister << 5; |
342 | uint8_t size, opc; |
343 | if (ldr.p2Size == 4) { |
344 | size = 0; |
345 | opc = 3; |
346 | } else { |
347 | opc = ldr.extendType; |
348 | size = ldr.p2Size; |
349 | } |
350 | uint32_t immBits = ldr.offset >> ldr.p2Size; |
351 | write32le(P: loc, V: opcode | (immBits << 10) | (opc << 22) | (size << 30)); |
352 | } |
353 | |
354 | // Transforms a pair of adrp+add instructions into an adr instruction if the |
355 | // target is within the +/- 1 MiB range allowed by the adr's 21 bit signed |
356 | // immediate offset. |
357 | // |
358 | // adrp xN, _foo@PAGE |
359 | // add xM, xN, _foo@PAGEOFF |
360 | // -> |
361 | // adr xM, _foo |
362 | // nop |
363 | static void applyAdrpAdd(uint8_t *buf, const ConcatInputSection *isec, |
364 | uint64_t offset1, uint64_t offset2) { |
365 | uint32_t ins1 = read32le(P: buf + offset1); |
366 | uint32_t ins2 = read32le(P: buf + offset2); |
367 | Adrp adrp; |
368 | Add add; |
369 | if (!parseAdrp(insn: ins1, adrp) || !parseAdd(insn: ins2, add)) |
370 | return; |
371 | if (adrp.destRegister != add.srcRegister) |
372 | return; |
373 | |
374 | uint64_t addr1 = isec->getVA() + offset1; |
375 | uint64_t referent = pageBits(address: addr1) + adrp.addend + add.addend; |
376 | int64_t delta = referent - addr1; |
377 | if (!isValidAdrOffset(delta)) |
378 | return; |
379 | |
380 | writeAdr(loc: buf + offset1, dest: add.destRegister, delta); |
381 | writeNop(loc: buf + offset2); |
382 | } |
383 | |
384 | // Transforms two adrp instructions into a single adrp if their referent |
385 | // addresses are located on the same 4096 byte page. |
386 | // |
387 | // adrp xN, _foo@PAGE |
388 | // adrp xN, _bar@PAGE |
389 | // -> |
390 | // adrp xN, _foo@PAGE |
391 | // nop |
392 | static void applyAdrpAdrp(uint8_t *buf, const ConcatInputSection *isec, |
393 | uint64_t offset1, uint64_t offset2) { |
394 | uint32_t ins1 = read32le(P: buf + offset1); |
395 | uint32_t ins2 = read32le(P: buf + offset2); |
396 | Adrp adrp1, adrp2; |
397 | if (!parseAdrp(insn: ins1, adrp&: adrp1) || !parseAdrp(insn: ins2, adrp&: adrp2)) |
398 | return; |
399 | if (adrp1.destRegister != adrp2.destRegister) |
400 | return; |
401 | |
402 | uint64_t page1 = pageBits(address: offset1 + isec->getVA()) + adrp1.addend; |
403 | uint64_t page2 = pageBits(address: offset2 + isec->getVA()) + adrp2.addend; |
404 | if (page1 != page2) |
405 | return; |
406 | |
407 | writeNop(loc: buf + offset2); |
408 | } |
409 | |
410 | // Transforms a pair of adrp+ldr (immediate) instructions into an ldr (literal) |
411 | // load from a PC-relative address if it is 4-byte aligned and within +/- 1 MiB, |
412 | // as ldr can encode a signed 19-bit offset that gets multiplied by 4. |
413 | // |
414 | // adrp xN, _foo@PAGE |
415 | // ldr xM, [xN, _foo@PAGEOFF] |
416 | // -> |
417 | // nop |
418 | // ldr xM, _foo |
419 | static void applyAdrpLdr(uint8_t *buf, const ConcatInputSection *isec, |
420 | uint64_t offset1, uint64_t offset2) { |
421 | uint32_t ins1 = read32le(P: buf + offset1); |
422 | uint32_t ins2 = read32le(P: buf + offset2); |
423 | Adrp adrp; |
424 | Ldr ldr; |
425 | if (!parseAdrp(insn: ins1, adrp) || !parseLdr(insn: ins2, ldr)) |
426 | return; |
427 | if (adrp.destRegister != ldr.baseRegister) |
428 | return; |
429 | |
430 | uint64_t addr1 = isec->getVA() + offset1; |
431 | uint64_t addr2 = isec->getVA() + offset2; |
432 | uint64_t referent = pageBits(address: addr1) + adrp.addend + ldr.offset; |
433 | ldr.offset = referent - addr2; |
434 | if (!isLiteralLdrEligible(ldr)) |
435 | return; |
436 | |
437 | writeNop(loc: buf + offset1); |
438 | writeLiteralLdr(loc: buf + offset2, ldr); |
439 | } |
440 | |
441 | // GOT loads are emitted by the compiler as a pair of adrp and ldr instructions, |
442 | // but they may be changed to adrp+add by relaxGotLoad(). This hint performs |
443 | // the AdrpLdr or AdrpAdd transformation depending on whether it was relaxed. |
444 | static void applyAdrpLdrGot(uint8_t *buf, const ConcatInputSection *isec, |
445 | uint64_t offset1, uint64_t offset2) { |
446 | uint32_t ins2 = read32le(P: buf + offset2); |
447 | Add add; |
448 | Ldr ldr; |
449 | if (parseAdd(insn: ins2, add)) |
450 | applyAdrpAdd(buf, isec, offset1, offset2); |
451 | else if (parseLdr(insn: ins2, ldr)) |
452 | applyAdrpLdr(buf, isec, offset1, offset2); |
453 | } |
454 | |
455 | // Optimizes an adrp+add+ldr sequence used for loading from a local symbol's |
456 | // address by loading directly if it's close enough, or to an adrp(p)+ldr |
457 | // sequence if it's not. |
458 | // |
459 | // adrp x0, _foo@PAGE |
460 | // add x1, x0, _foo@PAGEOFF |
461 | // ldr x2, [x1, #off] |
462 | static void applyAdrpAddLdr(uint8_t *buf, const ConcatInputSection *isec, |
463 | uint64_t offset1, uint64_t offset2, |
464 | uint64_t offset3) { |
465 | uint32_t ins1 = read32le(P: buf + offset1); |
466 | Adrp adrp; |
467 | if (!parseAdrp(insn: ins1, adrp)) |
468 | return; |
469 | uint32_t ins2 = read32le(P: buf + offset2); |
470 | Add add; |
471 | if (!parseAdd(insn: ins2, add)) |
472 | return; |
473 | uint32_t ins3 = read32le(P: buf + offset3); |
474 | Ldr ldr; |
475 | if (!parseLdr(insn: ins3, ldr)) |
476 | return; |
477 | if (adrp.destRegister != add.srcRegister) |
478 | return; |
479 | if (add.destRegister != ldr.baseRegister) |
480 | return; |
481 | |
482 | // Load from the target address directly. |
483 | // nop |
484 | // nop |
485 | // ldr x2, [_foo + #off] |
486 | uint64_t addr1 = isec->getVA() + offset1; |
487 | uint64_t addr3 = isec->getVA() + offset3; |
488 | uint64_t referent = pageBits(address: addr1) + adrp.addend + add.addend; |
489 | Ldr literalLdr = ldr; |
490 | literalLdr.offset += referent - addr3; |
491 | if (isLiteralLdrEligible(ldr: literalLdr)) { |
492 | writeNop(loc: buf + offset1); |
493 | writeNop(loc: buf + offset2); |
494 | writeLiteralLdr(loc: buf + offset3, ldr: literalLdr); |
495 | return; |
496 | } |
497 | |
498 | // Load the target address into a register and load from there indirectly. |
499 | // adr x1, _foo |
500 | // nop |
501 | // ldr x2, [x1, #off] |
502 | int64_t adrOffset = referent - addr1; |
503 | if (isValidAdrOffset(delta: adrOffset)) { |
504 | writeAdr(loc: buf + offset1, dest: ldr.baseRegister, delta: adrOffset); |
505 | // Note: ld64 moves the offset into the adr instruction for AdrpAddLdr, but |
506 | // not for AdrpLdrGotLdr. Its effect is the same either way. |
507 | writeNop(loc: buf + offset2); |
508 | return; |
509 | } |
510 | |
511 | // Move the target's page offset into the ldr's immediate offset. |
512 | // adrp x0, _foo@PAGE |
513 | // nop |
514 | // ldr x2, [x0, _foo@PAGEOFF + #off] |
515 | Ldr immediateLdr = ldr; |
516 | immediateLdr.baseRegister = adrp.destRegister; |
517 | immediateLdr.offset += add.addend; |
518 | if (isImmediateLdrEligible(ldr: immediateLdr)) { |
519 | writeNop(loc: buf + offset2); |
520 | writeImmediateLdr(loc: buf + offset3, ldr: immediateLdr); |
521 | return; |
522 | } |
523 | } |
524 | |
525 | // Relaxes a GOT-indirect load. |
526 | // If the referenced symbol is external and its GOT entry is within +/- 1 MiB, |
527 | // the GOT entry can be loaded with a single literal ldr instruction. |
528 | // If the referenced symbol is local and thus has been relaxed to adrp+add+ldr, |
529 | // we perform the AdrpAddLdr transformation. |
530 | static void applyAdrpLdrGotLdr(uint8_t *buf, const ConcatInputSection *isec, |
531 | uint64_t offset1, uint64_t offset2, |
532 | uint64_t offset3) { |
533 | uint32_t ins2 = read32le(P: buf + offset2); |
534 | Add add; |
535 | Ldr ldr2; |
536 | |
537 | if (parseAdd(insn: ins2, add)) { |
538 | applyAdrpAddLdr(buf, isec, offset1, offset2, offset3); |
539 | } else if (parseLdr(insn: ins2, ldr&: ldr2)) { |
540 | // adrp x1, _foo@GOTPAGE |
541 | // ldr x2, [x1, _foo@GOTPAGEOFF] |
542 | // ldr x3, [x2, #off] |
543 | |
544 | uint32_t ins1 = read32le(P: buf + offset1); |
545 | Adrp adrp; |
546 | if (!parseAdrp(insn: ins1, adrp)) |
547 | return; |
548 | uint32_t ins3 = read32le(P: buf + offset3); |
549 | Ldr ldr3; |
550 | if (!parseLdr(insn: ins3, ldr&: ldr3)) |
551 | return; |
552 | |
553 | if (ldr2.baseRegister != adrp.destRegister) |
554 | return; |
555 | if (ldr3.baseRegister != ldr2.destRegister) |
556 | return; |
557 | // Loads from the GOT must be pointer sized. |
558 | if (ldr2.p2Size != 3 || ldr2.isFloat) |
559 | return; |
560 | |
561 | uint64_t addr1 = isec->getVA() + offset1; |
562 | uint64_t addr2 = isec->getVA() + offset2; |
563 | uint64_t referent = pageBits(address: addr1) + adrp.addend + ldr2.offset; |
564 | // Load the GOT entry's address directly. |
565 | // nop |
566 | // ldr x2, _foo@GOTPAGE + _foo@GOTPAGEOFF |
567 | // ldr x3, [x2, #off] |
568 | Ldr literalLdr = ldr2; |
569 | literalLdr.offset = referent - addr2; |
570 | if (isLiteralLdrEligible(ldr: literalLdr)) { |
571 | writeNop(loc: buf + offset1); |
572 | writeLiteralLdr(loc: buf + offset2, ldr: literalLdr); |
573 | } |
574 | } |
575 | } |
576 | |
577 | static uint64_t readValue(const uint8_t *&ptr, const uint8_t *end) { |
578 | unsigned int n = 0; |
579 | uint64_t value = decodeULEB128(p: ptr, n: &n, end); |
580 | ptr += n; |
581 | return value; |
582 | } |
583 | |
584 | template <typename Callback> |
585 | static void forEachHint(ArrayRef<uint8_t> data, Callback callback) { |
586 | std::array<uint64_t, 3> args; |
587 | |
588 | for (const uint8_t *p = data.begin(), *end = data.end(); p < end;) { |
589 | uint64_t type = readValue(ptr&: p, end); |
590 | if (type == 0) |
591 | break; |
592 | |
593 | uint64_t argCount = readValue(ptr&: p, end); |
594 | // All known LOH types as of 2022-09 have 3 or fewer arguments; skip others. |
595 | if (argCount > 3) { |
596 | for (unsigned i = 0; i < argCount; ++i) |
597 | readValue(ptr&: p, end); |
598 | continue; |
599 | } |
600 | |
601 | for (unsigned i = 0; i < argCount; ++i) |
602 | args[i] = readValue(ptr&: p, end); |
603 | callback(type, ArrayRef<uint64_t>(args.data(), argCount)); |
604 | } |
605 | } |
606 | |
607 | // On RISC architectures like arm64, materializing a memory address generally |
608 | // takes multiple instructions. If the referenced symbol is located close enough |
609 | // in memory, fewer instructions are needed. |
610 | // |
611 | // Linker optimization hints record where addresses are computed. After |
612 | // addresses have been assigned, if possible, we change them to a shorter |
613 | // sequence of instructions. The size of the binary is not modified; the |
614 | // eliminated instructions are replaced with NOPs. This still leads to faster |
615 | // code as the CPU can skip over NOPs quickly. |
616 | // |
617 | // LOHs are specified by the LC_LINKER_OPTIMIZATION_HINTS load command, which |
618 | // points to a sequence of ULEB128-encoded numbers. Each entry specifies a |
619 | // transformation kind, and 2 or 3 addresses where the instructions are located. |
620 | void ARM64::applyOptimizationHints(uint8_t *outBuf, const ObjFile &obj) const { |
621 | ArrayRef<uint8_t> data = obj.getOptimizationHints(); |
622 | if (data.empty()) |
623 | return; |
624 | |
625 | const ConcatInputSection *section = nullptr; |
626 | uint64_t sectionAddr = 0; |
627 | uint8_t *buf = nullptr; |
628 | |
629 | auto findSection = [&](uint64_t addr) { |
630 | if (section && addr >= sectionAddr && |
631 | addr < sectionAddr + section->getSize()) |
632 | return true; |
633 | |
634 | if (obj.sections.empty()) |
635 | return false; |
636 | auto secIt = std::prev(x: llvm::upper_bound( |
637 | Range: obj.sections, Value&: addr, |
638 | C: [](uint64_t off, const Section *sec) { return off < sec->addr; })); |
639 | const Section *sec = *secIt; |
640 | |
641 | if (sec->subsections.empty()) |
642 | return false; |
643 | auto subsecIt = std::prev(x: llvm::upper_bound( |
644 | Range: sec->subsections, Value: addr - sec->addr, |
645 | C: [](uint64_t off, Subsection subsec) { return off < subsec.offset; })); |
646 | const Subsection &subsec = *subsecIt; |
647 | const ConcatInputSection *isec = |
648 | dyn_cast_or_null<ConcatInputSection>(Val: subsec.isec); |
649 | if (!isec || isec->shouldOmitFromOutput()) |
650 | return false; |
651 | |
652 | section = isec; |
653 | sectionAddr = subsec.offset + sec->addr; |
654 | buf = outBuf + section->outSecOff + section->parent->fileOff; |
655 | return true; |
656 | }; |
657 | |
658 | auto isValidOffset = [&](uint64_t offset) { |
659 | if (offset < sectionAddr || offset >= sectionAddr + section->getSize()) { |
660 | error(msg: toString(file: &obj) + |
661 | ": linker optimization hint spans multiple sections" ); |
662 | return false; |
663 | } |
664 | return true; |
665 | }; |
666 | |
667 | bool hasAdrpAdrp = false; |
668 | forEachHint(data, callback: [&](uint64_t kind, ArrayRef<uint64_t> args) { |
669 | if (kind == LOH_ARM64_ADRP_ADRP) { |
670 | hasAdrpAdrp = true; |
671 | return; |
672 | } |
673 | |
674 | if (!findSection(args[0])) |
675 | return; |
676 | switch (kind) { |
677 | case LOH_ARM64_ADRP_ADD: |
678 | if (isValidOffset(args[1])) |
679 | applyAdrpAdd(buf, isec: section, offset1: args[0] - sectionAddr, |
680 | offset2: args[1] - sectionAddr); |
681 | break; |
682 | case LOH_ARM64_ADRP_LDR: |
683 | if (isValidOffset(args[1])) |
684 | applyAdrpLdr(buf, isec: section, offset1: args[0] - sectionAddr, |
685 | offset2: args[1] - sectionAddr); |
686 | break; |
687 | case LOH_ARM64_ADRP_LDR_GOT: |
688 | if (isValidOffset(args[1])) |
689 | applyAdrpLdrGot(buf, isec: section, offset1: args[0] - sectionAddr, |
690 | offset2: args[1] - sectionAddr); |
691 | break; |
692 | case LOH_ARM64_ADRP_ADD_LDR: |
693 | if (isValidOffset(args[1]) && isValidOffset(args[2])) |
694 | applyAdrpAddLdr(buf, isec: section, offset1: args[0] - sectionAddr, |
695 | offset2: args[1] - sectionAddr, offset3: args[2] - sectionAddr); |
696 | break; |
697 | case LOH_ARM64_ADRP_LDR_GOT_LDR: |
698 | if (isValidOffset(args[1]) && isValidOffset(args[2])) |
699 | applyAdrpLdrGotLdr(buf, isec: section, offset1: args[0] - sectionAddr, |
700 | offset2: args[1] - sectionAddr, offset3: args[2] - sectionAddr); |
701 | break; |
702 | case LOH_ARM64_ADRP_ADD_STR: |
703 | case LOH_ARM64_ADRP_LDR_GOT_STR: |
704 | // TODO: Implement these |
705 | break; |
706 | } |
707 | }); |
708 | |
709 | if (!hasAdrpAdrp) |
710 | return; |
711 | |
712 | // AdrpAdrp optimization hints are performed in a second pass because they |
713 | // might interfere with other transformations. For instance, consider the |
714 | // following input: |
715 | // |
716 | // adrp x0, _foo@PAGE |
717 | // add x1, x0, _foo@PAGEOFF |
718 | // adrp x0, _bar@PAGE |
719 | // add x2, x0, _bar@PAGEOFF |
720 | // |
721 | // If we perform the AdrpAdrp relaxation first, we get: |
722 | // |
723 | // adrp x0, _foo@PAGE |
724 | // add x1, x0, _foo@PAGEOFF |
725 | // nop |
726 | // add x2, x0, _bar@PAGEOFF |
727 | // |
728 | // If we then apply AdrpAdd to the first two instructions, the add will have a |
729 | // garbage value in x0: |
730 | // |
731 | // adr x1, _foo |
732 | // nop |
733 | // nop |
734 | // add x2, x0, _bar@PAGEOFF |
735 | forEachHint(data, callback: [&](uint64_t kind, ArrayRef<uint64_t> args) { |
736 | if (kind != LOH_ARM64_ADRP_ADRP) |
737 | return; |
738 | if (!findSection(args[0])) |
739 | return; |
740 | if (isValidOffset(args[1])) |
741 | applyAdrpAdrp(buf, isec: section, offset1: args[0] - sectionAddr, offset2: args[1] - sectionAddr); |
742 | }); |
743 | } |
744 | |
// Returns the singleton TargetInfo for arm64 Mach-O.
TargetInfo *macho::createARM64TargetInfo() {
  static ARM64 t;
  return &t;
}
749 | |