1 | //===- ExportTrie.cpp -----------------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This is a partial implementation of the Mach-O export trie format. It's |
10 | // essentially a symbol table encoded as a compressed prefix trie, meaning that |
11 | // the common prefixes of each symbol name are shared for a more compact |
12 | // representation. The prefixes are stored on the edges of the trie, and one |
13 | // edge can represent multiple characters. For example, given two exported |
14 | // symbols _bar and _baz, we will have a trie like this (terminal nodes are |
15 | // marked with an asterisk): |
16 | // |
17 | // +-+-+ |
18 | // | | // root node |
19 | // +-+-+ |
20 | // | |
21 | // | _ba |
22 | // | |
23 | // +-+-+ |
24 | // | | |
25 | // +-+-+ |
26 | // r / \ z |
27 | // / \ |
28 | // +-+-+ +-+-+ |
29 | // | * | | * | |
30 | // +-+-+ +-+-+ |
31 | // |
32 | // More documentation of the format can be found in |
33 | // llvm/tools/obj2yaml/macho2yaml.cpp. |
34 | // |
35 | //===----------------------------------------------------------------------===// |
36 | |
37 | #include "ExportTrie.h" |
38 | #include "Symbols.h" |
39 | |
40 | #include "lld/Common/ErrorHandler.h" |
41 | #include "lld/Common/Memory.h" |
42 | #include "llvm/BinaryFormat/MachO.h" |
43 | #include "llvm/Support/LEB128.h" |
44 | #include <optional> |
45 | |
46 | using namespace llvm; |
47 | using namespace lld; |
48 | using namespace lld::macho; |
49 | |
50 | namespace { |
51 | |
52 | struct Edge { |
53 | Edge(StringRef s, TrieNode *node) : substring(s), child(node) {} |
54 | |
55 | StringRef substring; |
56 | struct TrieNode *child; |
57 | }; |
58 | |
59 | struct ExportInfo { |
60 | uint64_t address; |
61 | uint64_t ordinal = 0; |
62 | uint8_t flags = 0; |
63 | ExportInfo(const Symbol &sym, uint64_t imageBase) |
64 | : address(sym.getVA() - imageBase) { |
65 | using namespace llvm::MachO; |
66 | if (sym.isWeakDef()) |
67 | flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION; |
68 | if (sym.isTlv()) |
69 | flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL; |
70 | // TODO: Add proper support for stub-and-resolver flags. |
71 | |
72 | if (auto *defined = dyn_cast<Defined>(Val: &sym)) { |
73 | if (defined->isAbsolute()) |
74 | flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE; |
75 | } else if (auto *dysym = dyn_cast<DylibSymbol>(Val: &sym)) { |
76 | flags |= EXPORT_SYMBOL_FLAGS_REEXPORT; |
77 | if (!dysym->isDynamicLookup()) |
78 | ordinal = dysym->getFile()->ordinal; |
79 | } |
80 | } |
81 | }; |
82 | |
83 | } // namespace |
84 | |
85 | struct macho::TrieNode { |
86 | std::vector<Edge> edges; |
87 | std::optional<ExportInfo> info; |
88 | // Estimated offset from the start of the serialized trie to the current node. |
89 | // This will converge to the true offset when updateOffset() is run to a |
90 | // fixpoint. |
91 | size_t offset = 0; |
92 | |
93 | uint32_t getTerminalSize() const; |
94 | // Returns whether the new estimated offset differs from the old one. |
95 | bool updateOffset(size_t &nextOffset); |
96 | void writeTo(uint8_t *buf) const; |
97 | }; |
98 | |
99 | // For regular symbols, the node layout (excluding the children) is |
100 | // |
101 | // uleb128 terminalSize; |
102 | // uleb128 flags; |
103 | // uleb128 address; |
104 | // |
105 | // For re-exported symbols, the layout is |
106 | // |
107 | // uleb128 terminalSize; |
108 | // uleb128 flags; |
109 | // uleb128 ordinal; |
110 | // char[] originalName; |
111 | // |
112 | // If libfoo.dylib is linked against libbar.dylib, and libfoo exports an alias |
113 | // _foo to a symbol _bar in libbar, then originalName will be "_bar". If libfoo |
114 | // re-exports _bar directly (i.e. not via an alias), then originalName will be |
115 | // the empty string. |
116 | // |
117 | // TODO: Support aliased re-exports. (Since we don't yet support these, |
118 | // originalName will always be the empty string.) |
119 | // |
120 | // For stub-and-resolver nodes, the layout is |
121 | // |
122 | // uleb128 terminalSize; |
123 | // uleb128 flags; |
124 | // uleb128 stubAddress; |
125 | // uleb128 resolverAddress; |
126 | // |
127 | // TODO: Support stub-and-resolver nodes. |
128 | uint32_t TrieNode::getTerminalSize() const { |
129 | uint32_t size = getULEB128Size(Value: info->flags); |
130 | if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) |
131 | size += getULEB128Size(Value: info->ordinal) + 1; // + 1 for the null-terminator |
132 | else |
133 | size += getULEB128Size(Value: info->address); |
134 | return size; |
135 | } |
136 | |
137 | bool TrieNode::updateOffset(size_t &nextOffset) { |
138 | // Size of the whole node (including the terminalSize and the outgoing edges.) |
139 | // In contrast, terminalSize only records the size of the other data in the |
140 | // node. |
141 | size_t nodeSize; |
142 | if (info) { |
143 | uint32_t terminalSize = getTerminalSize(); |
144 | // Overall node size so far is the uleb128 size of the length of the symbol |
145 | // info + the symbol info itself. |
146 | nodeSize = terminalSize + getULEB128Size(Value: terminalSize); |
147 | } else { |
148 | nodeSize = 1; // Size of terminalSize (which has a value of 0) |
149 | } |
150 | // Compute size of all child edges. |
151 | ++nodeSize; // Byte for number of children. |
152 | for (const Edge &edge : edges) { |
153 | nodeSize += edge.substring.size() + 1 // String length. |
154 | + getULEB128Size(Value: edge.child->offset); // Offset len. |
155 | } |
156 | // On input, 'nextOffset' is the new preferred location for this node. |
157 | bool result = (offset != nextOffset); |
158 | // Store new location in node object for use by parents. |
159 | offset = nextOffset; |
160 | nextOffset += nodeSize; |
161 | return result; |
162 | } |
163 | |
164 | void TrieNode::writeTo(uint8_t *buf) const { |
165 | buf += offset; |
166 | if (info) { |
167 | uint32_t terminalSize = getTerminalSize(); |
168 | buf += encodeULEB128(Value: terminalSize, p: buf); |
169 | buf += encodeULEB128(Value: info->flags, p: buf); |
170 | if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) { |
171 | buf += encodeULEB128(Value: info->ordinal, p: buf); |
172 | *buf++ = 0; // empty originalName string |
173 | } else { |
174 | buf += encodeULEB128(Value: info->address, p: buf); |
175 | } |
176 | } else { |
177 | // TrieNode with no Symbol info. |
178 | *buf++ = 0; // terminalSize |
179 | } |
180 | // Add number of children. TODO: Handle case where we have more than 256. |
181 | assert(edges.size() < 256); |
182 | *buf++ = edges.size(); |
183 | // Append each child edge substring and node offset. |
184 | for (const Edge &edge : edges) { |
185 | memcpy(dest: buf, src: edge.substring.data(), n: edge.substring.size()); |
186 | buf += edge.substring.size(); |
187 | *buf++ = '\0'; |
188 | buf += encodeULEB128(Value: edge.child->offset, p: buf); |
189 | } |
190 | } |
191 | |
192 | TrieBuilder::~TrieBuilder() { |
193 | for (TrieNode *node : nodes) |
194 | delete node; |
195 | } |
196 | |
197 | TrieNode *TrieBuilder::makeNode() { |
198 | auto *node = new TrieNode(); |
199 | nodes.emplace_back(args&: node); |
200 | return node; |
201 | } |
202 | |
203 | static int charAt(const Symbol *sym, size_t pos) { |
204 | StringRef str = sym->getName(); |
205 | if (pos >= str.size()) |
206 | return -1; |
207 | return str[pos]; |
208 | } |
209 | |
210 | // Build the trie by performing a three-way radix quicksort: We start by sorting |
211 | // the strings by their first characters, then sort the strings with the same |
212 | // first characters by their second characters, and so on recursively. Each |
213 | // time the prefixes diverge, we add a node to the trie. |
214 | // |
215 | // node: The most recently created node along this path in the trie (i.e. |
216 | // the furthest from the root.) |
217 | // lastPos: The prefix length of the most recently created node, i.e. the number |
218 | // of characters along its path from the root. |
219 | // pos: The string index we are currently sorting on. Note that each symbol |
220 | // S contained in vec has the same prefix S[0...pos). |
221 | void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec, |
222 | TrieNode *node, size_t lastPos, size_t pos) { |
223 | tailcall: |
224 | if (vec.empty()) |
225 | return; |
226 | |
227 | // Partition items so that items in [0, i) are less than the pivot, |
228 | // [i, j) are the same as the pivot, and [j, vec.size()) are greater than |
229 | // the pivot. |
230 | const Symbol *pivotSymbol = vec[vec.size() / 2]; |
231 | int pivot = charAt(sym: pivotSymbol, pos); |
232 | size_t i = 0; |
233 | size_t j = vec.size(); |
234 | for (size_t k = 0; k < j;) { |
235 | int c = charAt(sym: vec[k], pos); |
236 | if (c < pivot) |
237 | std::swap(a&: vec[i++], b&: vec[k++]); |
238 | else if (c > pivot) |
239 | std::swap(a&: vec[--j], b&: vec[k]); |
240 | else |
241 | k++; |
242 | } |
243 | |
244 | bool isTerminal = pivot == -1; |
245 | bool prefixesDiverge = i != 0 || j != vec.size(); |
246 | if (lastPos != pos && (isTerminal || prefixesDiverge)) { |
247 | TrieNode *newNode = makeNode(); |
248 | node->edges.emplace_back(args: pivotSymbol->getName().slice(Start: lastPos, End: pos), |
249 | args&: newNode); |
250 | node = newNode; |
251 | lastPos = pos; |
252 | } |
253 | |
254 | sortAndBuild(vec: vec.slice(N: 0, M: i), node, lastPos, pos); |
255 | sortAndBuild(vec: vec.slice(N: j), node, lastPos, pos); |
256 | |
257 | if (isTerminal) { |
258 | assert(j - i == 1); // no duplicate symbols |
259 | node->info = ExportInfo(*pivotSymbol, imageBase); |
260 | } else { |
261 | // This is the tail-call-optimized version of the following: |
262 | // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1); |
263 | vec = vec.slice(N: i, M: j - i); |
264 | ++pos; |
265 | goto tailcall; |
266 | } |
267 | } |
268 | |
269 | size_t TrieBuilder::build() { |
270 | if (exported.empty()) |
271 | return 0; |
272 | |
273 | TrieNode *root = makeNode(); |
274 | sortAndBuild(vec: exported, node: root, lastPos: 0, pos: 0); |
275 | |
276 | // Assign each node in the vector an offset in the trie stream, iterating |
277 | // until all uleb128 sizes have stabilized. |
278 | size_t offset; |
279 | bool more; |
280 | do { |
281 | offset = 0; |
282 | more = false; |
283 | for (TrieNode *node : nodes) |
284 | more |= node->updateOffset(nextOffset&: offset); |
285 | } while (more); |
286 | |
287 | return offset; |
288 | } |
289 | |
290 | void TrieBuilder::writeTo(uint8_t *buf) const { |
291 | for (TrieNode *node : nodes) |
292 | node->writeTo(buf); |
293 | } |
294 | |
295 | namespace { |
296 | |
297 | // Parse a serialized trie and invoke a callback for each entry. |
298 | class TrieParser { |
299 | public: |
300 | TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback) |
301 | : start(buf), end(start + size), callback(callback) {} |
302 | |
303 | void parse(const uint8_t *buf, const Twine &cumulativeString); |
304 | |
305 | void parse() { parse(buf: start, cumulativeString: "" ); } |
306 | |
307 | const uint8_t *start; |
308 | const uint8_t *end; |
309 | const TrieEntryCallback &callback; |
310 | }; |
311 | |
312 | } // namespace |
313 | |
314 | void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) { |
315 | if (buf >= end) |
316 | fatal(msg: "Node offset points outside export section" ); |
317 | |
318 | unsigned ulebSize; |
319 | uint64_t terminalSize = decodeULEB128(p: buf, n: &ulebSize); |
320 | buf += ulebSize; |
321 | uint64_t flags = 0; |
322 | size_t offset; |
323 | if (terminalSize != 0) { |
324 | flags = decodeULEB128(p: buf, n: &ulebSize); |
325 | callback(cumulativeString, flags); |
326 | } |
327 | buf += terminalSize; |
328 | uint8_t numEdges = *buf++; |
329 | for (uint8_t i = 0; i < numEdges; ++i) { |
330 | const char *cbuf = reinterpret_cast<const char *>(buf); |
331 | StringRef substring = StringRef(cbuf, strnlen(string: cbuf, maxlen: end - buf)); |
332 | buf += substring.size() + 1; |
333 | offset = decodeULEB128(p: buf, n: &ulebSize); |
334 | buf += ulebSize; |
335 | parse(buf: start + offset, cumulativeString: cumulativeString + substring); |
336 | } |
337 | } |
338 | |
339 | void macho::parseTrie(const uint8_t *buf, size_t size, |
340 | const TrieEntryCallback &callback) { |
341 | if (size == 0) |
342 | return; |
343 | |
344 | TrieParser(buf, size, callback).parse(); |
345 | } |
346 | |