//===- ExportTrie.cpp -----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a partial implementation of the Mach-O export trie format. It's
// essentially a symbol table encoded as a compressed prefix trie, meaning that
// the common prefixes of each symbol name are shared for a more compact
// representation. The prefixes are stored on the edges of the trie, and one
// edge can represent multiple characters. For example, given two exported
// symbols _bar and _baz, we will have a trie like this (terminal nodes are
// marked with an asterisk):
//
//              +-+-+
//              |   | // root node
//              +-+-+
//                |
//                | _ba
//                |
//              +-+-+
//              |   |
//              +-+-+
//           r /     \ z
//            /       \
//        +-+-+       +-+-+
//        | * |       | * |
//        +-+-+       +-+-+
//
// More documentation of the format can be found in
// llvm/tools/obj2yaml/macho2yaml.cpp.
//
//===----------------------------------------------------------------------===//
36
37#include "ExportTrie.h"
38#include "Symbols.h"
39
40#include "lld/Common/ErrorHandler.h"
41#include "llvm/BinaryFormat/MachO.h"
42#include "llvm/Support/LEB128.h"
43#include <optional>
44
45using namespace llvm;
46using namespace lld;
47using namespace lld::macho;
48
49namespace {
50
51struct Edge {
52 Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
53
54 StringRef substring;
55 struct TrieNode *child;
56};
57
58struct ExportInfo {
59 uint64_t address;
60 uint64_t ordinal = 0;
61 uint8_t flags = 0;
62 ExportInfo(const Symbol &sym, uint64_t imageBase)
63 : address(sym.getVA() - imageBase) {
64 using namespace llvm::MachO;
65 if (sym.isWeakDef())
66 flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;
67 if (sym.isTlv())
68 flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;
69 // TODO: Add proper support for stub-and-resolver flags.
70
71 if (auto *defined = dyn_cast<Defined>(Val: &sym)) {
72 if (defined->isAbsolute())
73 flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE;
74 } else if (auto *dysym = dyn_cast<DylibSymbol>(Val: &sym)) {
75 flags |= EXPORT_SYMBOL_FLAGS_REEXPORT;
76 if (!dysym->isDynamicLookup())
77 ordinal = dysym->getFile()->ordinal;
78 }
79 }
80};
81
82} // namespace
83
84struct macho::TrieNode {
85 std::vector<Edge> edges;
86 std::optional<ExportInfo> info;
87 // Estimated offset from the start of the serialized trie to the current node.
88 // This will converge to the true offset when updateOffset() is run to a
89 // fixpoint.
90 size_t offset = 0;
91
92 uint32_t getTerminalSize() const;
93 // Returns whether the new estimated offset differs from the old one.
94 bool updateOffset(size_t &nextOffset);
95 void writeTo(uint8_t *buf) const;
96};
97
98// For regular symbols, the node layout (excluding the children) is
99//
100// uleb128 terminalSize;
101// uleb128 flags;
102// uleb128 address;
103//
104// For re-exported symbols, the layout is
105//
106// uleb128 terminalSize;
107// uleb128 flags;
108// uleb128 ordinal;
109// char[] originalName;
110//
111// If libfoo.dylib is linked against libbar.dylib, and libfoo exports an alias
112// _foo to a symbol _bar in libbar, then originalName will be "_bar". If libfoo
113// re-exports _bar directly (i.e. not via an alias), then originalName will be
114// the empty string.
115//
116// TODO: Support aliased re-exports. (Since we don't yet support these,
117// originalName will always be the empty string.)
118//
119// For stub-and-resolver nodes, the layout is
120//
121// uleb128 terminalSize;
122// uleb128 flags;
123// uleb128 stubAddress;
124// uleb128 resolverAddress;
125//
126// TODO: Support stub-and-resolver nodes.
127uint32_t TrieNode::getTerminalSize() const {
128 uint32_t size = getULEB128Size(Value: info->flags);
129 if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT)
130 size += getULEB128Size(Value: info->ordinal) + 1; // + 1 for the null-terminator
131 else
132 size += getULEB128Size(Value: info->address);
133 return size;
134}
135
136bool TrieNode::updateOffset(size_t &nextOffset) {
137 // Size of the whole node (including the terminalSize and the outgoing edges.)
138 // In contrast, terminalSize only records the size of the other data in the
139 // node.
140 size_t nodeSize;
141 if (info) {
142 uint32_t terminalSize = getTerminalSize();
143 // Overall node size so far is the uleb128 size of the length of the symbol
144 // info + the symbol info itself.
145 nodeSize = terminalSize + getULEB128Size(Value: terminalSize);
146 } else {
147 nodeSize = 1; // Size of terminalSize (which has a value of 0)
148 }
149 // Compute size of all child edges.
150 ++nodeSize; // Byte for number of children.
151 for (const Edge &edge : edges) {
152 nodeSize += edge.substring.size() + 1 // String length.
153 + getULEB128Size(Value: edge.child->offset); // Offset len.
154 }
155 // On input, 'nextOffset' is the new preferred location for this node.
156 bool result = (offset != nextOffset);
157 // Store new location in node object for use by parents.
158 offset = nextOffset;
159 nextOffset += nodeSize;
160 return result;
161}
162
163void TrieNode::writeTo(uint8_t *buf) const {
164 buf += offset;
165 if (info) {
166 uint32_t terminalSize = getTerminalSize();
167 buf += encodeULEB128(Value: terminalSize, p: buf);
168 buf += encodeULEB128(Value: info->flags, p: buf);
169 if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) {
170 buf += encodeULEB128(Value: info->ordinal, p: buf);
171 *buf++ = 0; // empty originalName string
172 } else {
173 buf += encodeULEB128(Value: info->address, p: buf);
174 }
175 } else {
176 // TrieNode with no Symbol info.
177 *buf++ = 0; // terminalSize
178 }
179 // Add number of children. TODO: Handle case where we have more than 256.
180 assert(edges.size() < 256);
181 *buf++ = edges.size();
182 // Append each child edge substring and node offset.
183 for (const Edge &edge : edges) {
184 memcpy(dest: buf, src: edge.substring.data(), n: edge.substring.size());
185 buf += edge.substring.size();
186 *buf++ = '\0';
187 buf += encodeULEB128(Value: edge.child->offset, p: buf);
188 }
189}
190
191TrieBuilder::~TrieBuilder() {
192 for (TrieNode *node : nodes)
193 delete node;
194}
195
196TrieNode *TrieBuilder::makeNode() {
197 auto *node = new TrieNode();
198 nodes.emplace_back(args&: node);
199 return node;
200}
201
202static int charAt(const Symbol *sym, size_t pos) {
203 StringRef str = sym->getName();
204 if (pos >= str.size())
205 return -1;
206 return str[pos];
207}
208
209// Build the trie by performing a three-way radix quicksort: We start by sorting
210// the strings by their first characters, then sort the strings with the same
211// first characters by their second characters, and so on recursively. Each
212// time the prefixes diverge, we add a node to the trie.
213//
214// node: The most recently created node along this path in the trie (i.e.
215// the furthest from the root.)
216// lastPos: The prefix length of the most recently created node, i.e. the number
217// of characters along its path from the root.
218// pos: The string index we are currently sorting on. Note that each symbol
219// S contained in vec has the same prefix S[0...pos).
220void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
221 TrieNode *node, size_t lastPos, size_t pos) {
222tailcall:
223 if (vec.empty())
224 return;
225
226 // Partition items so that items in [0, i) are less than the pivot,
227 // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
228 // the pivot.
229 const Symbol *pivotSymbol = vec[vec.size() / 2];
230 int pivot = charAt(sym: pivotSymbol, pos);
231 size_t i = 0;
232 size_t j = vec.size();
233 for (size_t k = 0; k < j;) {
234 int c = charAt(sym: vec[k], pos);
235 if (c < pivot)
236 std::swap(a&: vec[i++], b&: vec[k++]);
237 else if (c > pivot)
238 std::swap(a&: vec[--j], b&: vec[k]);
239 else
240 k++;
241 }
242
243 bool isTerminal = pivot == -1;
244 bool prefixesDiverge = i != 0 || j != vec.size();
245 if (lastPos != pos && (isTerminal || prefixesDiverge)) {
246 TrieNode *newNode = makeNode();
247 node->edges.emplace_back(args: pivotSymbol->getName().slice(Start: lastPos, End: pos),
248 args&: newNode);
249 node = newNode;
250 lastPos = pos;
251 }
252
253 sortAndBuild(vec: vec.slice(N: 0, M: i), node, lastPos, pos);
254 sortAndBuild(vec: vec.slice(N: j), node, lastPos, pos);
255
256 if (isTerminal) {
257 assert(j - i == 1); // no duplicate symbols
258 node->info = ExportInfo(*pivotSymbol, imageBase);
259 } else {
260 // This is the tail-call-optimized version of the following:
261 // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
262 vec = vec.slice(N: i, M: j - i);
263 ++pos;
264 goto tailcall;
265 }
266}
267
268size_t TrieBuilder::build() {
269 if (exported.empty())
270 return 0;
271
272 TrieNode *root = makeNode();
273 sortAndBuild(vec: exported, node: root, lastPos: 0, pos: 0);
274
275 // Assign each node in the vector an offset in the trie stream, iterating
276 // until all uleb128 sizes have stabilized.
277 size_t offset;
278 bool more;
279 do {
280 offset = 0;
281 more = false;
282 for (TrieNode *node : nodes)
283 more |= node->updateOffset(nextOffset&: offset);
284 } while (more);
285
286 return offset;
287}
288
289void TrieBuilder::writeTo(uint8_t *buf) const {
290 for (TrieNode *node : nodes)
291 node->writeTo(buf);
292}
293
294namespace {
295
296// Parse a serialized trie and invoke a callback for each entry.
297class TrieParser {
298public:
299 TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
300 : start(buf), end(start + size), callback(callback) {}
301
302 void parse(const uint8_t *buf, const Twine &cumulativeString);
303
304 void parse() { parse(buf: start, cumulativeString: ""); }
305
306 const uint8_t *start;
307 const uint8_t *end;
308 const TrieEntryCallback &callback;
309};
310
311} // namespace
312
313void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
314 if (buf >= end)
315 fatal(msg: "Node offset points outside export section");
316
317 unsigned ulebSize;
318 uint64_t terminalSize = decodeULEB128(p: buf, n: &ulebSize);
319 buf += ulebSize;
320 uint64_t flags = 0;
321 size_t offset;
322 if (terminalSize != 0) {
323 flags = decodeULEB128(p: buf, n: &ulebSize);
324 callback(cumulativeString, flags);
325 }
326 buf += terminalSize;
327 uint8_t numEdges = *buf++;
328 for (uint8_t i = 0; i < numEdges; ++i) {
329 const char *cbuf = reinterpret_cast<const char *>(buf);
330 StringRef substring = StringRef(cbuf, strnlen(string: cbuf, maxlen: end - buf));
331 buf += substring.size() + 1;
332 offset = decodeULEB128(p: buf, n: &ulebSize);
333 buf += ulebSize;
334 parse(buf: start + offset, cumulativeString: cumulativeString + substring);
335 }
336}
337
338void macho::parseTrie(const uint8_t *buf, size_t size,
339 const TrieEntryCallback &callback) {
340 if (size == 0)
341 return;
342
343 TrieParser(buf, size, callback).parse();
344}
345

Provided by KDAB

Privacy Policy
Improve your Profiling and Debugging skills
Find out more

source code of lld/MachO/ExportTrie.cpp