ICF.cpp source code [lld/ELF/ICF.cpp]

1	//===- ICF.cpp ------------------------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// ICF is short for Identical Code Folding. This is a size optimization to
10	// identify and merge two or more read-only sections (typically functions)
11	// that happened to have the same contents. It usually reduces output size
12	// by a few percent.
13	//
14	// In ICF, two sections are considered identical if they have the same
15	// section flags, section data, and relocations. Relocations are tricky,
16	// because two relocations are considered the same if they have the same
17	// relocation types, values, and if they point to the same sections in
18	// terms of ICF.
19	//
20	// Here is an example. If foo and bar defined below are compiled to the
21	// same machine instructions, ICF can and should merge the two, although
22	// their relocations point to each other.
23	//
24	// void foo() { bar(); }
25	// void bar() { foo(); }
26	//
27	// If you merge the two, their relocations point to the same section and
28	// thus you know they are mergeable, but how do you know they are
29	// mergeable in the first place? This is not an easy problem to solve.
30	//
31	// What we are doing in LLD is to partition sections into equivalence
32	// classes. Sections in the same equivalence class when the algorithm
33	// terminates are considered identical. Here are details:
34	//
35	// 1. First, we partition sections using their hash values as keys. Hash
36	// values contain section types, section contents and numbers of
37	// relocations. During this step, relocation targets are not taken into
38	// account. We just put sections that apparently differ into different
39	// equivalence classes.
40	//
41	// 2. Next, for each equivalence class, we visit sections to compare
42	// relocation targets. Relocation targets are considered equivalent if
43	// their targets are in the same equivalence class. Sections with
44	// different relocation targets are put into different equivalence
45	// classes.
46	//
47	// 3. If we split an equivalence class in step 2, two relocations that
48	// previously targeted the same equivalence class may now target
49	// different equivalence classes. Therefore, we repeat step 2 until
50	// convergence is obtained.
51	//
52	// 4. For each equivalence class C, pick an arbitrary section in C, and
53	// merge all the other sections in C with it.
54	//
55	// For small programs, this algorithm needs 3-5 iterations. For large
56	// programs such as Chromium, it takes more than 20 iterations.
57	//
58	// This algorithm was mentioned as an "optimistic algorithm" in [1],
59	// though gold implements a different algorithm than this.
60	//
61	// We parallelize each step so that multiple threads can work on different
62	// equivalence classes concurrently. That gave us a large performance
63	// boost when applying ICF on large programs. For example, MSVC link.exe
64	// or GNU gold takes 10-20 seconds to apply ICF on Chromium, whose output
65	// size is about 1.5 GB, but LLD can finish it in less than 2 seconds on a
66	// 2.8 GHz 40 core machine. Even without threading, LLD's ICF is still
67	// faster than MSVC or gold though.
68	//
69	// [1] Safe ICF: Pointer Safe and Unwinding aware Identical Code Folding
70	// in the Gold Linker
71	// http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36912.pdf
72	//
73	//===----------------------------------------------------------------------===//
74
75	#include "ICF.h"
76	#include "Config.h"
77	#include "InputFiles.h"
78	#include "LinkerScript.h"
79	#include "OutputSections.h"
80	#include "SymbolTable.h"
81	#include "Symbols.h"
82	#include "SyntheticSections.h"
83	#include "llvm/BinaryFormat/ELF.h"
84	#include "llvm/Support/Parallel.h"
85	#include "llvm/Support/TimeProfiler.h"
86	#include "llvm/Support/xxhash.h"
87	#include <algorithm>
88	#include <atomic>
89
90	using namespace llvm;
91	using namespace llvm::ELF;
92	using namespace llvm::object;
93	using namespace lld;
94	using namespace lld::elf;
95
96	namespace {
97	template <class ELFT> class ICF {
98	public:
99	ICF(Ctx &ctx) : ctx(ctx) {}
100	void run();
101
102	private:
103	void segregate(size_t begin, size_t end, uint32_t eqClassBase, bool constant);
104
105	template <class RelTy>
106	bool constantEq(const InputSection *a, Relocs<RelTy> relsA,
107	const InputSection *b, Relocs<RelTy> relsB);
108
109	template <class RelTy>
110	bool variableEq(const InputSection *a, Relocs<RelTy> relsA,
111	const InputSection *b, Relocs<RelTy> relsB);
112
113	bool equalsConstant(const InputSection a, const* InputSection *b);
114	bool equalsVariable(const InputSection a, const* InputSection *b);
115
116	size_t findBoundary(size_t begin, size_t end);
117
118	void forEachClassRange(size_t begin, size_t end,
119	llvm::function_ref<void(size_t, size_t)> fn);
120
121	void parallelForEachClass(llvm::function_ref<void(size_t, size_t)> fn);
122
123	Ctx &ctx;
124	SmallVector<InputSection *, `0`> sections;
125
126	// We repeat the main loop while `Repeat` is true.
127	std::atomic<bool> repeat;
128
129	// The main loop counter.
130	int cnt = `0`;
131
132	// We have two locations for equivalence classes. On the first iteration
133	// of the main loop, Class[0] has a valid value, and Class[1] contains
134	// garbage. We read equivalence classes from slot 0 and write to slot 1.
135	// So, Class[0] represents the current class, and Class[1] represents
136	// the next class. On each iteration, we switch their roles and use them
137	// alternately.
138	//
139	// Why are we doing this? Recall that other threads may be working on
140	// other equivalence classes in parallel. They may read sections that we
141	// are updating. We cannot update equivalence classes in place because
142	// it breaks the invariance that all possibly-identical sections must be
143	// in the same equivalence class at any moment. In other words, the for
144	// loop to update equivalence classes is not atomic, and that is
145	// observable from other threads. By writing new classes to other
146	// places, we can keep the invariance.
147	//
148	// Below, `Current` has the index of the current class, and `Next` has
149	// the index of the next class. If threading is enabled, they are either
150	// (0, 1) or (1, 0).
151	//
152	// Note on single-thread: if that's the case, they are always (0, 0)
153	// because we can safely read the next class without worrying about race
154	// conditions. Using the same location makes this algorithm converge
155	// faster because it uses results of the same iteration earlier.
156	int current = `0`;
157	int next = `0`;
158	};
159	}
160
161	// Returns true if section S is subject of ICF.
162	static bool isEligible(InputSection *s) {
163	if (!s->isLive() \|\| s->keepUnique \|\| !(s->flags & SHF_ALLOC))
164	return false;
165
166	// Don't merge writable sections. .data.rel.ro sections are marked as writable
167	// but are semantically read-only.
168	if ((s->flags & SHF_WRITE) && s->name != ".data.rel.ro" &&
169	!s->name.starts_with(Prefix: ".data.rel.ro."))
170	return false;
171
172	// SHF_LINK_ORDER sections are ICF'd as a unit with their dependent sections,
173	// so we don't consider them for ICF individually.
174	if (s->flags & SHF_LINK_ORDER)
175	return false;
176
177	// Don't merge synthetic sections as their Data member is not valid and empty.
178	// The Data member needs to be valid for ICF as it is used by ICF to determine
179	// the equality of section contents.
180	if (isa<SyntheticSection>(Val: s))
181	return false;
182
183	// .init and .fini contains instructions that must be executed to initialize
184	// and finalize the process. They cannot and should not be merged.
185	if (s->name == ".init" \|\| s->name == ".fini")
186	return false;
187
188	// A user program may enumerate sections named with a C identifier using
189	// __start_ and __stop_* symbols. We cannot ICF any such sections because*
190	// that could change program semantics.
191	if (isValidCIdentifier(s: s->name))
192	return false;
193
194	return true;
195	}
196
197	// Split an equivalence class into smaller classes.
198	template <class ELFT>
199	void ICF<ELFT>::segregate(size_t begin, size_t end, uint32_t eqClassBase,
200	bool constant) {
201	// This loop rearranges sections in [Begin, End) so that all sections
202	// that are equal in terms of equals{Constant,Variable} are contiguous
203	// in [Begin, End).
204	//
205	// The algorithm is quadratic in the worst case, but that is not an
206	// issue in practice because the number of the distinct sections in
207	// each range is usually very small.
208
209	while (begin < end) {
210	// Divide [Begin, End) into two. Let Mid be the start index of the
211	// second group.
212	auto bound =
213	std::stable_partition(sections.begin() + begin + `1`,
214	sections.begin() + end, [&](InputSection *s) {
215	if (constant)
216	return equalsConstant(a: sections [begin], b: s);
217	return equalsVariable(a: sections [begin], b: s);
218	});
219	size_t mid = bound - sections.begin();
220
221	// Now we split [Begin, End) into [Begin, Mid) and [Mid, End) by
222	// updating the sections in [Begin, Mid). We use Mid as the basis for
223	// the equivalence class ID because every group ends with a unique index.
224	// Add this to eqClassBase to avoid equality with unique IDs.
225	for (size_t i = begin; i < mid; ++i)
226	sections [i]->eqClass[next] = eqClassBase + mid;
227
228	// If we created a group, we need to iterate the main loop again.
229	if (mid != end)
230	repeat = true;
231
232	begin = mid;
233	}
234	}
235
236	// Compare two lists of relocations.
237	template <class ELFT>
238	template <class RelTy>
239	bool ICF<ELFT>::constantEq(const InputSection *secA, Relocs<RelTy> ra,
240	const InputSection *secB, Relocs<RelTy> rb) {
241	if (ra.size() != rb.size())
242	return false;
243	auto rai = ra.begin(), rae = ra.end(), rbi = rb.begin();
244	for (; rai != rae; ++rai, ++rbi) {
245	if (rai->r_offset != rbi->r_offset \|\|
246	rai->getType(ctx.arg.isMips64EL) != rbi->getType(ctx.arg.isMips64EL))
247	return false;
248
249	uint64_t addA = getAddend<ELFT>(*rai);
250	uint64_t addB = getAddend<ELFT>(*rbi);
251
252	Symbol &sa = secA->file->getRelocTargetSym(*rai);
253	Symbol &sb = secB->file->getRelocTargetSym(*rbi);
254	if (&sa == &sb) {
255	if (addA == addB)
256	continue;
257	return false;
258	}
259
260	auto *da = dyn_cast<Defined>(Val: &sa);
261	auto *db = dyn_cast<Defined>(Val: &sb);
262
263	// Placeholder symbols generated by linker scripts look the same now but
264	// may have different values later.
265	if (!da \|\| !db \|\| da->scriptDefined \|\| db->scriptDefined)
266	return false;
267
268	// When comparing a pair of relocations, if they refer to different symbols,
269	// and either symbol is preemptible, the containing sections should be
270	// considered different. This is because even if the sections are identical
271	// in this DSO, they may not be after preemption.
272	if (da->isPreemptible \|\| db->isPreemptible)
273	return false;
274
275	// Relocations referring to absolute symbols are constant-equal if their
276	// values are equal.
277	if (!da->section && !db->section && da->value + addA == db->value + addB)
278	continue;
279	if (!da->section \|\| !db->section)
280	return false;
281
282	if (da->section->kind() != db->section->kind())
283	return false;
284
285	// Relocations referring to InputSections are constant-equal if their
286	// section offsets are equal.
287	if (isa<InputSection>(Val: da->section)) {
288	if (da->value + addA == db->value + addB)
289	continue;
290	return false;
291	}
292
293	// Relocations referring to MergeInputSections are constant-equal if their
294	// offsets in the output section are equal.
295	auto *x = dyn_cast<MergeInputSection>(Val: da->section);
296	if (!x)
297	return false;
298	auto *y = cast<MergeInputSection>(Val: db->section);
299	if (x->getParent() != y->getParent())
300	return false;
301
302	uint64_t offsetA =
303	sa.isSection() ? x->getOffset(offset: addA) : x->getOffset(offset: da->value) + addA;
304	uint64_t offsetB =
305	sb.isSection() ? y->getOffset(offset: addB) : y->getOffset(offset: db->value) + addB;
306	if (offsetA != offsetB)
307	return false;
308	}
309
310	return true;
311	}
312
313	// Compare "non-moving" part of two InputSections, namely everything
314	// except relocation targets.
315	template <class ELFT>
316	bool ICF<ELFT>::equalsConstant(const InputSection a, const* InputSection *b) {
317	if (a->flags != b->flags \|\| a->getSize() != b->getSize() \|\|
318	a->content() != b->content())
319	return false;
320
321	// If two sections have different output sections, we cannot merge them.
322	assert(a->getParent() && b->getParent());
323	if (a->getParent() != b->getParent())
324	return false;
325
326	const RelsOrRelas<ELFT> ra = a->template relsOrRelas<ELFT>();
327	const RelsOrRelas<ELFT> rb = b->template relsOrRelas<ELFT>();
328	if (ra.areRelocsCrel() \|\| rb.areRelocsCrel())
329	return constantEq(a, ra.crels, b, rb.crels);
330	return ra.areRelocsRel() \|\| rb.areRelocsRel()
331	? constantEq(a, ra.rels, b, rb.rels)
332	: constantEq(a, ra.relas, b, rb.relas);
333	}
334
335	// Compare two lists of relocations. Returns true if all pairs of
336	// relocations point to the same section in terms of ICF.
337	template <class ELFT>
338	template <class RelTy>
339	bool ICF<ELFT>::variableEq(const InputSection *secA, Relocs<RelTy> ra,
340	const InputSection *secB, Relocs<RelTy> rb) {
341	assert(ra.size() == rb.size());
342
343	auto rai = ra.begin(), rae = ra.end(), rbi = rb.begin();
344	for (; rai != rae; ++rai, ++rbi) {
345	// The two sections must be identical.
346	Symbol &sa = secA->file->getRelocTargetSym(*rai);
347	Symbol &sb = secB->file->getRelocTargetSym(*rbi);
348	if (&sa == &sb)
349	continue;
350
351	auto *da = cast<Defined>(Val: &sa);
352	auto *db = cast<Defined>(Val: &sb);
353
354	// We already dealt with absolute and non-InputSection symbols in
355	// constantEq, and for InputSections we have already checked everything
356	// except the equivalence class.
357	if (!da->section)
358	continue;
359	auto *x = dyn_cast<InputSection>(Val: da->section);
360	if (!x)
361	continue;
362	auto *y = cast<InputSection>(Val: db->section);
363
364	// Sections that are in the special equivalence class 0, can never be the
365	// same in terms of the equivalence class.
366	if (x->eqClass[current] == `0`)
367	return false;
368	if (x->eqClass[current] != y->eqClass[current])
369	return false;
370	};
371
372	return true;
373	}
374
375	// Compare "moving" part of two InputSections, namely relocation targets.
376	template <class ELFT>
377	bool ICF<ELFT>::equalsVariable(const InputSection a, const* InputSection *b) {
378	const RelsOrRelas<ELFT> ra = a->template relsOrRelas<ELFT>();
379	const RelsOrRelas<ELFT> rb = b->template relsOrRelas<ELFT>();
380	if (ra.areRelocsCrel() \|\| rb.areRelocsCrel())
381	return variableEq(a, ra.crels, b, rb.crels);
382	return ra.areRelocsRel() \|\| rb.areRelocsRel()
383	? variableEq(a, ra.rels, b, rb.rels)
384	: variableEq(a, ra.relas, b, rb.relas);
385	}
386
387	template <class ELFT> size_t ICF<ELFT>::findBoundary(size_t begin, size_t end) {
388	uint32_t eqClass = sections [begin]->eqClass[current];
389	for (size_t i = begin + `1`; i < end; ++i)
390	if (eqClass != sections [i]->eqClass[current])
391	return i;
392	return end;
393	}
394
395	// Sections in the same equivalence class are contiguous in Sections
396	// vector. Therefore, Sections vector can be considered as contiguous
397	// groups of sections, grouped by the class.
398	//
399	// This function calls Fn on every group within [Begin, End).
400	template <class ELFT>
401	void ICF<ELFT>::forEachClassRange(size_t begin, size_t end,
402	llvm::function_ref<void(size_t, size_t)> fn) {
403	while (begin < end) {
404	size_t mid = findBoundary(begin, end);
405	fn (begin, mid);
406	begin = mid;
407	}
408	}
409
410	// Call Fn on each equivalence class.
411
412	template <class ELFT>
413	void ICF<ELFT>::parallelForEachClass(
414	llvm::function_ref<void(size_t, size_t)> fn) {
415	// If threading is disabled or the number of sections are
416	// too small to use threading, call Fn sequentially.
417	if (parallel::strategy.ThreadsRequested == `1` \|\| sections.size() < `1024`) {
418	forEachClassRange(begin: `0`, end: sections.size(), fn);
419	++cnt;
420	return;
421	}
422
423	current = cnt % `2`;
424	next = (cnt + `1`) % `2`;
425
426	// Shard into non-overlapping intervals, and call Fn in parallel.
427	// The sharding must be completed before any calls to Fn are made
428	// so that Fn can modify the Chunks in its shard without causing data
429	// races.
430	const size_t numShards = `256`;
431	size_t step = sections.size() / numShards;
432	size_t boundaries[numShards + `1`];
433	boundaries[`0`] = `0`;
434	boundaries[numShards] = sections.size();
435
436	parallelFor(`1`, numShards, [&](size_t i) {
437	boundaries[i] = findBoundary(begin: (i - `1`) * step, end: sections.size());
438	});
439
440	parallelFor(`1`, numShards + `1`, [&](size_t i) {
441	if (boundaries[i - `1`] < boundaries[i])
442	forEachClassRange(begin: boundaries[i - `1`], end: boundaries[i], fn);
443	});
444	++cnt;
445	}
446
447	// Combine the hashes of the sections referenced by the given section into its
448	// hash.
449	template <class RelTy>
450	static void combineRelocHashes(unsigned cnt, InputSection *isec,
451	Relocs<RelTy> rels) {
452	uint32_t hash = isec->eqClass[cnt % `2`];
453	for (RelTy rel : rels) {
454	Symbol &s = isec->file->getRelocTargetSym(rel);
455	if (auto *d = dyn_cast<Defined>(Val: &s))
456	if (auto *relSec = dyn_cast_or_null<InputSection>(Val: d->section))
457	hash += relSec->eqClass[cnt % `2`];
458	}
459	// Set MSB to 1 to avoid collisions with unique IDs.
460	isec->eqClass[(cnt + `1`) % `2`] = hash \| (`1U` << `31`);
461	}
462
463	// The main function of ICF.
464	template <class ELFT> void ICF<ELFT>::run() {
465	// Two text sections may have identical content and relocations but different
466	// LSDA, e.g. the two functions may have catch blocks of different types. If a
467	// text section is referenced by a .eh_frame FDE with LSDA, it is not
468	// eligible. This is implemented by iterating over CIE/FDE and setting
469	// eqClass[0] to the referenced text section from a live FDE.
470	//
471	// If two .gcc_except_table have identical semantics (usually identical
472	// content with PC-relative encoding), we will lose folding opportunity.
473	uint32_t uniqueId = `0`;
474	for (Partition &part : ctx.partitions)
475	part.ehFrame ->iterateFDEWithLSDA<ELFT>(
476	[&](InputSection &s) { s.eqClass[`0`] = s.eqClass[`1`] = ++uniqueId; });
477
478	// Collect sections to merge.
479	for (InputSectionBase *sec : ctx.inputSections) {
480	auto *s = dyn_cast<InputSection>(Val: sec);
481	if (s && s->eqClass[`0`] == `0`) {
482	if (isEligible(s))
483	sections.push_back(Elt: s);
484	else
485	// Ineligible sections are assigned unique IDs, i.e. each section
486	// belongs to an equivalence class of its own.
487	s->eqClass[`0`] = s->eqClass[`1`] = ++uniqueId;
488	}
489	}
490
491	// Initially, we use hash values to partition sections.
492	parallelForEach(sections, [&](InputSection *s) {
493	// Set MSB to 1 to avoid collisions with unique IDs.
494	s->eqClass[`0`] = xxh3_64bits(data: s->content()) \| (`1U` << `31`);
495	});
496
497	// Perform 2 rounds of relocation hash propagation. 2 is an empirical value to
498	// reduce the average sizes of equivalence classes, i.e. segregate() which has
499	// a large time complexity will have less work to do.
500	for (unsigned cnt = `0`; cnt != `2`; ++cnt) {
501	parallelForEach(sections, [&](InputSection *s) {
502	const RelsOrRelas<ELFT> rels = s->template relsOrRelas<ELFT>();
503	if (rels.areRelocsCrel())
504	combineRelocHashes(cnt, s, rels.crels);
505	else if (rels.areRelocsRel())
506	combineRelocHashes(cnt, s, rels.rels);
507	else
508	combineRelocHashes(cnt, s, rels.relas);
509	});
510	}
511
512	// From now on, sections in Sections vector are ordered so that sections
513	// in the same equivalence class are consecutive in the vector.
514	llvm::stable_sort(sections, [](const InputSection a, const* InputSection *b) {
515	return a->eqClass[`0`] < b->eqClass[`0`];
516	});
517
518	// Compare static contents and assign unique equivalence class IDs for each
519	// static content. Use a base offset for these IDs to ensure no overlap with
520	// the unique IDs already assigned.
521	uint32_t eqClassBase = ++uniqueId;
522	parallelForEachClass(fn: [&](size_t begin, size_t end) {
523	segregate(begin, end, eqClassBase, constant: true);
524	});
525
526	// Split groups by comparing relocations until convergence is obtained.
527	do {
528	repeat = false;
529	parallelForEachClass(fn: [&](size_t begin, size_t end) {
530	segregate(begin, end, eqClassBase, constant: false);
531	});
532	} while (repeat);
533
534	Log(ctx) << "ICF needed " << cnt << " iterations";
535
536	auto print = [&ctx = ctx]() -> ELFSyncStream {
537	return {ctx, ctx.arg.printIcfSections ? DiagLevel::Msg : DiagLevel::None};
538	};
539	// Merge sections by the equivalence class.
540	forEachClassRange(begin: `0`, end: sections.size(), fn: [&](size_t begin, size_t end) {
541	if (end - begin == `1`)
542	return;
543	print() << "selected section " << sections [begin];
544	for (size_t i = begin + `1`; i < end; ++i) {
545	print() << " removing identical section " << sections [i];
546	sections [begin]->replace(other: sections [i]);
547
548	// At this point we know sections merged are fully identical and hence
549	// we want to remove duplicate implicit dependencies such as link order
550	// and relocation sections.
551	for (InputSection *isec : sections [i]->dependentSections)
552	isec->markDead();
553	}
554	});
555
556	// Change Defined symbol's section field to the canonical one.
557	auto fold = [](Symbol *sym) {
558	if (auto *d = dyn_cast<Defined>(Val: sym))
559	if (auto *sec = dyn_cast_or_null<InputSection>(Val: d->section))
560	if (sec->repl != d->section) {
561	d->section = sec->repl;
562	d->folded = true;
563	}
564	};
565	for (Symbol *sym : ctx.symtab ->getSymbols())
566	fold(sym);
567	parallelForEach(ctx.objectFiles, [&](ELFFileBase *file) {
568	for (Symbol *sym : file->getLocalSymbols())
569	fold(sym);
570	});
571
572	// InputSectionDescription::sections is populated by processSectionCommands().
573	// ICF may fold some input sections assigned to output sections. Remove them.
574	for (SectionCommand *cmd : ctx.script->sectionCommands)
575	if (auto *osd = dyn_cast<OutputDesc>(Val: cmd))
576	for (SectionCommand *subCmd : osd->osec.commands)
577	if (auto *isd = dyn_cast<InputSectionDescription>(Val: subCmd))
578	llvm::erase_if(isd->sections,
579	[](InputSection isec) { return* !isec->isLive(); });
580	}
581
582	// ICF entry point function.
583	template <class ELFT> void elf::doIcf(Ctx &ctx) {
584	llvm::TimeTraceScope timeScope("ICF");
585	ICF<ELFT>(ctx).run();
586	}
587
588	template void elf::doIcf<ELF32LE>(Ctx &);
589	template void elf::doIcf<ELF32BE>(Ctx &);
590	template void elf::doIcf<ELF64LE>(Ctx &);
591	template void elf::doIcf<ELF64BE>(Ctx &);
592

source code of lld/ELF/ICF.cpp