CoroElide.cpp source code [llvm/lib/Transforms/Coroutines/CoroElide.cpp]

1	//===- CoroElide.cpp - Coroutine Frame Allocation Elision Pass ------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "llvm/Transforms/Coroutines/CoroElide.h"
10	#include "CoroInternal.h"
11	#include "llvm/ADT/DenseMap.h"
12	#include "llvm/ADT/Statistic.h"
13	#include "llvm/Analysis/AliasAnalysis.h"
14	#include "llvm/Analysis/InstructionSimplify.h"
15	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
16	#include "llvm/IR/Dominators.h"
17	#include "llvm/IR/InstIterator.h"
18	#include "llvm/Support/ErrorHandling.h"
19	#include "llvm/Support/FileSystem.h"
20	#include <optional>
21
22	using namespace llvm;
23
24	#define DEBUG_TYPE "coro-elide"
25
26	STATISTIC(NumOfCoroElided, "The # of coroutine get elided.");
27
28	#ifndef NDEBUG
29	static cl::opt<std::string> CoroElideInfoOutputFilename(
30	"coro-elide-info-output-file", cl::value_desc ("filename"),
31	cl::desc ("File to record the coroutines got elided"), cl::Hidden);
32	#endif
33
34	namespace {
35	// Created on demand if the coro-elide pass has work to do.
36	struct Lowerer : coro::LowererBase {
37	SmallVector<CoroIdInst *, `4`> CoroIds;
38	SmallVector<CoroBeginInst *, `1`> CoroBegins;
39	SmallVector<CoroAllocInst *, `1`> CoroAllocs;
40	SmallVector<CoroSubFnInst *, `4`> ResumeAddr;
41	DenseMap<CoroBeginInst , SmallVector<CoroSubFnInst , `4`>> DestroyAddr;
42	SmallPtrSet<const SwitchInst *, `4`> CoroSuspendSwitches;
43
44	Lowerer(Module &M) : LowererBase (M) {}
45
46	void elideHeapAllocations(Function *F, uint64_t FrameSize, Align FrameAlign,
47	AAResults &AA);
48	bool shouldElide(Function F, DominatorTree &DT) const*;
49	void collectPostSplitCoroIds(Function *F);
50	bool processCoroId(CoroIdInst *, AAResults &AA, DominatorTree &DT,
51	OptimizationRemarkEmitter &ORE);
52	bool hasEscapePath(const CoroBeginInst *,
53	const SmallPtrSetImpl<BasicBlock > &) const*;
54	};
55	} // end anonymous namespace
56
57	// Go through the list of coro.subfn.addr intrinsics and replace them with the
58	// provided constant.
59	static void replaceWithConstant(Constant *Value,
60	SmallVectorImpl<CoroSubFnInst *> &Users) {
61	if (Users.empty())
62	return;
63
64	// See if we need to bitcast the constant to match the type of the intrinsic
65	// being replaced. Note: All coro.subfn.addr intrinsics return the same type,
66	// so we only need to examine the type of the first one in the list.
67	Type *IntrTy = Users.front()->getType();
68	Type *ValueTy = Value->getType();
69	if (ValueTy != IntrTy) {
70	// May need to tweak the function type to match the type expected at the
71	// use site.
72	assert(ValueTy->isPointerTy() && IntrTy->isPointerTy());
73	Value = ConstantExpr::getBitCast(C: Value, Ty: IntrTy);
74	}
75
76	// Now the value type matches the type of the intrinsic. Replace them all!
77	for (CoroSubFnInst *I : Users)
78	replaceAndRecursivelySimplify(I, SimpleV: Value);
79	}
80
81	// See if any operand of the call instruction references the coroutine frame.
82	static bool operandReferences(CallInst CI, AllocaInst Frame, AAResults &AA) {
83	for (Value *Op : CI->operand_values())
84	if (!AA.isNoAlias(V1: Op, V2: Frame))
85	return true;
86	return false;
87	}
88
89	// Look for any tail calls referencing the coroutine frame and remove tail
90	// attribute from them, since now coroutine frame resides on the stack and tail
91	// call implies that the function does not references anything on the stack.
92	// However if it's a musttail call, we cannot remove the tailcall attribute.
93	// It's safe to keep it there as the musttail call is for symmetric transfer,
94	// and by that point the frame should have been destroyed and hence not
95	// interfering with operands.
96	static void removeTailCallAttribute(AllocaInst *Frame, AAResults &AA) {
97	Function &F = *Frame->getFunction();
98	for (Instruction &I : instructions(F))
99	if (auto *Call = dyn_cast<CallInst>(Val: &I))
100	if (Call->isTailCall() && operandReferences(CI: Call, Frame, AA) &&
101	!Call->isMustTailCall())
102	Call->setTailCall(false);
103	}
104
105	// Given a resume function @f.resume(%f.frame %frame), returns the size*
106	// and expected alignment of %f.frame type.
107	static std::optional<std::pair<uint64_t, Align>>
108	getFrameLayout(Function *Resume) {
109	// Pull information from the function attributes.
110	auto Size = Resume->getParamDereferenceableBytes(ArgNo: `0`);
111	if (!Size)
112	return std::nullopt;
113	return std::make_pair(x&: Size, y: Resume->getParamAlign(ArgNo: `0`).valueOrOne());
114	}
115
116	// Finds first non alloca instruction in the entry block of a function.
117	static Instruction getFirstNonAllocaInTheEntryBlock(Function F) {
118	for (Instruction &I : F->getEntryBlock())
119	if (!isa<AllocaInst>(Val: &I))
120	return &I;
121	llvm_unreachable("no terminator in the entry block");
122	}
123
124	#ifndef NDEBUG
125	static std::unique_ptr<raw_fd_ostream> getOrCreateLogFile() {
126	assert(!CoroElideInfoOutputFilename.empty() &&
127	"coro-elide-info-output-file shouldn't be empty");
128	std::error_code EC;
129	auto Result = std::make_unique<raw_fd_ostream>(args&: CoroElideInfoOutputFilename,
130	args&: EC, args: sys::fs::OF_Append);
131	if (!EC)
132	return Result;
133	llvm::errs() << "Error opening coro-elide-info-output-file '"
134	<< CoroElideInfoOutputFilename << " for appending!\n";
135	return std::make_unique<raw_fd_ostream>(args: `2`, args: false); // stderr.
136	}
137	#endif
138
139	// To elide heap allocations we need to suppress code blocks guarded by
140	// llvm.coro.alloc and llvm.coro.free instructions.
141	void Lowerer::elideHeapAllocations(Function *F, uint64_t FrameSize,
142	Align FrameAlign, AAResults &AA) {
143	LLVMContext &C = F->getContext();
144	BasicBlock::iterator InsertPt =
145	getFirstNonAllocaInTheEntryBlock(F: CoroIds.front()->getFunction())
146	->getIterator();
147
148	// Replacing llvm.coro.alloc with false will suppress dynamic
149	// allocation as it is expected for the frontend to generate the code that
150	// looks like:
151	// id = coro.id(...)
152	// mem = coro.alloc(id) ? malloc(coro.size()) : 0;
153	// coro.begin(id, mem)
154	auto *False = ConstantInt::getFalse(Context&: C);
155	for (auto *CA : CoroAllocs) {
156	CA->replaceAllUsesWith(V: False);
157	CA->eraseFromParent();
158	}
159
160	// FIXME: Design how to transmit alignment information for every alloca that
161	// is spilled into the coroutine frame and recreate the alignment information
162	// here. Possibly we will need to do a mini SROA here and break the coroutine
163	// frame into individual AllocaInst recreating the original alignment.
164	const DataLayout &DL = F->getParent()->getDataLayout();
165	auto FrameTy = ArrayType::get(ElementType: Type::getInt8Ty(C), NumElements: FrameSize);
166	auto Frame = new* AllocaInst (FrameTy, DL.getAllocaAddrSpace(), "", InsertPt);
167	Frame->setAlignment(FrameAlign);
168	auto *FrameVoidPtr =
169	new BitCastInst (Frame, PointerType::getUnqual(C), "vFrame", InsertPt);
170
171	for (auto *CB : CoroBegins) {
172	CB->replaceAllUsesWith(V: FrameVoidPtr);
173	CB->eraseFromParent();
174	}
175
176	// Since now coroutine frame lives on the stack we need to make sure that
177	// any tail call referencing it, must be made non-tail call.
178	removeTailCallAttribute(Frame, AA);
179	}
180
181	bool Lowerer::hasEscapePath(const CoroBeginInst *CB,
182	const SmallPtrSetImpl<BasicBlock > &TIs) const* {
183	const auto &It = DestroyAddr.find(Val: CB);
184	assert(It != DestroyAddr.end());
185
186	// Limit the number of blocks we visit.
187	unsigned Limit = `32` * (`1` + It ->second.size());
188
189	SmallVector<const BasicBlock *, `32`> Worklist;
190	Worklist.push_back(Elt: CB->getParent());
191
192	SmallPtrSet<const BasicBlock *, `32`> Visited;
193	// Consider basicblock of coro.destroy as visited one, so that we
194	// skip the path pass through coro.destroy.
195	for (auto *DA : It ->second)
196	Visited.insert(Ptr: DA->getParent());
197
198	SmallPtrSet<const BasicBlock *, `32`> EscapingBBs;
199	for (auto *U : CB->users()) {
200	// The use from coroutine intrinsics are not a problem.
201	if (isa<CoroFreeInst, CoroSubFnInst, CoroSaveInst>(Val: U))
202	continue;
203
204	// Think all other usages may be an escaping candidate conservatively.
205	//
206	// Note that the major user of switch ABI coroutine (the C++) will store
207	// resume.fn, destroy.fn and the index to the coroutine frame immediately.
208	// So the parent of the coro.begin in C++ will be always escaping.
209	// Then we can't get any performance benefits for C++ by improving the
210	// precision of the method.
211	//
212	// The reason why we still judge it is we want to make LLVM Coroutine in
213	// switch ABIs to be self contained as much as possible instead of a
214	// by-product of C++20 Coroutines.
215	EscapingBBs.insert(Ptr: cast<Instruction>(Val: U)->getParent());
216	}
217
218	bool PotentiallyEscaped = false;
219
220	do {
221	const auto *BB = Worklist.pop_back_val();
222	if (!Visited.insert(Ptr: BB).second)
223	continue;
224
225	// A Path insensitive marker to test whether the coro.begin escapes.
226	// It is intentional to make it path insensitive while it may not be
227	// precise since we don't want the process to be too slow.
228	PotentiallyEscaped \|= EscapingBBs.count(Ptr: BB);
229
230	if (TIs.count(Ptr: BB)) {
231	if (isa<ReturnInst>(Val: BB->getTerminator()) \|\| PotentiallyEscaped)
232	return true;
233
234	// If the function ends with the exceptional terminator, the memory used
235	// by the coroutine frame can be released by stack unwinding
236	// automatically. So we can think the coro.begin doesn't escape if it
237	// exits the function by exceptional terminator.
238
239	continue;
240	}
241
242	// Conservatively say that there is potentially a path.
243	if (!--Limit)
244	return true;
245
246	auto TI = BB->getTerminator();
247	// Although the default dest of coro.suspend switches is suspend pointer
248	// which means a escape path to normal terminator, it is reasonable to skip
249	// it since coroutine frame doesn't change outside the coroutine body.
250	if (isa<SwitchInst>(Val: TI) &&
251	CoroSuspendSwitches.count(Ptr: cast<SwitchInst>(Val: TI))) {
252	Worklist.push_back(Elt: cast<SwitchInst>(Val: TI)->getSuccessor(idx: `1`));
253	Worklist.push_back(Elt: cast<SwitchInst>(Val: TI)->getSuccessor(idx: `2`));
254	} else
255	Worklist.append(in_start: succ_begin(BB), in_end: succ_end(BB));
256
257	} while (!Worklist.empty());
258
259	// We have exhausted all possible paths and are certain that coro.begin can
260	// not reach to any of terminators.
261	return false;
262	}
263
264	bool Lowerer::shouldElide(Function F, DominatorTree &DT) const* {
265	// If no CoroAllocs, we cannot suppress allocation, so elision is not
266	// possible.
267	if (CoroAllocs.empty())
268	return false;
269
270	// Check that for every coro.begin there is at least one coro.destroy directly
271	// referencing the SSA value of that coro.begin along each
272	// non-exceptional path.
273	// If the value escaped, then coro.destroy would have been referencing a
274	// memory location storing that value and not the virtual register.
275
276	SmallPtrSet<BasicBlock *, `8`> Terminators;
277	// First gather all of the terminators for the function.
278	// Consider the final coro.suspend as the real terminator when the current
279	// function is a coroutine.
280	for (BasicBlock &B : *F) {
281	auto *TI = B.getTerminator();
282
283	if (TI->getNumSuccessors() != `0` \|\| isa<UnreachableInst>(Val: TI))
284	continue;
285
286	Terminators.insert(Ptr: &B);
287	}
288
289	// Filter out the coro.destroy that lie along exceptional paths.
290	SmallPtrSet<CoroBeginInst *, `8`> ReferencedCoroBegins;
291	for (const auto &It : DestroyAddr) {
292	// If every terminators is dominated by coro.destroy, we could know the
293	// corresponding coro.begin wouldn't escape.
294	//
295	// Otherwise hasEscapePath would decide whether there is any paths from
296	// coro.begin to Terminators which not pass through any of the
297	// coro.destroys.
298	//
299	// hasEscapePath is relatively slow, so we avoid to run it as much as
300	// possible.
301	if (llvm::all_of(Range&: Terminators,
302	P: [&](auto *TI) {
303	return llvm::any_of(It.second, [&](auto *DA) {
304	return DT.dominates(DA, TI->getTerminator());
305	});
306	}) \|\|
307	!hasEscapePath(CB: It.first, TIs: Terminators))
308	ReferencedCoroBegins.insert(Ptr: It.first);
309	}
310
311	// If size of the set is the same as total number of coro.begin, that means we
312	// found a coro.free or coro.destroy referencing each coro.begin, so we can
313	// perform heap elision.
314	return ReferencedCoroBegins.size() == CoroBegins.size();
315	}
316
317	void Lowerer::collectPostSplitCoroIds(Function *F) {
318	CoroIds.clear();
319	CoroSuspendSwitches.clear();
320	for (auto &I : instructions(F)) {
321	if (auto *CII = dyn_cast<CoroIdInst>(Val: &I))
322	if (CII->getInfo().isPostSplit())
323	// If it is the coroutine itself, don't touch it.
324	if (CII->getCoroutine() != CII->getFunction())
325	CoroIds.push_back(Elt: CII);
326
327	// Consider case like:
328	// %0 = call i8 @llvm.coro.suspend(...)
329	// switch i8 %0, label %suspend [i8 0, label %resume
330	// i8 1, label %cleanup]
331	// and collect the SwitchInsts which are used by escape analysis later.
332	if (auto *CSI = dyn_cast<CoroSuspendInst>(Val: &I))
333	if (CSI->hasOneUse() && isa<SwitchInst>(Val: CSI->use_begin()->getUser())) {
334	SwitchInst *SWI = cast<SwitchInst>(Val: CSI->use_begin()->getUser());
335	if (SWI->getNumCases() == `2`)
336	CoroSuspendSwitches.insert(Ptr: SWI);
337	}
338	}
339	}
340
341	bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA,
342	DominatorTree &DT, OptimizationRemarkEmitter &ORE) {
343	CoroBegins.clear();
344	CoroAllocs.clear();
345	ResumeAddr.clear();
346	DestroyAddr.clear();
347
348	// Collect all coro.begin and coro.allocs associated with this coro.id.
349	for (User *U : CoroId->users()) {
350	if (auto *CB = dyn_cast<CoroBeginInst>(Val: U))
351	CoroBegins.push_back(Elt: CB);
352	else if (auto *CA = dyn_cast<CoroAllocInst>(Val: U))
353	CoroAllocs.push_back(Elt: CA);
354	}
355
356	// Collect all coro.subfn.addrs associated with coro.begin.
357	// Note, we only devirtualize the calls if their coro.subfn.addr refers to
358	// coro.begin directly. If we run into cases where this check is too
359	// conservative, we can consider relaxing the check.
360	for (CoroBeginInst *CB : CoroBegins) {
361	for (User *U : CB->users())
362	if (auto *II = dyn_cast<CoroSubFnInst>(Val: U))
363	switch (II->getIndex()) {
364	case CoroSubFnInst::ResumeIndex:
365	ResumeAddr.push_back(Elt: II);
366	break;
367	case CoroSubFnInst::DestroyIndex:
368	DestroyAddr [CB].push_back(Elt: II);
369	break;
370	default:
371	llvm_unreachable("unexpected coro.subfn.addr constant");
372	}
373	}
374
375	// PostSplit coro.id refers to an array of subfunctions in its Info
376	// argument.
377	ConstantArray *Resumers = CoroId->getInfo().Resumers;
378	assert(Resumers && "PostSplit coro.id Info argument must refer to an array"
379	"of coroutine subfunctions");
380	auto *ResumeAddrConstant =
381	Resumers->getAggregateElement(Elt: CoroSubFnInst::ResumeIndex);
382
383	replaceWithConstant(Value: ResumeAddrConstant, Users&: ResumeAddr);
384
385	bool ShouldElide = shouldElide(F: CoroId->getFunction(), DT);
386	if (!ShouldElide)
387	ORE.emit(RemarkBuilder: [&]() {
388	if (auto FrameSizeAndAlign =
389	getFrameLayout(Resume: cast<Function>(Val: ResumeAddrConstant)))
390	return OptimizationRemarkMissed (DEBUG_TYPE, "CoroElide", CoroId)
391	<< "'" << ore::NV ("callee", CoroId->getCoroutine()->getName())
392	<< "' not elided in '"
393	<< ore::NV ("caller", CoroId->getFunction()->getName())
394	<< "' (frame_size="
395	<< ore::NV ("frame_size", FrameSizeAndAlign ->first) << ", align="
396	<< ore::NV ("align", FrameSizeAndAlign ->second.value()) << ")";
397	else
398	return OptimizationRemarkMissed (DEBUG_TYPE, "CoroElide", CoroId)
399	<< "'" << ore::NV ("callee", CoroId->getCoroutine()->getName())
400	<< "' not elided in '"
401	<< ore::NV ("caller", CoroId->getFunction()->getName())
402	<< "' (frame_size=unknown, align=unknown)";
403	});
404
405	auto *DestroyAddrConstant = Resumers->getAggregateElement(
406	Elt: ShouldElide ? CoroSubFnInst::CleanupIndex : CoroSubFnInst::DestroyIndex);
407
408	for (auto &It : DestroyAddr)
409	replaceWithConstant(Value: DestroyAddrConstant, Users&: It.second);
410
411	if (ShouldElide) {
412	if (auto FrameSizeAndAlign =
413	getFrameLayout(Resume: cast<Function>(Val: ResumeAddrConstant))) {
414	elideHeapAllocations(F: CoroId->getFunction(), FrameSize: FrameSizeAndAlign ->first,
415	FrameAlign: FrameSizeAndAlign ->second, AA);
416	coro::replaceCoroFree(CoroId, /Elide=/true);
417	NumOfCoroElided ++;
418	#ifndef NDEBUG
419	if (!CoroElideInfoOutputFilename.empty())
420	*getOrCreateLogFile()
421	<< "Elide " << CoroId->getCoroutine()->getName() << " in "
422	<< CoroId->getFunction()->getName() << "\n";
423	#endif
424	ORE.emit(RemarkBuilder: [&]() {
425	return OptimizationRemark (DEBUG_TYPE, "CoroElide", CoroId)
426	<< "'" << ore::NV ("callee", CoroId->getCoroutine()->getName())
427	<< "' elided in '"
428	<< ore::NV ("caller", CoroId->getFunction()->getName())
429	<< "' (frame_size="
430	<< ore::NV ("frame_size", FrameSizeAndAlign ->first) << ", align="
431	<< ore::NV ("align", FrameSizeAndAlign ->second.value()) << ")";
432	});
433	} else {
434	ORE.emit(RemarkBuilder: [&]() {
435	return OptimizationRemarkMissed (DEBUG_TYPE, "CoroElide", CoroId)
436	<< "'" << ore::NV ("callee", CoroId->getCoroutine()->getName())
437	<< "' not elided in '"
438	<< ore::NV ("caller", CoroId->getFunction()->getName())
439	<< "' (frame_size=unknown, align=unknown)";
440	});
441	}
442	}
443
444	return true;
445	}
446
447	static bool declaresCoroElideIntrinsics(Module &M) {
448	return coro::declaresIntrinsics(M, {"llvm.coro.id", "llvm.coro.id.async"});
449	}
450
451	PreservedAnalyses CoroElidePass::run(Function &F, FunctionAnalysisManager &AM) {
452	auto &M = *F.getParent();
453	if (!declaresCoroElideIntrinsics(M))
454	return PreservedAnalyses::all();
455
456	Lowerer L(M);
457	L.CoroIds.clear();
458	L.collectPostSplitCoroIds(F: &F);
459	// If we did not find any coro.id, there is nothing to do.
460	if (L.CoroIds.empty())
461	return PreservedAnalyses::all();
462
463	AAResults &AA = AM.getResult<AAManager>(IR&: F);
464	DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
465	auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
466
467	bool Changed = false;
468	for (auto *CII : L.CoroIds)
469	Changed \|= L.processCoroId(CoroId: CII, AA, DT, ORE);
470
471	return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
472	}
473

source code of llvm/lib/Transforms/Coroutines/CoroElide.cpp