deferred.c source code [linux/kernel/unwind/deferred.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Deferred user space unwinding
4	*/
5	#include <linux/sched/task_stack.h>
6	#include <linux/unwind_deferred.h>
7	#include <linux/sched/clock.h>
8	#include <linux/task_work.h>
9	#include <linux/kernel.h>
10	#include <linux/sched.h>
11	#include <linux/sizes.h>
12	#include <linux/slab.h>
13	#include <linux/mm.h>
14
15	/*
16	* For requesting a deferred user space stack trace from NMI context
17	* the architecture must support a safe cmpxchg in NMI context.
18	* For those architectures that do not have that, then it cannot ask
19	* for a deferred user space stack trace from an NMI context. If it
20	* does, then it will get -EINVAL.
21	*/
22	#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
23	# define CAN_USE_IN_NMI 1
24	static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
25	{
26	u32 old = `0`;
27
28	return try_cmpxchg(&info->id.cnt, &old, cnt);
29	}
30	#else
31	# define CAN_USE_IN_NMI 0
32	/ When NMIs are not allowed, this always succeeds /
33	static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
34	{
35	info->id.cnt = cnt;
36	return true;
37	}
38	#endif
39
40	/ Make the cache fit in a 4K page /
41	#define UNWIND_MAX_ENTRIES \
42	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
43
44	/ Guards adding to or removing from the list of callbacks /
45	static DEFINE_MUTEX(callback_mutex);
46	static LIST_HEAD(callbacks);
47
48	#define RESERVED_BITS (UNWIND_PENDING \| UNWIND_USED)
49
50	/ Zero'd bits are available for assigning callback users /
51	static unsigned long unwind_mask = RESERVED_BITS;
52	DEFINE_STATIC_SRCU(unwind_srcu);
53
54	static inline bool unwind_pending(struct unwind_task_info *info)
55	{
56	return atomic_long_read(v: &info->unwind_mask) & UNWIND_PENDING;
57	}
58
/*
 * This is a unique percpu identifier for a given task entry context.
 * Conceptually, it's incremented every time the CPU enters the kernel from
 * user space, so that each "entry context" on the CPU gets a unique ID.  In
 * reality, as an optimization, it's only incremented on demand for the first
 * deferred unwind request after a given entry-from-user.
 *
 * It's combined with the CPU id to make a systemwide-unique "context cookie".
 */
static DEFINE_PER_CPU(u32, unwind_ctx_ctr);
69
70	/*
71	* The context cookie is a unique identifier that is assigned to a user
72	* space stacktrace. As the user space stacktrace remains the same while
73	* the task is in the kernel, the cookie is an identifier for the stacktrace.
74	* Although it is possible for the stacktrace to get another cookie if another
75	* request is made after the cookie was cleared and before reentering user
76	* space.
77	*/
78	static u64 get_cookie(struct unwind_task_info *info)
79	{
80	u32 cnt = `1`;
81
82	lockdep_assert_irqs_disabled();
83
84	if (info->id.cpu)
85	return info->id.id;
86
87	/ LSB is always set to ensure 0 is an invalid value /
88	cnt \|= __this_cpu_read(unwind_ctx_ctr) + `2`;
89	if (try_assign_cnt(info, cnt)) {
90	/ Update the per cpu counter /
91	__this_cpu_write(unwind_ctx_ctr, cnt);
92	}
93	/ Interrupts are disabled, the CPU will always be same /
94	info->id.cpu = smp_processor_id() + `1`; / Must be non zero /
95
96	return info->id.id;
97	}
98
99	/**
100	* unwind_user_faultable - Produce a user stacktrace in faultable context
101	* @trace: The descriptor that will store the user stacktrace
102	*
103	* This must be called in a known faultable context (usually when entering
104	* or exiting user space). Depending on the available implementations
105	* the @trace will be loaded with the addresses of the user space stacktrace
106	* if it can be found.
107	*
108	* Return: 0 on success and negative on error
109	* On success @trace will contain the user space stacktrace
110	*/
111	int unwind_user_faultable(struct unwind_stacktrace *trace)
112	{
113	struct unwind_task_info *info = &current->unwind_info;
114	struct unwind_cache *cache;
115
116	/ Should always be called from faultable context /
117	might_fault();
118
119	if (!current->mm)
120	return -EINVAL;
121
122	if (!info->cache) {
123	info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
124	GFP_KERNEL);
125	if (!info->cache)
126	return -ENOMEM;
127	}
128
129	cache = info->cache;
130	trace->entries = cache->entries;
131	trace->nr = cache->nr_entries;
132	/*
133	* The user stack has already been previously unwound in this
134	* entry context. Skip the unwind and use the cache.
135	*/
136	if (trace->nr)
137	return `0`;
138
139	unwind_user(trace, UNWIND_MAX_ENTRIES);
140
141	cache->nr_entries = trace->nr;
142
143	/ Clear nr_entries on way back to user space /
144	atomic_long_or(i: UNWIND_USED, v: &info->unwind_mask);
145
146	return `0`;
147	}
148
149	static void process_unwind_deferred(struct task_struct *task)
150	{
151	struct unwind_task_info *info = &task->unwind_info;
152	struct unwind_stacktrace trace;
153	struct unwind_work *work;
154	unsigned long bits;
155	u64 cookie;
156
157	if (WARN_ON_ONCE(!unwind_pending(info)))
158	return;
159
160	/ Clear pending bit but make sure to have the current bits /
161	bits = atomic_long_fetch_andnot(i: UNWIND_PENDING,
162	v: &info->unwind_mask);
163	/*
164	* From here on out, the callback must always be called, even if it's
165	* just an empty trace.
166	*/
167	trace.nr = `0`;
168	trace.entries = NULL;
169
170	unwind_user_faultable(trace: &trace);
171
172	if (info->cache)
173	bits &= ~(info->cache->unwind_completed);
174
175	cookie = info->id.id;
176
177	guard(srcu)(l: &unwind_srcu);
178	list_for_each_entry_srcu(work, &callbacks, list,
179	srcu_read_lock_held(&unwind_srcu)) {
180	if (test_bit(work->bit, &bits)) {
181	work->func(work, &trace, cookie);
182	if (info->cache)
183	info->cache->unwind_completed \|= BIT(work->bit);
184	}
185	}
186	}
187
188	static void unwind_deferred_task_work(struct callback_head *head)
189	{
190	process_unwind_deferred(current);
191	}
192
193	void unwind_deferred_task_exit(struct task_struct *task)
194	{
195	struct unwind_task_info *info = &current->unwind_info;
196
197	if (!unwind_pending(info))
198	return;
199
200	process_unwind_deferred(task);
201
202	task_work_cancel(task, cb: &info->work);
203	}
204
205	/**
206	* unwind_deferred_request - Request a user stacktrace on task kernel exit
207	* @work: Unwind descriptor requesting the trace
208	* @cookie: The cookie of the first request made for this task
209	*
210	* Schedule a user space unwind to be done in task work before exiting the
211	* kernel.
212	*
213	* The returned @cookie output is the generated cookie of the very first
214	* request for a user space stacktrace for this task since it entered the
215	* kernel. It can be from a request by any caller of this infrastructure.
216	* Its value will also be passed to the callback function. It can be
217	* used to stitch kernel and user stack traces together in post-processing.
218	*
219	* It's valid to call this function multiple times for the same @work within
220	* the same task entry context. Each call will return the same cookie
221	* while the task hasn't left the kernel. If the callback is not pending
222	* because it has already been previously called for the same entry context,
223	* it will be called again with the same stack trace and cookie.
224	*
225	* Return: 0 if the callback successfully was queued.
226	* 1 if the callback is pending or was already executed.
227	* Negative if there's an error.
228	* @cookie holds the cookie of the first request by any user
229	*/
230	int unwind_deferred_request(struct unwind_work work, u64 cookie)
231	{
232	struct unwind_task_info *info = &current->unwind_info;
233	int twa_mode = TWA_RESUME;
234	unsigned long old, bits;
235	unsigned long bit;
236	int ret;
237
238	*cookie = `0`;
239
240	if ((current->flags & (PF_KTHREAD \| PF_EXITING)) \|\|
241	!user_mode(task_pt_regs(current)))
242	return -EINVAL;
243
244	/*
245	* NMI requires having safe cmpxchg operations.
246	* Trigger a warning to make it obvious that an architecture
247	* is using this in NMI when it should not be.
248	*/
249	if (in_nmi()) {
250	if (WARN_ON_ONCE(!CAN_USE_IN_NMI))
251	return -EINVAL;
252	twa_mode = TWA_NMI_CURRENT;
253	}
254
255	/ Do not allow cancelled works to request again /
256	bit = READ_ONCE(work->bit);
257	if (WARN_ON_ONCE(bit < `0`))
258	return -EINVAL;
259
260	/ Only need the mask now /
261	bit = BIT(bit);
262
263	guard(irqsave)();
264
265	*cookie = get_cookie(info);
266
267	old = atomic_long_read(v: &info->unwind_mask);
268
269	/ Is this already queued or executed /
270	if (old & bit)
271	return `1`;
272
273	/*
274	* This work's bit hasn't been set yet. Now set it with the PENDING
275	* bit and fetch the current value of unwind_mask. If ether the
276	* work's bit or PENDING was already set, then this is already queued
277	* to have a callback.
278	*/
279	bits = UNWIND_PENDING \| bit;
280	old = atomic_long_fetch_or(i: bits, v: &info->unwind_mask);
281	if (old & bits) {
282	/*
283	* If the work's bit was set, whatever set it had better
284	* have also set pending and queued a callback.
285	*/
286	WARN_ON_ONCE(!(old & UNWIND_PENDING));
287	return old & bit;
288	}
289
290	/ The work has been claimed, now schedule it. /
291	ret = task_work_add(current, twork: &info->work, mode: twa_mode);
292
293	if (WARN_ON_ONCE(ret))
294	atomic_long_set(v: &info->unwind_mask, i: `0`);
295
296	return ret;
297	}
298
299	void unwind_deferred_cancel(struct unwind_work *work)
300	{
301	struct task_struct g, t;
302	int bit;
303
304	if (!work)
305	return;
306
307	bit = work->bit;
308
309	/ No work should be using a reserved bit /
310	if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS))
311	return;
312
313	guard(mutex)(T: &callback_mutex);
314	list_del_rcu(entry: &work->list);
315
316	/ Do not allow any more requests and prevent callbacks /
317	work->bit = -`1`;
318
319	__clear_bit(bit, &unwind_mask);
320
321	synchronize_srcu(ssp: &unwind_srcu);
322
323	guard(rcu)();
324	/ Clear this bit from all threads /
325	for_each_process_thread(g, t) {
326	atomic_long_andnot(BIT(bit),
327	v: &t->unwind_info.unwind_mask);
328	if (t->unwind_info.cache)
329	clear_bit(nr: bit, addr: &t->unwind_info.cache->unwind_completed);
330	}
331	}
332
333	int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
334	{
335	memset(work, `0`, sizeof(*work));
336
337	guard(mutex)(T: &callback_mutex);
338
339	/ See if there's a bit in the mask available /
340	if (unwind_mask == ~`0UL`)
341	return -EBUSY;
342
343	work->bit = ffz(unwind_mask);
344	__set_bit(work->bit, &unwind_mask);
345
346	list_add_rcu(new: &work->list, head: &callbacks);
347	work->func = func;
348	return `0`;
349	}
350
351	void unwind_task_init(struct task_struct *task)
352	{
353	struct unwind_task_info *info = &task->unwind_info;
354
355	memset(info, `0`, sizeof(*info));
356	init_task_work(twork: &info->work, func: unwind_deferred_task_work);
357	atomic_long_set(v: &info->unwind_mask, i: `0`);
358	}
359
360	void unwind_task_free(struct task_struct *task)
361	{
362	struct unwind_task_info *info = &task->unwind_info;
363
364	kfree(objp: info->cache);
365	task_work_cancel(task, cb: &info->work);
366	}
367

source code of linux/kernel/unwind/deferred.c