pi.c source code [linux/kernel/futex/pi.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2
3	#include <linux/slab.h>
4	#include <linux/sched/rt.h>
5	#include <linux/sched/task.h>
6
7	#include "futex.h"
8	#include "../locking/rtmutex_common.h"
9
10	/*
11	* PI code:
12	*/
13	int refill_pi_state_cache(void)
14	{
15	struct futex_pi_state *pi_state;
16
17	if (likely(current->pi_state_cache))
18	return `0`;
19
20	pi_state = kzalloc(size: sizeof(*pi_state), GFP_KERNEL);
21
22	if (!pi_state)
23	return -ENOMEM;
24
25	INIT_LIST_HEAD(list: &pi_state->list);
26	/ pi_mutex gets initialized later /
27	pi_state->owner = NULL;
28	refcount_set(r: &pi_state->refcount, n: `1`);
29	pi_state->key = FUTEX_KEY_INIT;
30
31	current->pi_state_cache = pi_state;
32
33	return `0`;
34	}
35
36	static struct futex_pi_state alloc_pi_state(void*)
37	{
38	struct futex_pi_state *pi_state = current->pi_state_cache;
39
40	WARN_ON(!pi_state);
41	current->pi_state_cache = NULL;
42
43	return pi_state;
44	}
45
46	static void pi_state_update_owner(struct futex_pi_state *pi_state,
47	struct task_struct *new_owner)
48	{
49	struct task_struct *old_owner = pi_state->owner;
50
51	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
52
53	if (old_owner) {
54	raw_spin_lock(&old_owner->pi_lock);
55	WARN_ON(list_empty(&pi_state->list));
56	list_del_init(entry: &pi_state->list);
57	raw_spin_unlock(&old_owner->pi_lock);
58	}
59
60	if (new_owner) {
61	raw_spin_lock(&new_owner->pi_lock);
62	WARN_ON(!list_empty(&pi_state->list));
63	list_add(new: &pi_state->list, head: &new_owner->pi_state_list);
64	pi_state->owner = new_owner;
65	raw_spin_unlock(&new_owner->pi_lock);
66	}
67	}
68
69	void get_pi_state(struct futex_pi_state *pi_state)
70	{
71	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
72	}
73
74	/*
75	* Drops a reference to the pi_state object and frees or caches it
76	* when the last reference is gone.
77	*/
78	void put_pi_state(struct futex_pi_state *pi_state)
79	{
80	if (!pi_state)
81	return;
82
83	if (!refcount_dec_and_test(r: &pi_state->refcount))
84	return;
85
86	/*
87	* If pi_state->owner is NULL, the owner is most probably dying
88	* and has cleaned up the pi_state already
89	*/
90	if (pi_state->owner) {
91	unsigned long flags;
92
93	raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
94	pi_state_update_owner(pi_state, NULL);
95	rt_mutex_proxy_unlock(lock: &pi_state->pi_mutex);
96	raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
97	}
98
99	if (current->pi_state_cache) {
100	kfree(objp: pi_state);
101	} else {
102	/*
103	* pi_state->list is already empty.
104	* clear pi_state->owner.
105	* refcount is at 0 - put it back to 1.
106	*/
107	pi_state->owner = NULL;
108	refcount_set(r: &pi_state->refcount, n: `1`);
109	current->pi_state_cache = pi_state;
110	}
111	}
112
/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *	thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non PI futex
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain arbitrary amount
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */
197
198	/*
199	* Validate that the existing waiter has a pi_state and sanity check
200	* the pi_state against the user space value. If correct, attach to
201	* it.
202	*/
203	static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
204	struct futex_pi_state *pi_state,
205	struct futex_pi_state **ps)
206	{
207	pid_t pid = uval & FUTEX_TID_MASK;
208	u32 uval2;
209	int ret;
210
211	/*
212	* Userspace might have messed up non-PI and PI futexes [3]
213	*/
214	if (unlikely(!pi_state))
215	return -EINVAL;
216
217	/*
218	* We get here with hb->lock held, and having found a
219	* futex_top_waiter(). This means that futex_lock_pi() of said futex_q
220	* has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
221	* which in turn means that futex_lock_pi() still has a reference on
222	* our pi_state.
223	*
224	* The waiter holding a reference on @pi_state also protects against
225	* the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
226	* and futex_wait_requeue_pi() as it cannot go to 0 and consequently
227	* free pi_state before we can take a reference ourselves.
228	*/
229	WARN_ON(!refcount_read(&pi_state->refcount));
230
231	/*
232	* Now that we have a pi_state, we can acquire wait_lock
233	* and do the state validation.
234	*/
235	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
236
237	/*
238	* Since {uval, pi_state} is serialized by wait_lock, and our current
239	* uval was read without holding it, it can have changed. Verify it
240	* still is what we expect it to be, otherwise retry the entire
241	* operation.
242	*/
243	if (futex_get_value_locked(dest: &uval2, from: uaddr))
244	goto out_efault;
245
246	if (uval != uval2)
247	goto out_eagain;
248
249	/*
250	* Handle the owner died case:
251	*/
252	if (uval & FUTEX_OWNER_DIED) {
253	/*
254	* exit_pi_state_list sets owner to NULL and wakes the
255	* topmost waiter. The task which acquires the
256	* pi_state->rt_mutex will fixup owner.
257	*/
258	if (!pi_state->owner) {
259	/*
260	* No pi state owner, but the user space TID
261	* is not 0. Inconsistent state. [5]
262	*/
263	if (pid)
264	goto out_einval;
265	/*
266	* Take a ref on the state and return success. [4]
267	*/
268	goto out_attach;
269	}
270
271	/*
272	* If TID is 0, then either the dying owner has not
273	* yet executed exit_pi_state_list() or some waiter
274	* acquired the rtmutex in the pi state, but did not
275	* yet fixup the TID in user space.
276	*
277	* Take a ref on the state and return success. [6]
278	*/
279	if (!pid)
280	goto out_attach;
281	} else {
282	/*
283	* If the owner died bit is not set, then the pi_state
284	* must have an owner. [7]
285	*/
286	if (!pi_state->owner)
287	goto out_einval;
288	}
289
290	/*
291	* Bail out if user space manipulated the futex value. If pi
292	* state exists then the owner TID must be the same as the
293	* user space TID. [9/10]
294	*/
295	if (pid != task_pid_vnr(tsk: pi_state->owner))
296	goto out_einval;
297
298	out_attach:
299	get_pi_state(pi_state);
300	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
301	*ps = pi_state;
302	return `0`;
303
304	out_einval:
305	ret = -EINVAL;
306	goto out_error;
307
308	out_eagain:
309	ret = -EAGAIN;
310	goto out_error;
311
312	out_efault:
313	ret = -EFAULT;
314	goto out_error;
315
316	out_error:
317	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
318	return ret;
319	}
320
321	static int handle_exit_race(u32 __user *uaddr, u32 uval,
322	struct task_struct *tsk)
323	{
324	u32 uval2;
325
326	/*
327	* If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
328	* caller that the alleged owner is busy.
329	*/
330	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
331	return -EBUSY;
332
333	/*
334	* Reread the user space value to handle the following situation:
335	*
336	* CPU0 CPU1
337	*
338	* sys_exit() sys_futex()
339	* do_exit() futex_lock_pi()
340	* futex_lock_pi_atomic()
341	* exit_signals(tsk) No waiters:
342	* tsk->flags \|= PF_EXITING; *uaddr == 0x00000PID
343	* mm_release(tsk) Set waiter bit
344	* exit_robust_list(tsk) { *uaddr = 0x80000PID;
345	* Set owner died attach_to_pi_owner() {
346	* *uaddr = 0xC0000000; tsk = get_task(PID);
347	* } if (!tsk->flags & PF_EXITING) {
348	* ... attach();
349	* tsk->futex_state = } else {
350	* FUTEX_STATE_DEAD; if (tsk->futex_state !=
351	* FUTEX_STATE_DEAD)
352	* return -EAGAIN;
353	* return -ESRCH; <--- FAIL
354	* }
355	*
356	* Returning ESRCH unconditionally is wrong here because the
357	* user space value has been changed by the exiting task.
358	*
359	* The same logic applies to the case where the exiting task is
360	* already gone.
361	*/
362	if (futex_get_value_locked(dest: &uval2, from: uaddr))
363	return -EFAULT;
364
365	/ If the user space value has changed, try again. /
366	if (uval2 != uval)
367	return -EAGAIN;
368
369	/*
370	* The exiting task did not have a robust list, the robust list was
371	* corrupted or the user space value in *uaddr is simply bogus.
372	* Give up and tell user space.
373	*/
374	return -ESRCH;
375	}
376
377	static void __attach_to_pi_owner(struct task_struct p, union* futex_key *key,
378	struct futex_pi_state **ps)
379	{
380	/*
381	* No existing pi state. First waiter. [2]
382	*
383	* This creates pi_state, we have hb->lock held, this means nothing can
384	* observe this state, wait_lock is irrelevant.
385	*/
386	struct futex_pi_state *pi_state = alloc_pi_state();
387
388	/*
389	* Initialize the pi_mutex in locked state and make @p
390	* the owner of it:
391	*/
392	rt_mutex_init_proxy_locked(lock: &pi_state->pi_mutex, proxy_owner: p);
393
394	/ Store the key for possible exit cleanups: /
395	pi_state->key = *key;
396
397	WARN_ON(!list_empty(&pi_state->list));
398	list_add(new: &pi_state->list, head: &p->pi_state_list);
399	/*
400	* Assignment without holding pi_state->pi_mutex.wait_lock is safe
401	* because there is no concurrency as the object is not published yet.
402	*/
403	pi_state->owner = p;
404
405	*ps = pi_state;
406	}
407	/*
408	* Lookup the task for the TID provided from user space and attach to
409	* it after doing proper sanity checks.
410	*/
411	static int attach_to_pi_owner(u32 __user uaddr, u32 uval, union* futex_key *key,
412	struct futex_pi_state **ps,
413	struct task_struct **exiting)
414	{
415	pid_t pid = uval & FUTEX_TID_MASK;
416	struct task_struct *p;
417
418	/*
419	* We are the first waiter - try to look up the real owner and attach
420	* the new pi_state to it, but bail out when TID = 0 [1]
421	*
422	* The !pid check is paranoid. None of the call sites should end up
423	* with pid == 0, but better safe than sorry. Let the caller retry
424	*/
425	if (!pid)
426	return -EAGAIN;
427	p = find_get_task_by_vpid(nr: pid);
428	if (!p)
429	return handle_exit_race(uaddr, uval, NULL);
430
431	if (unlikely(p->flags & PF_KTHREAD)) {
432	put_task_struct(t: p);
433	return -EPERM;
434	}
435
436	/*
437	* We need to look at the task state to figure out, whether the
438	* task is exiting. To protect against the change of the task state
439	* in futex_exit_release(), we do this protected by p->pi_lock:
440	*/
441	raw_spin_lock_irq(&p->pi_lock);
442	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
443	/*
444	* The task is on the way out. When the futex state is
445	* FUTEX_STATE_DEAD, we know that the task has finished
446	* the cleanup:
447	*/
448	int ret = handle_exit_race(uaddr, uval, tsk: p);
449
450	raw_spin_unlock_irq(&p->pi_lock);
451	/*
452	* If the owner task is between FUTEX_STATE_EXITING and
453	* FUTEX_STATE_DEAD then store the task pointer and keep
454	* the reference on the task struct. The calling code will
455	* drop all locks, wait for the task to reach
456	* FUTEX_STATE_DEAD and then drop the refcount. This is
457	* required to prevent a live lock when the current task
458	* preempted the exiting task between the two states.
459	*/
460	if (ret == -EBUSY)
461	*exiting = p;
462	else
463	put_task_struct(t: p);
464	return ret;
465	}
466
467	__attach_to_pi_owner(p, key, ps);
468	raw_spin_unlock_irq(&p->pi_lock);
469
470	put_task_struct(t: p);
471
472	return `0`;
473	}
474
475	static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
476	{
477	int err;
478	u32 curval;
479
480	if (unlikely(should_fail_futex(true)))
481	return -EFAULT;
482
483	err = futex_cmpxchg_value_locked(curval: &curval, uaddr, uval, newval);
484	if (unlikely(err))
485	return err;
486
487	/ If user space value changed, let the caller retry /
488	return curval != uval ? -EAGAIN : `0`;
489	}
490
491	/**
492	* futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
493	* @uaddr: the pi futex user address
494	* @hb: the pi futex hash bucket
495	* @key: the futex key associated with uaddr and hb
496	* @ps: the pi_state pointer where we store the result of the
497	* lookup
498	* @task: the task to perform the atomic lock work for. This will
499	* be "current" except in the case of requeue pi.
500	* @exiting: Pointer to store the task pointer of the owner task
501	* which is in the middle of exiting
502	* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
503	*
504	* Return:
505	* - 0 - ready to wait;
506	* - 1 - acquired the lock;
507	* - <0 - error
508	*
509	* The hb->lock must be held by the caller.
510	*
511	* @exiting is only set when the return value is -EBUSY. If so, this holds
512	* a refcount on the exiting task on return and the caller needs to drop it
513	* after waiting for the exit to complete.
514	*/
515	int futex_lock_pi_atomic(u32 __user uaddr, struct* futex_hash_bucket *hb,
516	union futex_key *key,
517	struct futex_pi_state **ps,
518	struct task_struct *task,
519	struct task_struct **exiting,
520	int set_waiters)
521	{
522	u32 uval, newval, vpid = task_pid_vnr(tsk: task);
523	struct futex_q *top_waiter;
524	int ret;
525
526	/*
527	* Read the user space value first so we can validate a few
528	* things before proceeding further.
529	*/
530	if (futex_get_value_locked(dest: &uval, from: uaddr))
531	return -EFAULT;
532
533	if (unlikely(should_fail_futex(true)))
534	return -EFAULT;
535
536	/*
537	* Detect deadlocks.
538	*/
539	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
540	return -EDEADLK;
541
542	if ((unlikely(should_fail_futex(true))))
543	return -EDEADLK;
544
545	/*
546	* Lookup existing state first. If it exists, try to attach to
547	* its pi_state.
548	*/
549	top_waiter = futex_top_waiter(hb, key);
550	if (top_waiter)
551	return attach_to_pi_state(uaddr, uval, pi_state: top_waiter->pi_state, ps);
552
553	/*
554	* No waiter and user TID is 0. We are here because the
555	* waiters or the owner died bit is set or called from
556	* requeue_cmp_pi or for whatever reason something took the
557	* syscall.
558	*/
559	if (!(uval & FUTEX_TID_MASK)) {
560	/*
561	* We take over the futex. No other waiters and the user space
562	* TID is 0. We preserve the owner died bit.
563	*/
564	newval = uval & FUTEX_OWNER_DIED;
565	newval \|= vpid;
566
567	/ The futex requeue_pi code can enforce the waiters bit /
568	if (set_waiters)
569	newval \|= FUTEX_WAITERS;
570
571	ret = lock_pi_update_atomic(uaddr, uval, newval);
572	if (ret)
573	return ret;
574
575	/*
576	* If the waiter bit was requested the caller also needs PI
577	* state attached to the new owner of the user space futex.
578	*
579	* @task is guaranteed to be alive and it cannot be exiting
580	* because it is either sleeping or waiting in
581	* futex_requeue_pi_wakeup_sync().
582	*
583	* No need to do the full attach_to_pi_owner() exercise
584	* because @task is known and valid.
585	*/
586	if (set_waiters) {
587	raw_spin_lock_irq(&task->pi_lock);
588	__attach_to_pi_owner(p: task, key, ps);
589	raw_spin_unlock_irq(&task->pi_lock);
590	}
591	return `1`;
592	}
593
594	/*
595	* First waiter. Set the waiters bit before attaching ourself to
596	* the owner. If owner tries to unlock, it will be forced into
597	* the kernel and blocked on hb->lock.
598	*/
599	newval = uval \| FUTEX_WAITERS;
600	ret = lock_pi_update_atomic(uaddr, uval, newval);
601	if (ret)
602	return ret;
603	/*
604	* If the update of the user space value succeeded, we try to
605	* attach to the owner. If that fails, no harm done, we only
606	* set the FUTEX_WAITERS bit in the user space variable.
607	*/
608	return attach_to_pi_owner(uaddr, uval: newval, key, ps, exiting);
609	}
610
611	/*
612	* Caller must hold a reference on @pi_state.
613	*/
614	static int wake_futex_pi(u32 __user *uaddr, u32 uval,
615	struct futex_pi_state *pi_state,
616	struct rt_mutex_waiter *top_waiter)
617	{
618	struct task_struct *new_owner;
619	bool postunlock = false;
620	DEFINE_RT_WAKE_Q(wqh);
621	u32 curval, newval;
622	int ret = `0`;
623
624	new_owner = top_waiter->task;
625
626	/*
627	* We pass it to the next owner. The WAITERS bit is always kept
628	* enabled while there is PI state around. We cleanup the owner
629	* died bit, because we are the owner.
630	*/
631	newval = FUTEX_WAITERS \| task_pid_vnr(tsk: new_owner);
632
633	if (unlikely(should_fail_futex(true))) {
634	ret = -EFAULT;
635	goto out_unlock;
636	}
637
638	ret = futex_cmpxchg_value_locked(curval: &curval, uaddr, uval, newval);
639	if (!ret && (curval != uval)) {
640	/*
641	* If a unconditional UNLOCK_PI operation (user space did not
642	* try the TID->0 transition) raced with a waiter setting the
643	* FUTEX_WAITERS flag between get_user() and locking the hash
644	* bucket lock, retry the operation.
645	*/
646	if ((FUTEX_TID_MASK & curval) == uval)
647	ret = -EAGAIN;
648	else
649	ret = -EINVAL;
650	}
651
652	if (!ret) {
653	/*
654	* This is a point of no return; once we modified the uval
655	* there is no going back and subsequent operations must
656	* not fail.
657	*/
658	pi_state_update_owner(pi_state, new_owner);
659	postunlock = __rt_mutex_futex_unlock(lock: &pi_state->pi_mutex, wqh: &wqh);
660	}
661
662	out_unlock:
663	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
664
665	if (postunlock)
666	rt_mutex_postunlock(wqh: &wqh);
667
668	return ret;
669	}
670
671	static int __fixup_pi_state_owner(u32 __user uaddr, struct* futex_q *q,
672	struct task_struct *argowner)
673	{
674	struct futex_pi_state *pi_state = q->pi_state;
675	struct task_struct oldowner, newowner;
676	u32 uval, curval, newval, newtid;
677	int err = `0`;
678
679	oldowner = pi_state->owner;
680
681	/*
682	* We are here because either:
683	*
684	* - we stole the lock and pi_state->owner needs updating to reflect
685	* that (@argowner == current),
686	*
687	* or:
688	*
689	* - someone stole our lock and we need to fix things to point to the
690	* new owner (@argowner == NULL).
691	*
692	* Either way, we have to replace the TID in the user space variable.
693	* This must be atomic as we have to preserve the owner died bit here.
694	*
695	* Note: We write the user space value _before_ changing the pi_state
696	* because we can fault here. Imagine swapped out pages or a fork
697	* that marked all the anonymous memory readonly for cow.
698	*
699	* Modifying pi_state _before_ the user space value would leave the
700	* pi_state in an inconsistent state when we fault here, because we
701	* need to drop the locks to handle the fault. This might be observed
702	* in the PID checks when attaching to PI state .
703	*/
704	retry:
705	if (!argowner) {
706	if (oldowner != current) {
707	/*
708	* We raced against a concurrent self; things are
709	* already fixed up. Nothing to do.
710	*/
711	return `0`;
712	}
713
714	if (__rt_mutex_futex_trylock(l: &pi_state->pi_mutex)) {
715	/ We got the lock. pi_state is correct. Tell caller. /
716	return `1`;
717	}
718
719	/*
720	* The trylock just failed, so either there is an owner or
721	* there is a higher priority waiter than this one.
722	*/
723	newowner = rt_mutex_owner(lock: &pi_state->pi_mutex);
724	/*
725	* If the higher priority waiter has not yet taken over the
726	* rtmutex then newowner is NULL. We can't return here with
727	* that state because it's inconsistent vs. the user space
728	* state. So drop the locks and try again. It's a valid
729	* situation and not any different from the other retry
730	* conditions.
731	*/
732	if (unlikely(!newowner)) {
733	err = -EAGAIN;
734	goto handle_err;
735	}
736	} else {
737	WARN_ON_ONCE(argowner != current);
738	if (oldowner == current) {
739	/*
740	* We raced against a concurrent self; things are
741	* already fixed up. Nothing to do.
742	*/
743	return `1`;
744	}
745	newowner = argowner;
746	}
747
748	newtid = task_pid_vnr(tsk: newowner) \| FUTEX_WAITERS;
749	/ Owner died? /
750	if (!pi_state->owner)
751	newtid \|= FUTEX_OWNER_DIED;
752
753	err = futex_get_value_locked(dest: &uval, from: uaddr);
754	if (err)
755	goto handle_err;
756
757	for (;;) {
758	newval = (uval & FUTEX_OWNER_DIED) \| newtid;
759
760	err = futex_cmpxchg_value_locked(curval: &curval, uaddr, uval, newval);
761	if (err)
762	goto handle_err;
763
764	if (curval == uval)
765	break;
766	uval = curval;
767	}
768
769	/*
770	* We fixed up user space. Now we need to fix the pi_state
771	* itself.
772	*/
773	pi_state_update_owner(pi_state, new_owner: newowner);
774
775	return argowner == current;
776
777	/*
778	* In order to reschedule or handle a page fault, we need to drop the
779	* locks here. In the case of a fault, this gives the other task
780	* (either the highest priority waiter itself or the task which stole
781	* the rtmutex) the chance to try the fixup of the pi_state. So once we
782	* are back from handling the fault we need to check the pi_state after
783	* reacquiring the locks and before trying to do another fixup. When
784	* the fixup has been done already we simply return.
785	*
786	* Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
787	* drop hb->lock since the caller owns the hb -> futex_q relation.
788	* Dropping the pi_mutex->wait_lock requires the state revalidate.
789	*/
790	handle_err:
791	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
792	spin_unlock(lock: q->lock_ptr);
793
794	switch (err) {
795	case -EFAULT:
796	err = fault_in_user_writeable(uaddr);
797	break;
798
799	case -EAGAIN:
800	cond_resched();
801	err = `0`;
802	break;
803
804	default:
805	WARN_ON_ONCE(`1`);
806	break;
807	}
808
809	spin_lock(lock: q->lock_ptr);
810	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
811
812	/*
813	* Check if someone else fixed it for us:
814	*/
815	if (pi_state->owner != oldowner)
816	return argowner == current;
817
818	/ Retry if err was -EAGAIN or the fault in succeeded /
819	if (!err)
820	goto retry;
821
822	/*
823	* fault_in_user_writeable() failed so user state is immutable. At
824	* best we can make the kernel state consistent but user state will
825	* be most likely hosed and any subsequent unlock operation will be
826	* rejected due to PI futex rule [10].
827	*
828	* Ensure that the rtmutex owner is also the pi_state owner despite
829	* the user space value claiming something different. There is no
830	* point in unlocking the rtmutex if current is the owner as it
831	* would need to wait until the next waiter has taken the rtmutex
832	* to guarantee consistent state. Keep it simple. Userspace asked
833	* for this wreckaged state.
834	*
835	* The rtmutex has an owner - either current or some other
836	* task. See the EAGAIN loop above.
837	*/
838	pi_state_update_owner(pi_state, new_owner: rt_mutex_owner(lock: &pi_state->pi_mutex));
839
840	return err;
841	}
842
843	static int fixup_pi_state_owner(u32 __user uaddr, struct* futex_q *q,
844	struct task_struct *argowner)
845	{
846	struct futex_pi_state *pi_state = q->pi_state;
847	int ret;
848
849	lockdep_assert_held(q->lock_ptr);
850
851	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
852	ret = __fixup_pi_state_owner(uaddr, q, argowner);
853	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
854	return ret;
855	}
856
857	/**
858	* fixup_pi_owner() - Post lock pi_state and corner case management
859	* @uaddr: user address of the futex
860	* @q: futex_q (contains pi_state and access to the rt_mutex)
861	* @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
862	*
863	* After attempting to lock an rt_mutex, this function is called to cleanup
864	* the pi_state owner as well as handle race conditions that may allow us to
865	* acquire the lock. Must be called with the hb lock held.
866	*
867	* Return:
868	* - 1 - success, lock taken;
869	* - 0 - success, lock not taken;
870	* - <0 - on error (-EFAULT)
871	*/
872	int fixup_pi_owner(u32 __user uaddr, struct* futex_q q, int* locked)
873	{
874	if (locked) {
875	/*
876	* Got the lock. We might not be the anticipated owner if we
877	* did a lock-steal - fix up the PI-state in that case:
878	*
879	* Speculative pi_state->owner read (we don't hold wait_lock);
880	* since we own the lock pi_state->owner == current is the
881	* stable state, anything else needs more attention.
882	*/
883	if (q->pi_state->owner != current)
884	return fixup_pi_state_owner(uaddr, q, current);
885	return `1`;
886	}
887
888	/*
889	* If we didn't get the lock; check if anybody stole it from us. In
890	* that case, we need to fix up the uval to point to them instead of
891	* us, otherwise bad things happen. [10]
892	*
893	* Another speculative read; pi_state->owner == current is unstable
894	* but needs our attention.
895	*/
896	if (q->pi_state->owner == current)
897	return fixup_pi_state_owner(uaddr, q, NULL);
898
899	/*
900	* Paranoia check. If we did not take the lock, then we should not be
901	* the owner of the rt_mutex. Warn and establish consistent state.
902	*/
903	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
904	return fixup_pi_state_owner(uaddr, q, current);
905
906	return `0`;
907	}
908
909	/*
910	* Userspace tried a 0 -> TID atomic transition of the futex value
911	* and failed. The kernel side here does the whole locking operation:
912	* if there are waiters then it will block as a consequence of relying
913	* on rt-mutexes, it does PI, etc. (Due to races the kernel might see
914	* a 0 value of the futex too.).
915	*
916	* Also serves as futex trylock_pi()'ing, and due semantics.
917	*/
918	int futex_lock_pi(u32 __user uaddr, unsigned* int flags, ktime_t time, int* trylock)
919	{
920	struct hrtimer_sleeper timeout, *to;
921	struct task_struct *exiting = NULL;
922	struct rt_mutex_waiter rt_waiter;
923	struct futex_hash_bucket *hb;
924	struct futex_q q = futex_q_init;
925	int res, ret;
926
927	if (!IS_ENABLED(CONFIG_FUTEX_PI))
928	return -ENOSYS;
929
930	if (refill_pi_state_cache())
931	return -ENOMEM;
932
933	to = futex_setup_timer(time, timeout: &timeout, flags, range_ns: `0`);
934
935	retry:
936	ret = get_futex_key(uaddr, flags, key: &q.key, rw: FUTEX_WRITE);
937	if (unlikely(ret != `0`))
938	goto out;
939
940	retry_private:
941	hb = futex_q_lock(q: &q);
942
943	ret = futex_lock_pi_atomic(uaddr, hb, key: &q.key, ps: &q.pi_state, current,
944	exiting: &exiting, set_waiters: `0`);
945	if (unlikely(ret)) {
946	/*
947	* Atomic work succeeded and we got the lock,
948	* or failed. Either way, we do _not_ block.
949	*/
950	switch (ret) {
951	case `1`:
952	/ We got the lock. /
953	ret = `0`;
954	goto out_unlock_put_key;
955	case -EFAULT:
956	goto uaddr_faulted;
957	case -EBUSY:
958	case -EAGAIN:
959	/*
960	* Two reasons for this:
961	* - EBUSY: Task is exiting and we just wait for the
962	* exit to complete.
963	* - EAGAIN: The user space value changed.
964	*/
965	futex_q_unlock(hb);
966	/*
967	* Handle the case where the owner is in the middle of
968	* exiting. Wait for the exit to complete otherwise
969	* this task might loop forever, aka. live lock.
970	*/
971	wait_for_owner_exiting(ret, exiting);
972	cond_resched();
973	goto retry;
974	default:
975	goto out_unlock_put_key;
976	}
977	}
978
979	WARN_ON(!q.pi_state);
980
981	/*
982	* Only actually queue now that the atomic ops are done:
983	*/
984	__futex_queue(q: &q, hb);
985
986	if (trylock) {
987	ret = rt_mutex_futex_trylock(l: &q.pi_state->pi_mutex);
988	/ Fixup the trylock return value: /
989	ret = ret ? `0` : -EWOULDBLOCK;
990	goto no_block;
991	}
992
993	/*
994	* Must be done before we enqueue the waiter, here is unfortunately
995	* under the hb lock, but that should work because it does nothing.
996	*/
997	rt_mutex_pre_schedule();
998
999	rt_mutex_init_waiter(waiter: &rt_waiter);
1000
1001	/*
1002	* On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
1003	* hold it while doing rt_mutex_start_proxy(), because then it will
1004	* include hb->lock in the blocking chain, even through we'll not in
1005	* fact hold it while blocking. This will lead it to report -EDEADLK
1006	* and BUG when futex_unlock_pi() interleaves with this.
1007	*
1008	* Therefore acquire wait_lock while holding hb->lock, but drop the
1009	* latter before calling __rt_mutex_start_proxy_lock(). This
1010	* interleaves with futex_unlock_pi() -- which does a similar lock
1011	* handoff -- such that the latter can observe the futex_q::pi_state
1012	* before __rt_mutex_start_proxy_lock() is done.
1013	*/
1014	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
1015	spin_unlock(lock: q.lock_ptr);
1016	/*
1017	* __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
1018	* such that futex_unlock_pi() is guaranteed to observe the waiter when
1019	* it sees the futex_q::pi_state.
1020	*/
1021	ret = __rt_mutex_start_proxy_lock(lock: &q.pi_state->pi_mutex, waiter: &rt_waiter, current);
1022	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
1023
1024	if (ret) {
1025	if (ret == `1`)
1026	ret = `0`;
1027	goto cleanup;
1028	}
1029
1030	if (unlikely(to))
1031	hrtimer_sleeper_start_expires(sl: to, mode: HRTIMER_MODE_ABS);
1032
1033	ret = rt_mutex_wait_proxy_lock(lock: &q.pi_state->pi_mutex, to, waiter: &rt_waiter);
1034
1035	cleanup:
1036	/*
1037	* If we failed to acquire the lock (deadlock/signal/timeout), we must
1038	* must unwind the above, however we canont lock hb->lock because
1039	* rt_mutex already has a waiter enqueued and hb->lock can itself try
1040	* and enqueue an rt_waiter through rtlock.
1041	*
1042	* Doing the cleanup without holding hb->lock can cause inconsistent
1043	* state between hb and pi_state, but only in the direction of not
1044	* seeing a waiter that is leaving.
1045	*
1046	* See futex_unlock_pi(), it deals with this inconsistency.
1047	*
1048	* There be dragons here, since we must deal with the inconsistency on
1049	* the way out (here), it is impossible to detect/warn about the race
1050	* the other way around (missing an incoming waiter).
1051	*
1052	* What could possibly go wrong...
1053	*/
1054	if (ret && !rt_mutex_cleanup_proxy_lock(lock: &q.pi_state->pi_mutex, waiter: &rt_waiter))
1055	ret = `0`;
1056
1057	/*
1058	* Now that the rt_waiter has been dequeued, it is safe to use
1059	* spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
1060	* the pi_state owner.
1061	*/
1062	spin_lock(lock: q.lock_ptr);
1063	/*
1064	* Waiter is unqueued.
1065	*/
1066	rt_mutex_post_schedule();
1067	no_block:
1068	/*
1069	* Fixup the pi_state owner and possibly acquire the lock if we
1070	* haven't already.
1071	*/
1072	res = fixup_pi_owner(uaddr, q: &q, locked: !ret);
1073	/*
1074	* If fixup_pi_owner() returned an error, propagate that. If it acquired
1075	* the lock, clear our -ETIMEDOUT or -EINTR.
1076	*/
1077	if (res)
1078	ret = (res < `0`) ? res : `0`;
1079
1080	futex_unqueue_pi(q: &q);
1081	spin_unlock(lock: q.lock_ptr);
1082	goto out;
1083
1084	out_unlock_put_key:
1085	futex_q_unlock(hb);
1086
1087	out:
1088	if (to) {
1089	hrtimer_cancel(timer: &to->timer);
1090	destroy_hrtimer_on_stack(timer: &to->timer);
1091	}
1092	return ret != -EINTR ? ret : -ERESTARTNOINTR;
1093
1094	uaddr_faulted:
1095	futex_q_unlock(hb);
1096
1097	ret = fault_in_user_writeable(uaddr);
1098	if (ret)
1099	goto out;
1100
1101	if (!(flags & FLAGS_SHARED))
1102	goto retry_private;
1103
1104	goto retry;
1105	}
1106
1107	/*
1108	* Userspace attempted a TID -> 0 atomic transition, and failed.
1109	* This is the in-kernel slowpath: we look up the PI state (if any),
1110	* and do the rt-mutex unlock.
1111	*/
1112	int futex_unlock_pi(u32 __user uaddr, unsigned* int flags)
1113	{
1114	u32 curval, uval, vpid = task_pid_vnr(current);
1115	union futex_key key = FUTEX_KEY_INIT;
1116	struct futex_hash_bucket *hb;
1117	struct futex_q *top_waiter;
1118	int ret;
1119
1120	if (!IS_ENABLED(CONFIG_FUTEX_PI))
1121	return -ENOSYS;
1122
1123	retry:
1124	if (get_user(uval, uaddr))
1125	return -EFAULT;
1126	/*
1127	* We release only a lock we actually own:
1128	*/
1129	if ((uval & FUTEX_TID_MASK) != vpid)
1130	return -EPERM;
1131
1132	ret = get_futex_key(uaddr, flags, key: &key, rw: FUTEX_WRITE);
1133	if (ret)
1134	return ret;
1135
1136	hb = futex_hash(key: &key);
1137	spin_lock(lock: &hb->lock);
1138
1139	/*
1140	* Check waiters first. We do not trust user space values at
1141	* all and we at least want to know if user space fiddled
1142	* with the futex value instead of blindly unlocking.
1143	*/
1144	top_waiter = futex_top_waiter(hb, key: &key);
1145	if (top_waiter) {
1146	struct futex_pi_state *pi_state = top_waiter->pi_state;
1147	struct rt_mutex_waiter *rt_waiter;
1148
1149	ret = -EINVAL;
1150	if (!pi_state)
1151	goto out_unlock;
1152
1153	/*
1154	* If current does not own the pi_state then the futex is
1155	* inconsistent and user space fiddled with the futex value.
1156	*/
1157	if (pi_state->owner != current)
1158	goto out_unlock;
1159
1160	/*
1161	* By taking wait_lock while still holding hb->lock, we ensure
1162	* there is no point where we hold neither; and thereby
1163	* wake_futex_pi() must observe any new waiters.
1164	*
1165	* Since the cleanup: case in futex_lock_pi() removes the
1166	* rt_waiter without holding hb->lock, it is possible for
1167	* wake_futex_pi() to not find a waiter while the above does,
1168	* in this case the waiter is on the way out and it can be
1169	* ignored.
1170	*
1171	* In particular; this forces __rt_mutex_start_proxy() to
1172	* complete such that we're guaranteed to observe the
1173	* rt_waiter.
1174	*/
1175	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1176
1177	/*
1178	* Futex vs rt_mutex waiter state -- if there are no rt_mutex
1179	* waiters even though futex thinks there are, then the waiter
1180	* is leaving and the uncontended path is safe to take.
1181	*/
1182	rt_waiter = rt_mutex_top_waiter(lock: &pi_state->pi_mutex);
1183	if (!rt_waiter) {
1184	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1185	goto do_uncontended;
1186	}
1187
1188	get_pi_state(pi_state);
1189	spin_unlock(lock: &hb->lock);
1190
1191	/ drops pi_state->pi_mutex.wait_lock /
1192	ret = wake_futex_pi(uaddr, uval, pi_state, top_waiter: rt_waiter);
1193
1194	put_pi_state(pi_state);
1195
1196	/*
1197	* Success, we're done! No tricky corner cases.
1198	*/
1199	if (!ret)
1200	return ret;
1201	/*
1202	* The atomic access to the futex value generated a
1203	* pagefault, so retry the user-access and the wakeup:
1204	*/
1205	if (ret == -EFAULT)
1206	goto pi_faulted;
1207	/*
1208	* A unconditional UNLOCK_PI op raced against a waiter
1209	* setting the FUTEX_WAITERS bit. Try again.
1210	*/
1211	if (ret == -EAGAIN)
1212	goto pi_retry;
1213	/*
1214	* wake_futex_pi has detected invalid state. Tell user
1215	* space.
1216	*/
1217	return ret;
1218	}
1219
1220	do_uncontended:
1221	/*
1222	* We have no kernel internal state, i.e. no waiters in the
1223	* kernel. Waiters which are about to queue themselves are stuck
1224	* on hb->lock. So we can safely ignore them. We do neither
1225	* preserve the WAITERS bit not the OWNER_DIED one. We are the
1226	* owner.
1227	*/
1228	if ((ret = futex_cmpxchg_value_locked(curval: &curval, uaddr, uval, newval: `0`))) {
1229	spin_unlock(lock: &hb->lock);
1230	switch (ret) {
1231	case -EFAULT:
1232	goto pi_faulted;
1233
1234	case -EAGAIN:
1235	goto pi_retry;
1236
1237	default:
1238	WARN_ON_ONCE(`1`);
1239	return ret;
1240	}
1241	}
1242
1243	/*
1244	* If uval has changed, let user space handle it.
1245	*/
1246	ret = (curval == uval) ? `0` : -EAGAIN;
1247
1248	out_unlock:
1249	spin_unlock(lock: &hb->lock);
1250	return ret;
1251
1252	pi_retry:
1253	cond_resched();
1254	goto retry;
1255
1256	pi_faulted:
1257
1258	ret = fault_in_user_writeable(uaddr);
1259	if (!ret)
1260	goto retry;
1261
1262	return ret;
1263	}
1264
1265

source code of linux/kernel/futex/pi.c