mmap_lock.c source code [linux/mm/mmap_lock.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#define CREATE_TRACE_POINTS
3	#include <trace/events/mmap_lock.h>
4
5	#include <linux/mm.h>
6	#include <linux/cgroup.h>
7	#include <linux/memcontrol.h>
8	#include <linux/mmap_lock.h>
9	#include <linux/mutex.h>
10	#include <linux/percpu.h>
11	#include <linux/rcupdate.h>
12	#include <linux/smp.h>
13	#include <linux/trace_events.h>
14	#include <linux/local_lock.h>
15
16	EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
17	EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
18	EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
19
20	#ifdef CONFIG_TRACING
21	/*
22	* Trace calls must be in a separate file, as otherwise there's a circular
23	* dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
24	*/
25
26	void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
27	{
28	trace_mmap_lock_start_locking(mm, write);
29	}
30	EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
31
32	void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
33	bool success)
34	{
35	trace_mmap_lock_acquire_returned(mm, write, success);
36	}
37	EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
38
39	void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
40	{
41	trace_mmap_lock_released(mm, write);
42	}
43	EXPORT_SYMBOL(__mmap_lock_do_trace_released);
44	#endif /* CONFIG_TRACING */
45
46	#ifdef CONFIG_MMU
47	#ifdef CONFIG_PER_VMA_LOCK
48	static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
49	{
50	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
51
52	/ Additional refcnt if the vma is attached. /
53	if (!detaching)
54	tgt_refcnt++;
55
56	/*
57	* If vma is detached then only vma_mark_attached() can raise the
58	* vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
59	*/
60	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, r: &vma->vm_refcnt))
61	return false;
62
63	rwsem_acquire(&vma->vmlock_dep_map, `0`, `0`, _RET_IP_);
64	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
65	refcount_read(&vma->vm_refcnt) == tgt_refcnt,
66	TASK_UNINTERRUPTIBLE);
67	lock_acquired(lock: &vma->vmlock_dep_map, _RET_IP_);
68
69	return true;
70	}
71
72	static inline void __vma_exit_locked(struct vm_area_struct vma, bool detached)
73	{
74	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, r: &vma->vm_refcnt);
75	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
76	}
77
78	void __vma_start_write(struct vm_area_struct vma, unsigned* int mm_lock_seq)
79	{
80	bool locked;
81
82	/*
83	* __vma_enter_locked() returns false immediately if the vma is not
84	* attached, otherwise it waits until refcnt is indicating that vma
85	* is attached with no readers.
86	*/
87	locked = __vma_enter_locked(vma, detaching: false);
88
89	/*
90	* We should use WRITE_ONCE() here because we can have concurrent reads
91	* from the early lockless pessimistic check in vma_start_read().
92	* We don't really care about the correctness of that early check, but
93	* we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
94	*/
95	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
96
97	if (locked) {
98	bool detached;
99
100	__vma_exit_locked(vma, detached: &detached);
101	WARN_ON_ONCE(detached); / vma should remain attached /
102	}
103	}
104	EXPORT_SYMBOL_GPL(__vma_start_write);
105
106	void vma_mark_detached(struct vm_area_struct *vma)
107	{
108	vma_assert_write_locked(vma);
109	vma_assert_attached(vma);
110
111	/*
112	* We are the only writer, so no need to use vma_refcount_put().
113	* The condition below is unlikely because the vma has been already
114	* write-locked and readers can increment vm_refcnt only temporarily
115	* before they check vm_lock_seq, realize the vma is locked and drop
116	* back the vm_refcnt. That is a narrow window for observing a raised
117	* vm_refcnt.
118	*/
119	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
120	/ Wait until vma is detached with no readers. /
121	if (__vma_enter_locked(vma, detaching: true)) {
122	bool detached;
123
124	__vma_exit_locked(vma, detached: &detached);
125	WARN_ON_ONCE(!detached);
126	}
127	}
128	}
129
130	/*
131	* Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
132	* stable and not isolated. If the VMA is not found or is being modified the
133	* function returns NULL.
134	*/
135	struct vm_area_struct lock_vma_under_rcu(struct* mm_struct *mm,
136	unsigned long address)
137	{
138	MA_STATE(mas, &mm->mm_mt, address, address);
139	struct vm_area_struct *vma;
140
141	rcu_read_lock();
142	retry:
143	vma = mas_walk(mas: &mas);
144	if (!vma)
145	goto inval;
146
147	vma = vma_start_read(mm, vma);
148	if (IS_ERR_OR_NULL(ptr: vma)) {
149	/ Check if the VMA got isolated after we found it /
150	if (PTR_ERR(ptr: vma) == -EAGAIN) {
151	count_vm_vma_lock_event(VMA_LOCK_MISS);
152	/ The area was replaced with another one /
153	goto retry;
154	}
155
156	/ Failed to lock the VMA /
157	goto inval;
158	}
159	/*
160	* At this point, we have a stable reference to a VMA: The VMA is
161	* locked and we know it hasn't already been isolated.
162	* From here on, we can access the VMA without worrying about which
163	* fields are accessible for RCU readers.
164	*/
165
166	/ Check if the vma we locked is the right one. /
167	if (unlikely(vma->vm_mm != mm \|\|
168	address < vma->vm_start \|\| address >= vma->vm_end))
169	goto inval_end_read;
170
171	rcu_read_unlock();
172	return vma;
173
174	inval_end_read:
175	vma_end_read(vma);
176	inval:
177	rcu_read_unlock();
178	count_vm_vma_lock_event(VMA_LOCK_ABORT);
179	return NULL;
180	}
181	#endif /* CONFIG_PER_VMA_LOCK */
182
183	#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
184	#include <linux/extable.h>
185
186	static inline bool get_mmap_lock_carefully(struct mm_struct mm, struct* pt_regs *regs)
187	{
188	if (likely(mmap_read_trylock(mm)))
189	return true;
190
191	if (regs && !user_mode(regs)) {
192	unsigned long ip = exception_ip(regs);
193	if (!search_exception_tables(add: ip))
194	return false;
195	}
196
197	return !mmap_read_lock_killable(mm);
198	}
199
200	static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
201	{
202	/*
203	* We don't have this operation yet.
204	*
205	* It should be easy enough to do: it's basically a
206	* atomic_long_try_cmpxchg_acquire()
207	* from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
208	* it also needs the proper lockdep magic etc.
209	*/
210	return false;
211	}
212
213	static inline bool upgrade_mmap_lock_carefully(struct mm_struct mm, struct* pt_regs *regs)
214	{
215	mmap_read_unlock(mm);
216	if (regs && !user_mode(regs)) {
217	unsigned long ip = exception_ip(regs);
218	if (!search_exception_tables(add: ip))
219	return false;
220	}
221	return !mmap_write_lock_killable(mm);
222	}
223
224	/*
225	* Helper for page fault handling.
226	*
227	* This is kind of equivalent to "mmap_read_lock()" followed
228	* by "find_extend_vma()", except it's a lot more careful about
229	* the locking (and will drop the lock on failure).
230	*
231	* For example, if we have a kernel bug that causes a page
232	* fault, we don't want to just use mmap_read_lock() to get
233	* the mm lock, because that would deadlock if the bug were
234	* to happen while we're holding the mm lock for writing.
235	*
236	* So this checks the exception tables on kernel faults in
237	* order to only do this all for instructions that are actually
238	* expected to fault.
239	*
240	* We can also actually take the mm lock for writing if we
241	* need to extend the vma, which helps the VM layer a lot.
242	*/
243	struct vm_area_struct lock_mm_and_find_vma(struct* mm_struct *mm,
244	unsigned long addr, struct pt_regs *regs)
245	{
246	struct vm_area_struct *vma;
247
248	if (!get_mmap_lock_carefully(mm, regs))
249	return NULL;
250
251	vma = find_vma(mm, addr);
252	if (likely(vma && (vma->vm_start <= addr)))
253	return vma;
254
255	/*
256	* Well, dang. We might still be successful, but only
257	* if we can extend a vma to do so.
258	*/
259	if (!vma \|\| !(vma->vm_flags & VM_GROWSDOWN)) {
260	mmap_read_unlock(mm);
261	return NULL;
262	}
263
264	/*
265	* We can try to upgrade the mmap lock atomically,
266	* in which case we can continue to use the vma
267	* we already looked up.
268	*
269	* Otherwise we'll have to drop the mmap lock and
270	* re-take it, and also look up the vma again,
271	* re-checking it.
272	*/
273	if (!mmap_upgrade_trylock(mm)) {
274	if (!upgrade_mmap_lock_carefully(mm, regs))
275	return NULL;
276
277	vma = find_vma(mm, addr);
278	if (!vma)
279	goto fail;
280	if (vma->vm_start <= addr)
281	goto success;
282	if (!(vma->vm_flags & VM_GROWSDOWN))
283	goto fail;
284	}
285
286	if (expand_stack_locked(vma, address: addr))
287	goto fail;
288
289	success:
290	mmap_write_downgrade(mm);
291	return vma;
292
293	fail:
294	mmap_write_unlock(mm);
295	return NULL;
296	}
297	#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
298
299	#else /* CONFIG_MMU */
300
/*
 * At least xtensa ends up having protection faults even with no
 * MMU.. No stack expansion, at least.
 *
 * @mm:   address space to lock and search
 * @addr: faulting address
 * @regs: unused in this variant
 *
 * Returns the vma found by vma_lookup() with the mmap lock held for reading,
 * or NULL with the lock released.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	/* Keep the lock only when there is a vma to hand back. */
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}
316
317	#endif /* CONFIG_MMU */
318

source code of linux/mm/mmap_lock.c