// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/mm/mmu_notifier.c
 *
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright (C) 2008 SGI
 *             Christoph Lameter <cl@linux.com>
 */

#include <linux/rculist.h>
#include <linux/mmu_notifier.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/interval_tree.h>
#include <linux/srcu.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

/* global SRCU for all MMs */
DEFINE_STATIC_SRCU(srcu);

#ifdef CONFIG_LOCKDEP
struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
	.name = "mmu_notifier_invalidate_range_start"
};
#endif

/*
 * The mmu_notifier_subscriptions structure is allocated and installed in
 * mm->notifier_subscriptions inside the mm_take_all_locks() protected
 * critical section and it's released only when mm_count reaches zero
 * in mmdrop().
 */
struct mmu_notifier_subscriptions {
	/* all mmu notifiers registered in this mm are queued in this list */
	struct hlist_head list;
	bool has_itree;
	/* to serialize the list modifications and hlist_unhashed */
	spinlock_t lock;
	unsigned long invalidate_seq;
	unsigned long active_invalidate_ranges;
	struct rb_root_cached itree;
	wait_queue_head_t wq;
	struct hlist_head deferred_list;
};

/*
 * This is a collision-retry read-side/write-side 'lock', a lot like a
 * seqcount, however this allows multiple write-sides to hold it at
 * once. Conceptually the write side is protecting the values of the PTEs in
 * this mm, such that PTEs cannot be read into SPTEs (shadow PTEs) while any
 * writer exists.
 *
 * Note that the core mm creates nested invalidate_range_start()/end() regions
 * within the same thread, and runs invalidate_range_start()/end() in parallel
 * on multiple CPUs. This is designed to not reduce concurrency or block
 * progress on the mm side.
 *
 * As a secondary function, holding the full write side also serves to prevent
 * writers for the itree; this is an optimization to avoid extra locking
 * during invalidate_range_start/end notifiers.
 *
 * The write side has two states, fully excluded:
 *  - subscriptions->active_invalidate_ranges != 0
 *  - subscriptions->invalidate_seq & 1 == True (odd)
 *  - some range on the mm_struct is being invalidated
 *  - the itree is not allowed to change
 *
 * And partially excluded:
 *  - subscriptions->active_invalidate_ranges != 0
 *  - subscriptions->invalidate_seq & 1 == False (even)
 *  - some range on the mm_struct is being invalidated
 *  - the itree is allowed to change
 *
 * Operations on notifier_subscriptions->invalidate_seq (under spinlock):
 *    seq |= 1  # Begin writing
 *    seq++     # Release the writing state
 *    seq & 1   # True if a writer exists
 *
 * The latter state avoids some expensive work on inv_end in the common case of
 * no mmu_interval_notifier monitoring the VA.
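 *
 * A rough sketch of invalidate_seq across one start/end cycle while an
 * interval notifier is monitoring the VA (the numeric values are
 * illustrative only, not what a given mm will actually observe):
 *
 *	idle:                      invalidate_seq == 2  (even, no writer)
 *	mn_itree_inv_start_range:  invalidate_seq == 3  (odd, itree frozen)
 *	mn_itree_inv_end:          invalidate_seq == 4  (even, deferred list run)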
 */
static bool
mn_itree_is_invalidating(struct mmu_notifier_subscriptions *subscriptions)
{
	lockdep_assert_held(&subscriptions->lock);
	return subscriptions->invalidate_seq & 1;
}

static struct mmu_interval_notifier *
mn_itree_inv_start_range(struct mmu_notifier_subscriptions *subscriptions,
			 const struct mmu_notifier_range *range,
			 unsigned long *seq)
{
	struct interval_tree_node *node;
	struct mmu_interval_notifier *res = NULL;

	spin_lock(&subscriptions->lock);
	subscriptions->active_invalidate_ranges++;
	node = interval_tree_iter_first(&subscriptions->itree, range->start,
					range->end - 1);
	if (node) {
		subscriptions->invalidate_seq |= 1;
		res = container_of(node, struct mmu_interval_notifier,
				   interval_tree);
	}

	*seq = subscriptions->invalidate_seq;
	spin_unlock(&subscriptions->lock);
	return res;
}

static struct mmu_interval_notifier *
mn_itree_inv_next(struct mmu_interval_notifier *interval_sub,
		  const struct mmu_notifier_range *range)
{
	struct interval_tree_node *node;

	node = interval_tree_iter_next(&interval_sub->interval_tree,
				       range->start, range->end - 1);
	if (!node)
		return NULL;
	return container_of(node, struct mmu_interval_notifier, interval_tree);
}

static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
{
	struct mmu_interval_notifier *interval_sub;
	struct hlist_node *next;

	spin_lock(&subscriptions->lock);
	if (--subscriptions->active_invalidate_ranges ||
	    !mn_itree_is_invalidating(subscriptions)) {
		spin_unlock(&subscriptions->lock);
		return;
	}

	/* Make invalidate_seq even */
	subscriptions->invalidate_seq++;

	/*
	 * The inv_end incorporates a deferred mechanism like rtnl_unlock().
	 * Adds and removes are queued until the final inv_end happens then
	 * they are progressed. This arrangement for tree updates is used to
	 * avoid using a blocking lock during invalidate_range_start.
	 */
	hlist_for_each_entry_safe(interval_sub, next,
				  &subscriptions->deferred_list,
				  deferred_item) {
		if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb))
			interval_tree_insert(&interval_sub->interval_tree,
					     &subscriptions->itree);
		else
			interval_tree_remove(&interval_sub->interval_tree,
					     &subscriptions->itree);
		hlist_del(&interval_sub->deferred_item);
	}
	spin_unlock(&subscriptions->lock);

	wake_up_all(&subscriptions->wq);
}

/**
 * mmu_interval_read_begin - Begin a read side critical section against a VA
 *                           range
 * @interval_sub: The interval subscription
 *
 * mmu_interval_read_begin()/mmu_interval_read_retry() implement a
 * collision-retry scheme similar to seqcount for the VA range under
 * subscription. If the mm invokes invalidation during the critical section
 * then mmu_interval_read_retry() will return true.
 *
 * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs
 * requires a blocking context. The critical region formed by this can sleep,
 * and the required 'user_lock' can also be a sleeping lock.
 *
 * The caller is required to provide a 'user_lock' to serialize both teardown
 * and setup.
 *
 * The return value should be passed to mmu_interval_read_retry().
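 *
 * A minimal usage sketch; 'user_lock' and the SPTE build/commit steps are
 * driver-specific names and purely illustrative:
 *
 *	again:
 *		seq = mmu_interval_read_begin(&interval_sub);
 *		// build the new SPTEs, possibly sleeping
 *		lock(user_lock);
 *		if (mmu_interval_read_retry(&interval_sub, seq)) {
 *			unlock(user_lock);
 *			goto again;
 *		}
 *		// commit the SPTEs to the secondary MMU
 *		unlock(user_lock);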
 */
unsigned long
mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
{
	struct mmu_notifier_subscriptions *subscriptions =
		interval_sub->mm->notifier_subscriptions;
	unsigned long seq;
	bool is_invalidating;

	/*
	 * If the subscription has a different seq value under the user_lock
	 * than we started with then it has collided.
	 *
	 * If the subscription currently has the same seq value as the
	 * subscriptions seq, then it is currently between
	 * invalidate_start/end and is colliding.
	 *
	 * The locking looks broadly like this:
	 *    mn_itree_inv_start():               mmu_interval_read_begin():
	 *                                        spin_lock
	 *                                         seq = READ_ONCE(interval_sub->invalidate_seq);
	 *                                         seq == subs->invalidate_seq
	 *                                        spin_unlock
	 *    spin_lock
	 *     seq = ++subscriptions->invalidate_seq
	 *    spin_unlock
	 *     op->invalidate():
	 *       user_lock
	 *        mmu_interval_set_seq()
	 *         interval_sub->invalidate_seq = seq
	 *       user_unlock
	 *
	 *                     [Required: mmu_interval_read_retry() == true]
	 *
	 *   mn_itree_inv_end():
	 *    spin_lock
	 *     seq = ++subscriptions->invalidate_seq
	 *    spin_unlock
	 *
	 *                                        user_lock
	 *                                         mmu_interval_read_retry():
	 *                                          interval_sub->invalidate_seq != seq
	 *                                        user_unlock
	 *
	 * Barriers are not needed here as any races here are closed by an
	 * eventual mmu_interval_read_retry(), which provides a barrier via the
	 * user_lock.
	 */
	spin_lock(&subscriptions->lock);
	/* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
	seq = READ_ONCE(interval_sub->invalidate_seq);
	is_invalidating = seq == subscriptions->invalidate_seq;
	spin_unlock(&subscriptions->lock);

	/*
	 * interval_sub->invalidate_seq must always be set to an odd value via
	 * mmu_interval_set_seq() using the provided cur_seq from
	 * mn_itree_inv_start_range(). This ensures that if seq does wrap we
	 * will always clear the below sleep in some reasonable time as
	 * subscriptions->invalidate_seq is even in the idle state.
	 */
	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
	if (is_invalidating)
		wait_event(subscriptions->wq,
			   READ_ONCE(subscriptions->invalidate_seq) != seq);

	/*
	 * Notice that mmu_interval_read_retry() can already be true at this
	 * point; avoiding loops here allows the caller to provide a global
	 * time bound.
	 */

	return seq;
}
EXPORT_SYMBOL_GPL(mmu_interval_read_begin);

static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
			     struct mm_struct *mm)
{
	struct mmu_notifier_range range = {
		.flags = MMU_NOTIFIER_RANGE_BLOCKABLE,
		.event = MMU_NOTIFY_RELEASE,
		.mm = mm,
		.start = 0,
		.end = ULONG_MAX,
	};
	struct mmu_interval_notifier *interval_sub;
	unsigned long cur_seq;
	bool ret;

	for (interval_sub =
		     mn_itree_inv_start_range(subscriptions, &range, &cur_seq);
	     interval_sub;
	     interval_sub = mn_itree_inv_next(interval_sub, &range)) {
		ret = interval_sub->ops->invalidate(interval_sub, &range,
						    cur_seq);
		WARN_ON(!ret);
	}

	mn_itree_inv_end(subscriptions);
}

/*
 * This function can't run concurrently against mmu_notifier_register
 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
 * in parallel despite there being no task using this mm any more,
 * through the vmas outside of the exit_mmap context, such as with
 * vmtruncate. This serializes against mmu_notifier_unregister with
 * the notifier_subscriptions->lock in addition to SRCU and it serializes
 * against the other mmu notifiers with SRCU. struct mmu_notifier_subscriptions
 * can't go away from under us as exit_mmap holds an mm_count pin
 * itself.
 */
static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions,
			     struct mm_struct *mm)
{
	struct mmu_notifier *subscription;
	int id;

	/*
	 * SRCU here will block mmu_notifier_unregister until
	 * ->release returns.
	 */
	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
				 srcu_read_lock_held(&srcu))
		/*
		 * If ->release runs before mmu_notifier_unregister it must be
		 * handled, as it's the only way for the driver to flush all
		 * existing sptes and stop the driver from establishing any more
		 * sptes before all the pages in the mm are freed.
		 */
		if (subscription->ops->release)
			subscription->ops->release(subscription, mm);

	spin_lock(&subscriptions->lock);
	while (unlikely(!hlist_empty(&subscriptions->list))) {
		subscription = hlist_entry(subscriptions->list.first,
					   struct mmu_notifier, hlist);
		/*
		 * We arrived before mmu_notifier_unregister so
		 * mmu_notifier_unregister will do nothing other than to wait
		 * for ->release to finish and for mmu_notifier_unregister to
		 * return.
		 */
		hlist_del_init_rcu(&subscription->hlist);
	}
	spin_unlock(&subscriptions->lock);
	srcu_read_unlock(&srcu, id);

	/*
	 * synchronize_srcu here prevents mmu_notifier_release from returning to
	 * exit_mmap (which would proceed with freeing all pages in the mm)
	 * until the ->release method returns, if it was invoked by
	 * mmu_notifier_unregister.
	 *
	 * The notifier_subscriptions can't go away from under us because
	 * one mm_count is held by exit_mmap.
	 */
	synchronize_srcu(&srcu);
}

void __mmu_notifier_release(struct mm_struct *mm)
{
	struct mmu_notifier_subscriptions *subscriptions =
		mm->notifier_subscriptions;

	if (subscriptions->has_itree)
		mn_itree_release(subscriptions, mm);

	if (!hlist_empty(&subscriptions->list))
		mn_hlist_release(subscriptions, mm);
}

/*
 * If no young bitflag is supported by the hardware, ->clear_flush_young can
 * unmap the address and return 1 or 0 depending on whether the mapping
 * previously existed or not.
 */
int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
				     unsigned long start,
				     unsigned long end)
{
	struct mmu_notifier *subscription;
	int young = 0, id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(subscription,
				 &mm->notifier_subscriptions->list, hlist,
				 srcu_read_lock_held(&srcu)) {
		if (subscription->ops->clear_flush_young)
			young |= subscription->ops->clear_flush_young(
				subscription, mm, start, end);
	}
	srcu_read_unlock(&srcu, id);

	return young;
}

int __mmu_notifier_clear_young(struct mm_struct *mm,
			       unsigned long start,
			       unsigned long end)
{
	struct mmu_notifier *subscription;
	int young = 0, id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(subscription,
				 &mm->notifier_subscriptions->list, hlist,
				 srcu_read_lock_held(&srcu)) {
		if (subscription->ops->clear_young)
			young |= subscription->ops->clear_young(subscription,
								mm, start, end);
	}
	srcu_read_unlock(&srcu, id);

	return young;
}

int __mmu_notifier_test_young(struct mm_struct *mm,
			      unsigned long address)
{
	struct mmu_notifier *subscription;
	int young = 0, id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(subscription,
				 &mm->notifier_subscriptions->list, hlist,
				 srcu_read_lock_held(&srcu)) {
		if (subscription->ops->test_young) {
			young = subscription->ops->test_young(subscription, mm,
							      address);
			if (young)
				break;
		}
	}
	srcu_read_unlock(&srcu, id);

	return young;
}

void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
			       pte_t pte)
{
	struct mmu_notifier *subscription;
	int id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(subscription,
				 &mm->notifier_subscriptions->list, hlist,
				 srcu_read_lock_held(&srcu)) {
		if (subscription->ops->change_pte)
			subscription->ops->change_pte(subscription, mm, address,
						      pte);
	}
	srcu_read_unlock(&srcu, id);
}

static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
			       const struct mmu_notifier_range *range)
{
	struct mmu_interval_notifier *interval_sub;
	unsigned long cur_seq;

	for (interval_sub =
		     mn_itree_inv_start_range(subscriptions, range, &cur_seq);
	     interval_sub;
	     interval_sub = mn_itree_inv_next(interval_sub, range)) {
		bool ret;

		ret = interval_sub->ops->invalidate(interval_sub, range,
						    cur_seq);
		if (!ret) {
			if (WARN_ON(mmu_notifier_range_blockable(range)))
				continue;
			goto out_would_block;
		}
	}
	return 0;

out_would_block:
	/*
	 * On -EAGAIN the non-blocking caller is not allowed to call
	 * invalidate_range_end()
	 */
	mn_itree_inv_end(subscriptions);
	return -EAGAIN;
}

static int mn_hlist_invalidate_range_start(
	struct mmu_notifier_subscriptions *subscriptions,
	struct mmu_notifier_range *range)
{
	struct mmu_notifier *subscription;
	int ret = 0;
	int id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
				 srcu_read_lock_held(&srcu)) {
		const struct mmu_notifier_ops *ops = subscription->ops;

		if (ops->invalidate_range_start) {
			int _ret;

			if (!mmu_notifier_range_blockable(range))
				non_block_start();
			_ret = ops->invalidate_range_start(subscription, range);
			if (!mmu_notifier_range_blockable(range))
				non_block_end();
			if (_ret) {
				pr_info("%pS callback failed with %d in %sblockable context.\n",
					ops->invalidate_range_start, _ret,
					!mmu_notifier_range_blockable(range) ?
						"non-" : "");
				WARN_ON(mmu_notifier_range_blockable(range) ||
					_ret != -EAGAIN);
				/*
				 * We call all the notifiers on any EAGAIN,
				 * there is no way for a notifier to know if
				 * its start method failed, thus a start that
				 * does EAGAIN can't also do end.
				 */
				WARN_ON(ops->invalidate_range_end);
				ret = _ret;
			}
		}
	}

	if (ret) {
		/*
		 * Must be non-blocking to get here. If there are multiple
		 * notifiers and one or more failed start, any that succeeded
		 * start are expecting their end to be called. Do so now.
		 */
		hlist_for_each_entry_rcu(subscription, &subscriptions->list,
					 hlist, srcu_read_lock_held(&srcu)) {
			if (!subscription->ops->invalidate_range_end)
				continue;

			subscription->ops->invalidate_range_end(subscription,
								range);
		}
	}
	srcu_read_unlock(&srcu, id);

	return ret;
}

int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
	struct mmu_notifier_subscriptions *subscriptions =
		range->mm->notifier_subscriptions;
	int ret;

	if (subscriptions->has_itree) {
		ret = mn_itree_invalidate(subscriptions, range);
		if (ret)
			return ret;
	}
	if (!hlist_empty(&subscriptions->list))
		return mn_hlist_invalidate_range_start(subscriptions, range);
	return 0;
}

static void
mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
			struct mmu_notifier_range *range)
{
	struct mmu_notifier *subscription;
	int id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
				 srcu_read_lock_held(&srcu)) {
		if (subscription->ops->invalidate_range_end) {
			if (!mmu_notifier_range_blockable(range))
				non_block_start();
			subscription->ops->invalidate_range_end(subscription,
								range);
			if (!mmu_notifier_range_blockable(range))
				non_block_end();
		}
	}
	srcu_read_unlock(&srcu, id);
}

void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
	struct mmu_notifier_subscriptions *subscriptions =
		range->mm->notifier_subscriptions;

	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
	if (subscriptions->has_itree)
		mn_itree_inv_end(subscriptions);

	if (!hlist_empty(&subscriptions->list))
		mn_hlist_invalidate_end(subscriptions, range);
	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}

void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	struct mmu_notifier *subscription;
	int id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(subscription,
				 &mm->notifier_subscriptions->list, hlist,
				 srcu_read_lock_held(&srcu)) {
		if (subscription->ops->arch_invalidate_secondary_tlbs)
			subscription->ops->arch_invalidate_secondary_tlbs(
				subscription, mm,
				start, end);
	}
	srcu_read_unlock(&srcu, id);
}

/*
 * Same as mmu_notifier_register but here the caller must hold the mmap_lock in
 * write mode. A NULL subscription signals the notifier is being registered
 * for itree mode.
 */
int __mmu_notifier_register(struct mmu_notifier *subscription,
			    struct mm_struct *mm)
{
	struct mmu_notifier_subscriptions *subscriptions = NULL;
	int ret;

	mmap_assert_write_locked(mm);
	BUG_ON(atomic_read(&mm->mm_users) <= 0);

	/*
	 * Subsystems should only register for invalidate_secondary_tlbs() or
	 * invalidate_range_start()/end() callbacks, not both.
	 */
	if (WARN_ON_ONCE(subscription &&
			 (subscription->ops->arch_invalidate_secondary_tlbs &&
			  (subscription->ops->invalidate_range_start ||
			   subscription->ops->invalidate_range_end))))
		return -EINVAL;

	if (!mm->notifier_subscriptions) {
		/*
		 * kmalloc cannot be called under mm_take_all_locks(), but we
		 * know that mm->notifier_subscriptions can't change while we
		 * hold the write side of the mmap_lock.
		 */
		subscriptions = kzalloc(
			sizeof(struct mmu_notifier_subscriptions), GFP_KERNEL);
		if (!subscriptions)
			return -ENOMEM;

		INIT_HLIST_HEAD(&subscriptions->list);
		spin_lock_init(&subscriptions->lock);
		subscriptions->invalidate_seq = 2;
		subscriptions->itree = RB_ROOT_CACHED;
		init_waitqueue_head(&subscriptions->wq);
		INIT_HLIST_HEAD(&subscriptions->deferred_list);
	}

	ret = mm_take_all_locks(mm);
	if (unlikely(ret))
		goto out_clean;

	/*
	 * Serialize the update against mmu_notifier_unregister. A
	 * side note: mmu_notifier_release can't run concurrently with
	 * us because we hold the mm_users pin (either implicitly as
	 * current->mm or explicitly with get_task_mm() or similar).
	 * We can't race against any other mmu notifier method either
	 * thanks to mm_take_all_locks().
	 *
	 * release semantics on the initialization of the
	 * mmu_notifier_subscriptions's contents are provided for unlocked
	 * readers. acquire can only be used while holding the mmgrab or
	 * mmget, and is safe because once created the
	 * mmu_notifier_subscriptions is not freed until the mm is destroyed.
	 * As above, users holding the mmap_lock or one of the
	 * mm_take_all_locks() do not need to use acquire semantics.
	 */
	if (subscriptions)
		smp_store_release(&mm->notifier_subscriptions, subscriptions);

	if (subscription) {
		/* Pairs with the mmdrop in mmu_notifier_unregister_* */
		mmgrab(mm);
		subscription->mm = mm;
		subscription->users = 1;

		spin_lock(&mm->notifier_subscriptions->lock);
		hlist_add_head_rcu(&subscription->hlist,
				   &mm->notifier_subscriptions->list);
		spin_unlock(&mm->notifier_subscriptions->lock);
	} else
		mm->notifier_subscriptions->has_itree = true;

	mm_drop_all_locks(mm);
	BUG_ON(atomic_read(&mm->mm_users) <= 0);
	return 0;

out_clean:
	kfree(subscriptions);
	return ret;
}
EXPORT_SYMBOL_GPL(__mmu_notifier_register);

/**
 * mmu_notifier_register - Register a notifier on a mm
 * @subscription: The notifier to attach
 * @mm: The mm to attach the notifier to
 *
 * Must not hold mmap_lock nor any other VM related lock when calling
 * this registration function. Must also ensure mm_users can't go down
 * to zero while this runs to avoid races with mmu_notifier_release,
 * so mm has to be current->mm or the mm should be pinned safely such
 * as with get_task_mm(). If the mm is not current->mm, the mm_users
 * pin should be released by calling mmput after mmu_notifier_register
 * returns.
 *
 * mmu_notifier_unregister() or mmu_notifier_put() must be always called to
 * unregister the notifier.
 *
 * While the caller holds a mmu_notifier get, the subscription->mm pointer
 * will remain valid, and can be converted to an active mm pointer via
 * mmget_not_zero().
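 *
 * A minimal registration sketch (the ops table, the release handler and the
 * embedding 'struct my_ctx' are hypothetical driver code, not part of this
 * API):
 *
 *	static const struct mmu_notifier_ops my_ops = {
 *		.release = my_release,
 *	};
 *
 *	struct my_ctx {
 *		struct mmu_notifier mn;
 *	};
 *
 *	ctx->mn.ops = &my_ops;
 *	ret = mmu_notifier_register(&ctx->mn, current->mm);
 *	...
 *	mmu_notifier_unregister(&ctx->mn, current->mm);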
 */
int mmu_notifier_register(struct mmu_notifier *subscription,
			  struct mm_struct *mm)
{
	int ret;

	mmap_write_lock(mm);
	ret = __mmu_notifier_register(subscription, mm);
	mmap_write_unlock(mm);
	return ret;
}
EXPORT_SYMBOL_GPL(mmu_notifier_register);

static struct mmu_notifier *
find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
{
	struct mmu_notifier *subscription;

	spin_lock(&mm->notifier_subscriptions->lock);
	hlist_for_each_entry_rcu(subscription,
				 &mm->notifier_subscriptions->list, hlist,
				 lockdep_is_held(&mm->notifier_subscriptions->lock)) {
		if (subscription->ops != ops)
			continue;

		if (likely(subscription->users != UINT_MAX))
			subscription->users++;
		else
			subscription = ERR_PTR(-EOVERFLOW);
		spin_unlock(&mm->notifier_subscriptions->lock);
		return subscription;
	}
	spin_unlock(&mm->notifier_subscriptions->lock);
	return NULL;
}

/**
 * mmu_notifier_get_locked - Return the single struct mmu_notifier for
 *                           the mm & ops
 * @ops: The operations struct being subscribed with
 * @mm: The mm to attach notifiers to
 *
 * This function either allocates a new mmu_notifier via
 * ops->alloc_notifier(), or returns an already existing notifier on the
 * list. The value of the ops pointer is used to determine when two notifiers
 * are the same.
 *
 * Each call to mmu_notifier_get() must be paired with a call to
 * mmu_notifier_put(). The caller must hold the write side of mm->mmap_lock.
 *
 * While the caller holds a mmu_notifier get, the mm pointer will remain
 * valid, and can be converted to an active mm pointer via mmget_not_zero().
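 *
 * A rough calling sketch (my_ops, with its alloc_notifier()/free_notifier()
 * callbacks, is hypothetical driver code):
 *
 *	mmap_write_lock(mm);
 *	subscription = mmu_notifier_get_locked(&my_ops, mm);
 *	mmap_write_unlock(mm);
 *	if (IS_ERR(subscription))
 *		return PTR_ERR(subscription);
 *	...
 *	mmu_notifier_put(subscription);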
 */
struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
					     struct mm_struct *mm)
{
	struct mmu_notifier *subscription;
	int ret;

	mmap_assert_write_locked(mm);

	if (mm->notifier_subscriptions) {
		subscription = find_get_mmu_notifier(mm, ops);
		if (subscription)
			return subscription;
	}

	subscription = ops->alloc_notifier(mm);
	if (IS_ERR(subscription))
		return subscription;
	subscription->ops = ops;
	ret = __mmu_notifier_register(subscription, mm);
	if (ret)
		goto out_free;
	return subscription;
out_free:
	subscription->ops->free_notifier(subscription);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(mmu_notifier_get_locked);

/* this is called after the last mmu_notifier_unregister() returned */
void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
	BUG_ON(!hlist_empty(&mm->notifier_subscriptions->list));
	kfree(mm->notifier_subscriptions);
	mm->notifier_subscriptions = LIST_POISON1; /* debug */
}

/*
 * This releases the mm_count pin automatically and frees the mm
 * structure if it was the last user of it. It serializes against
 * running mmu notifiers with SRCU and against mmu_notifier_unregister
 * with the unregister lock + SRCU. All sptes must be dropped before
 * calling mmu_notifier_unregister. ->release or any other notifier
 * method may be invoked concurrently with mmu_notifier_unregister,
 * and only after mmu_notifier_unregister returned we're guaranteed
 * that ->release or any other method can't run anymore.
 */
void mmu_notifier_unregister(struct mmu_notifier *subscription,
			     struct mm_struct *mm)
{
	BUG_ON(atomic_read(&mm->mm_count) <= 0);

	if (!hlist_unhashed(&subscription->hlist)) {
		/*
		 * SRCU here will force exit_mmap to wait for ->release to
		 * finish before freeing the pages.
		 */
		int id;

		id = srcu_read_lock(&srcu);
		/*
		 * exit_mmap will block in mmu_notifier_release to guarantee
		 * that ->release is called before freeing the pages.
		 */
		if (subscription->ops->release)
			subscription->ops->release(subscription, mm);
		srcu_read_unlock(&srcu, id);

		spin_lock(&mm->notifier_subscriptions->lock);
		/*
		 * Can not use list_del_rcu() since __mmu_notifier_release
		 * can delete it before we hold the lock.
		 */
		hlist_del_init_rcu(&subscription->hlist);
		spin_unlock(&mm->notifier_subscriptions->lock);
	}

	/*
	 * Wait for any running method to finish, of course including
	 * ->release if it was run by mmu_notifier_release instead of us.
	 */
	synchronize_srcu(&srcu);

	BUG_ON(atomic_read(&mm->mm_count) <= 0);

	mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);

static void mmu_notifier_free_rcu(struct rcu_head *rcu)
{
	struct mmu_notifier *subscription =
		container_of(rcu, struct mmu_notifier, rcu);
	struct mm_struct *mm = subscription->mm;

	subscription->ops->free_notifier(subscription);
	/* Pairs with the get in __mmu_notifier_register() */
	mmdrop(mm);
}

/**
 * mmu_notifier_put - Release the reference on the notifier
 * @subscription: The notifier to act on
 *
 * This function must be paired with each mmu_notifier_get(), it releases the
 * reference obtained by the get. If this is the last reference then the
 * process to free the notifier will be run asynchronously.
 *
 * Unlike mmu_notifier_unregister() the get/put flow only calls ops->release
 * when the mm_struct is destroyed. Instead free_notifier is always called to
 * release any resources held by the user.
 *
 * As ops->release is not guaranteed to be called, the user must ensure that
 * all sptes are dropped, and no new sptes can be established before
 * mmu_notifier_put() is called.
 *
 * This function can be called from the ops->release callback, however the
 * caller must still ensure it is called pairwise with mmu_notifier_get().
 *
 * Modules calling this function must call mmu_notifier_synchronize() in
 * their __exit functions to ensure the async work is completed.
 */
void mmu_notifier_put(struct mmu_notifier *subscription)
{
	struct mm_struct *mm = subscription->mm;

	spin_lock(&mm->notifier_subscriptions->lock);
	if (WARN_ON(!subscription->users) || --subscription->users)
		goto out_unlock;
	hlist_del_init_rcu(&subscription->hlist);
	spin_unlock(&mm->notifier_subscriptions->lock);

	call_srcu(&srcu, &subscription->rcu, mmu_notifier_free_rcu);
	return;

out_unlock:
	spin_unlock(&mm->notifier_subscriptions->lock);
}
EXPORT_SYMBOL_GPL(mmu_notifier_put);

static int __mmu_interval_notifier_insert(
	struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
	struct mmu_notifier_subscriptions *subscriptions, unsigned long start,
	unsigned long length, const struct mmu_interval_notifier_ops *ops)
{
	interval_sub->mm = mm;
	interval_sub->ops = ops;
	RB_CLEAR_NODE(&interval_sub->interval_tree.rb);
	interval_sub->interval_tree.start = start;
	/*
	 * Note that the representation of the intervals in the interval tree
	 * considers the ending point as contained in the interval.
	 */
	if (length == 0 ||
	    check_add_overflow(start, length - 1,
			       &interval_sub->interval_tree.last))
		return -EOVERFLOW;

	/* Must call with a mmget() held */
	if (WARN_ON(atomic_read(&mm->mm_users) <= 0))
		return -EINVAL;

	/* pairs with mmdrop in mmu_interval_notifier_remove() */
	mmgrab(mm);

	/*
	 * If some invalidate_range_start/end region is going on in parallel
	 * we don't know what VA ranges are affected, so we must assume this
	 * new range is included.
	 *
	 * If the itree is invalidating then we are not allowed to change
	 * it. Retrying until invalidation is done is tricky due to the
	 * possibility for live lock, instead defer the add to
	 * mn_itree_inv_end() so this algorithm is deterministic.
	 *
	 * In all cases the value for the interval_sub->invalidate_seq should be
	 * odd, see mmu_interval_read_begin()
	 */
	spin_lock(&subscriptions->lock);
	if (subscriptions->active_invalidate_ranges) {
		if (mn_itree_is_invalidating(subscriptions))
			hlist_add_head(&interval_sub->deferred_item,
				       &subscriptions->deferred_list);
		else {
			subscriptions->invalidate_seq |= 1;
			interval_tree_insert(&interval_sub->interval_tree,
					     &subscriptions->itree);
		}
		interval_sub->invalidate_seq = subscriptions->invalidate_seq;
	} else {
		WARN_ON(mn_itree_is_invalidating(subscriptions));
		/*
		 * The starting seq for a subscription not under invalidation
		 * should be odd, not equal to the current invalidate_seq and
		 * invalidate_seq should not 'wrap' to the new seq any time
		 * soon.
		 */
		interval_sub->invalidate_seq =
			subscriptions->invalidate_seq - 1;
		interval_tree_insert(&interval_sub->interval_tree,
				     &subscriptions->itree);
	}
	spin_unlock(&subscriptions->lock);
	return 0;
}

/**
 * mmu_interval_notifier_insert - Insert an interval notifier
 * @interval_sub: Interval subscription to register
 * @start: Starting virtual address to monitor
 * @length: Length of the range to monitor
 * @mm: mm_struct to attach to
 * @ops: Interval notifier operations to be called on matching events
 *
 * This function subscribes the interval notifier for notifications from the
 * mm. Upon return the ops related to mmu_interval_notifier will be called
 * whenever an event that intersects with the given range occurs.
 *
 * Upon return the range_notifier may not be present in the interval tree yet.
 * The caller must use the normal interval notifier read flow via
 * mmu_interval_read_begin() to establish SPTEs for this range.
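 *
 * A minimal usage sketch (my_interval_ops and my_invalidate() are
 * hypothetical driver code; the invalidate callback must call
 * mmu_interval_set_seq() with the cur_seq it is given):
 *
 *	static bool my_invalidate(struct mmu_interval_notifier *interval_sub,
 *				  const struct mmu_notifier_range *range,
 *				  unsigned long cur_seq)
 *	{
 *		// take the driver lock, tear down SPTEs covering the range
 *		mmu_interval_set_seq(interval_sub, cur_seq);
 *		return true;
 *	}
 *
 *	static const struct mmu_interval_notifier_ops my_interval_ops = {
 *		.invalidate = my_invalidate,
 *	};
 *
 *	ret = mmu_interval_notifier_insert(&interval_sub, current->mm,
 *					   start, length, &my_interval_ops);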
 */
int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
				 struct mm_struct *mm, unsigned long start,
				 unsigned long length,
				 const struct mmu_interval_notifier_ops *ops)
{
	struct mmu_notifier_subscriptions *subscriptions;
	int ret;

	might_lock(&mm->mmap_lock);

	subscriptions = smp_load_acquire(&mm->notifier_subscriptions);
	if (!subscriptions || !subscriptions->has_itree) {
		ret = mmu_notifier_register(NULL, mm);
		if (ret)
			return ret;
		subscriptions = mm->notifier_subscriptions;
	}
	return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions,
					      start, length, ops);
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert);

int mmu_interval_notifier_insert_locked(
	struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
	unsigned long start, unsigned long length,
	const struct mmu_interval_notifier_ops *ops)
{
	struct mmu_notifier_subscriptions *subscriptions =
		mm->notifier_subscriptions;
	int ret;

	mmap_assert_write_locked(mm);

	if (!subscriptions || !subscriptions->has_itree) {
		ret = __mmu_notifier_register(NULL, mm);
		if (ret)
			return ret;
		subscriptions = mm->notifier_subscriptions;
	}
	return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions,
					      start, length, ops);
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);

static bool
mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
			  unsigned long seq)
{
	bool ret;

	spin_lock(&subscriptions->lock);
	ret = subscriptions->invalidate_seq != seq;
	spin_unlock(&subscriptions->lock);
	return ret;
}

/**
 * mmu_interval_notifier_remove - Remove an interval notifier
 * @interval_sub: Interval subscription to unregister
 *
 * This function must be paired with mmu_interval_notifier_insert(). It cannot
 * be called from any ops callback.
 *
 * Once this returns ops callbacks are no longer running on other CPUs and
 * will not be called in future.
 */
void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
{
	struct mm_struct *mm = interval_sub->mm;
	struct mmu_notifier_subscriptions *subscriptions =
		mm->notifier_subscriptions;
	unsigned long seq = 0;

	might_sleep();

	spin_lock(&subscriptions->lock);
	if (mn_itree_is_invalidating(subscriptions)) {
		/*
		 * remove is being called after insert put this on the
		 * deferred list, but before the deferred list was processed.
		 */
		if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb)) {
			hlist_del(&interval_sub->deferred_item);
		} else {
			hlist_add_head(&interval_sub->deferred_item,
				       &subscriptions->deferred_list);
			seq = subscriptions->invalidate_seq;
		}
	} else {
		WARN_ON(RB_EMPTY_NODE(&interval_sub->interval_tree.rb));
		interval_tree_remove(&interval_sub->interval_tree,
				     &subscriptions->itree);
	}
	spin_unlock(&subscriptions->lock);

	/*
	 * The possible sleep on progress in the invalidation requires the
	 * caller not hold any locks held by invalidation callbacks.
	 */
	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
	if (seq)
		wait_event(subscriptions->wq,
			   mmu_interval_seq_released(subscriptions, seq));

	/* pairs with mmgrab in mmu_interval_notifier_insert() */
	mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_remove);

/**
 * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed
 *
 * This function ensures that all outstanding async SRCU work from
 * mmu_notifier_put() is completed. After it returns any mmu_notifier_ops
 * associated with an unused mmu_notifier will no longer be called.
 *
 * Before using the caller must ensure that all of its mmu_notifiers have been
 * fully released via mmu_notifier_put().
 *
 * Modules using the mmu_notifier_put() API should call this in their __exit
 * function to avoid module unloading races.
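 *
 * A typical module exit sketch (my_driver_exit() is a hypothetical name):
 *
 *	static void __exit my_driver_exit(void)
 *	{
 *		// all mmu_notifier_put() calls have already been made
 *		mmu_notifier_synchronize();
 *	}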
 */
void mmu_notifier_synchronize(void)
{
	synchronize_srcu(&srcu);
}
EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
