1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. |
4 | * Authors: David Chinner and Glauber Costa |
5 | * |
6 | * Generic LRU infrastructure |
7 | */ |
8 | #include <linux/kernel.h> |
9 | #include <linux/module.h> |
10 | #include <linux/mm.h> |
11 | #include <linux/list_lru.h> |
12 | #include <linux/slab.h> |
13 | #include <linux/mutex.h> |
14 | #include <linux/memcontrol.h> |
15 | #include "slab.h" |
16 | #include "internal.h" |
17 | |
18 | #ifdef CONFIG_MEMCG_KMEM |
19 | static LIST_HEAD(memcg_list_lrus); |
20 | static DEFINE_MUTEX(list_lrus_mutex); |
21 | |
22 | static inline bool list_lru_memcg_aware(struct list_lru *lru) |
23 | { |
24 | return lru->memcg_aware; |
25 | } |
26 | |
27 | static void list_lru_register(struct list_lru *lru) |
28 | { |
29 | if (!list_lru_memcg_aware(lru)) |
30 | return; |
31 | |
32 | mutex_lock(&list_lrus_mutex); |
33 | list_add(&lru->list, &memcg_list_lrus); |
34 | mutex_unlock(&list_lrus_mutex); |
35 | } |
36 | |
37 | static void list_lru_unregister(struct list_lru *lru) |
38 | { |
39 | if (!list_lru_memcg_aware(lru)) |
40 | return; |
41 | |
42 | mutex_lock(&list_lrus_mutex); |
43 | list_del(&lru->list); |
44 | mutex_unlock(&list_lrus_mutex); |
45 | } |
46 | |
47 | static int lru_shrinker_id(struct list_lru *lru) |
48 | { |
49 | return lru->shrinker_id; |
50 | } |
51 | |
52 | static inline struct list_lru_one * |
53 | list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) |
54 | { |
55 | if (list_lru_memcg_aware(lru) && idx >= 0) { |
56 | struct list_lru_memcg *mlru = xa_load(&lru->xa, idx); |
57 | |
58 | return mlru ? &mlru->node[nid] : NULL; |
59 | } |
60 | return &lru->node[nid].lru; |
61 | } |
62 | |
63 | static inline struct list_lru_one * |
64 | list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, |
65 | struct mem_cgroup **memcg_ptr) |
66 | { |
67 | struct list_lru_node *nlru = &lru->node[nid]; |
68 | struct list_lru_one *l = &nlru->lru; |
69 | struct mem_cgroup *memcg = NULL; |
70 | |
71 | if (!list_lru_memcg_aware(lru)) |
72 | goto out; |
73 | |
74 | memcg = mem_cgroup_from_slab_obj(ptr); |
75 | if (!memcg) |
76 | goto out; |
77 | |
78 | l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); |
79 | out: |
80 | if (memcg_ptr) |
81 | *memcg_ptr = memcg; |
82 | return l; |
83 | } |
84 | #else |
85 | static void list_lru_register(struct list_lru *lru) |
86 | { |
87 | } |
88 | |
89 | static void list_lru_unregister(struct list_lru *lru) |
90 | { |
91 | } |
92 | |
93 | static int lru_shrinker_id(struct list_lru *lru) |
94 | { |
95 | return -1; |
96 | } |
97 | |
98 | static inline bool list_lru_memcg_aware(struct list_lru *lru) |
99 | { |
100 | return false; |
101 | } |
102 | |
103 | static inline struct list_lru_one * |
104 | list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) |
105 | { |
106 | return &lru->node[nid].lru; |
107 | } |
108 | |
109 | static inline struct list_lru_one * |
110 | list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, |
111 | struct mem_cgroup **memcg_ptr) |
112 | { |
113 | if (memcg_ptr) |
114 | *memcg_ptr = NULL; |
115 | return &lru->node[nid].lru; |
116 | } |
117 | #endif /* CONFIG_MEMCG_KMEM */ |
118 | |
119 | bool list_lru_add(struct list_lru *lru, struct list_head *item) |
120 | { |
121 | int nid = page_to_nid(virt_to_page(item)); |
122 | struct list_lru_node *nlru = &lru->node[nid]; |
123 | struct mem_cgroup *memcg; |
124 | struct list_lru_one *l; |
125 | |
126 | spin_lock(lock: &nlru->lock); |
127 | if (list_empty(head: item)) { |
128 | l = list_lru_from_kmem(lru, nid, ptr: item, memcg_ptr: &memcg); |
129 | list_add_tail(new: item, head: &l->list); |
130 | /* Set shrinker bit if the first element was added */ |
131 | if (!l->nr_items++) |
132 | set_shrinker_bit(memcg, nid, |
133 | shrinker_id: lru_shrinker_id(lru)); |
134 | nlru->nr_items++; |
135 | spin_unlock(lock: &nlru->lock); |
136 | return true; |
137 | } |
138 | spin_unlock(lock: &nlru->lock); |
139 | return false; |
140 | } |
141 | EXPORT_SYMBOL_GPL(list_lru_add); |
142 | |
143 | bool list_lru_del(struct list_lru *lru, struct list_head *item) |
144 | { |
145 | int nid = page_to_nid(virt_to_page(item)); |
146 | struct list_lru_node *nlru = &lru->node[nid]; |
147 | struct list_lru_one *l; |
148 | |
149 | spin_lock(lock: &nlru->lock); |
150 | if (!list_empty(head: item)) { |
151 | l = list_lru_from_kmem(lru, nid, ptr: item, NULL); |
152 | list_del_init(entry: item); |
153 | l->nr_items--; |
154 | nlru->nr_items--; |
155 | spin_unlock(lock: &nlru->lock); |
156 | return true; |
157 | } |
158 | spin_unlock(lock: &nlru->lock); |
159 | return false; |
160 | } |
161 | EXPORT_SYMBOL_GPL(list_lru_del); |
162 | |
163 | void list_lru_isolate(struct list_lru_one *list, struct list_head *item) |
164 | { |
165 | list_del_init(entry: item); |
166 | list->nr_items--; |
167 | } |
168 | EXPORT_SYMBOL_GPL(list_lru_isolate); |
169 | |
170 | void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, |
171 | struct list_head *head) |
172 | { |
173 | list_move(list: item, head); |
174 | list->nr_items--; |
175 | } |
176 | EXPORT_SYMBOL_GPL(list_lru_isolate_move); |
177 | |
178 | unsigned long list_lru_count_one(struct list_lru *lru, |
179 | int nid, struct mem_cgroup *memcg) |
180 | { |
181 | struct list_lru_one *l; |
182 | long count; |
183 | |
184 | rcu_read_lock(); |
185 | l = list_lru_from_memcg_idx(lru, nid, idx: memcg_kmem_id(memcg)); |
186 | count = l ? READ_ONCE(l->nr_items) : 0; |
187 | rcu_read_unlock(); |
188 | |
189 | if (unlikely(count < 0)) |
190 | count = 0; |
191 | |
192 | return count; |
193 | } |
194 | EXPORT_SYMBOL_GPL(list_lru_count_one); |
195 | |
196 | unsigned long list_lru_count_node(struct list_lru *lru, int nid) |
197 | { |
198 | struct list_lru_node *nlru; |
199 | |
200 | nlru = &lru->node[nid]; |
201 | return nlru->nr_items; |
202 | } |
203 | EXPORT_SYMBOL_GPL(list_lru_count_node); |
204 | |
205 | static unsigned long |
206 | __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, |
207 | list_lru_walk_cb isolate, void *cb_arg, |
208 | unsigned long *nr_to_walk) |
209 | { |
210 | struct list_lru_node *nlru = &lru->node[nid]; |
211 | struct list_lru_one *l; |
212 | struct list_head *item, *n; |
213 | unsigned long isolated = 0; |
214 | |
215 | restart: |
216 | l = list_lru_from_memcg_idx(lru, nid, idx: memcg_idx); |
217 | if (!l) |
218 | goto out; |
219 | |
220 | list_for_each_safe(item, n, &l->list) { |
221 | enum lru_status ret; |
222 | |
223 | /* |
224 | * decrement nr_to_walk first so that we don't livelock if we |
225 | * get stuck on large numbers of LRU_RETRY items |
226 | */ |
227 | if (!*nr_to_walk) |
228 | break; |
229 | --*nr_to_walk; |
230 | |
231 | ret = isolate(item, l, &nlru->lock, cb_arg); |
232 | switch (ret) { |
233 | case LRU_REMOVED_RETRY: |
234 | assert_spin_locked(&nlru->lock); |
235 | fallthrough; |
236 | case LRU_REMOVED: |
237 | isolated++; |
238 | nlru->nr_items--; |
239 | /* |
240 | * If the lru lock has been dropped, our list |
241 | * traversal is now invalid and so we have to |
242 | * restart from scratch. |
243 | */ |
244 | if (ret == LRU_REMOVED_RETRY) |
245 | goto restart; |
246 | break; |
247 | case LRU_ROTATE: |
248 | list_move_tail(list: item, head: &l->list); |
249 | break; |
250 | case LRU_SKIP: |
251 | break; |
252 | case LRU_RETRY: |
253 | /* |
254 | * The lru lock has been dropped, our list traversal is |
255 | * now invalid and so we have to restart from scratch. |
256 | */ |
257 | assert_spin_locked(&nlru->lock); |
258 | goto restart; |
259 | default: |
260 | BUG(); |
261 | } |
262 | } |
263 | out: |
264 | return isolated; |
265 | } |
266 | |
267 | unsigned long |
268 | list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, |
269 | list_lru_walk_cb isolate, void *cb_arg, |
270 | unsigned long *nr_to_walk) |
271 | { |
272 | struct list_lru_node *nlru = &lru->node[nid]; |
273 | unsigned long ret; |
274 | |
275 | spin_lock(lock: &nlru->lock); |
276 | ret = __list_lru_walk_one(lru, nid, memcg_idx: memcg_kmem_id(memcg), isolate, |
277 | cb_arg, nr_to_walk); |
278 | spin_unlock(lock: &nlru->lock); |
279 | return ret; |
280 | } |
281 | EXPORT_SYMBOL_GPL(list_lru_walk_one); |
282 | |
283 | unsigned long |
284 | list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, |
285 | list_lru_walk_cb isolate, void *cb_arg, |
286 | unsigned long *nr_to_walk) |
287 | { |
288 | struct list_lru_node *nlru = &lru->node[nid]; |
289 | unsigned long ret; |
290 | |
291 | spin_lock_irq(lock: &nlru->lock); |
292 | ret = __list_lru_walk_one(lru, nid, memcg_idx: memcg_kmem_id(memcg), isolate, |
293 | cb_arg, nr_to_walk); |
294 | spin_unlock_irq(lock: &nlru->lock); |
295 | return ret; |
296 | } |
297 | |
298 | unsigned long list_lru_walk_node(struct list_lru *lru, int nid, |
299 | list_lru_walk_cb isolate, void *cb_arg, |
300 | unsigned long *nr_to_walk) |
301 | { |
302 | long isolated = 0; |
303 | |
304 | isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, |
305 | nr_to_walk); |
306 | |
307 | #ifdef CONFIG_MEMCG_KMEM |
308 | if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { |
309 | struct list_lru_memcg *mlru; |
310 | unsigned long index; |
311 | |
312 | xa_for_each(&lru->xa, index, mlru) { |
313 | struct list_lru_node *nlru = &lru->node[nid]; |
314 | |
315 | spin_lock(&nlru->lock); |
316 | isolated += __list_lru_walk_one(lru, nid, index, |
317 | isolate, cb_arg, |
318 | nr_to_walk); |
319 | spin_unlock(&nlru->lock); |
320 | |
321 | if (*nr_to_walk <= 0) |
322 | break; |
323 | } |
324 | } |
325 | #endif |
326 | |
327 | return isolated; |
328 | } |
329 | EXPORT_SYMBOL_GPL(list_lru_walk_node); |
330 | |
331 | static void init_one_lru(struct list_lru_one *l) |
332 | { |
333 | INIT_LIST_HEAD(list: &l->list); |
334 | l->nr_items = 0; |
335 | } |
336 | |
337 | #ifdef CONFIG_MEMCG_KMEM |
338 | static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) |
339 | { |
340 | int nid; |
341 | struct list_lru_memcg *mlru; |
342 | |
343 | mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp); |
344 | if (!mlru) |
345 | return NULL; |
346 | |
347 | for_each_node(nid) |
348 | init_one_lru(&mlru->node[nid]); |
349 | |
350 | return mlru; |
351 | } |
352 | |
353 | static void memcg_list_lru_free(struct list_lru *lru, int src_idx) |
354 | { |
355 | struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx); |
356 | |
357 | /* |
358 | * The __list_lru_walk_one() can walk the list of this node. |
359 | * We need kvfree_rcu() here. And the walking of the list |
360 | * is under lru->node[nid]->lock, which can serve as a RCU |
361 | * read-side critical section. |
362 | */ |
363 | if (mlru) |
364 | kvfree_rcu(mlru, rcu); |
365 | } |
366 | |
367 | static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) |
368 | { |
369 | if (memcg_aware) |
370 | xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ); |
371 | lru->memcg_aware = memcg_aware; |
372 | } |
373 | |
374 | static void memcg_destroy_list_lru(struct list_lru *lru) |
375 | { |
376 | XA_STATE(xas, &lru->xa, 0); |
377 | struct list_lru_memcg *mlru; |
378 | |
379 | if (!list_lru_memcg_aware(lru)) |
380 | return; |
381 | |
382 | xas_lock_irq(&xas); |
383 | xas_for_each(&xas, mlru, ULONG_MAX) { |
384 | kfree(mlru); |
385 | xas_store(&xas, NULL); |
386 | } |
387 | xas_unlock_irq(&xas); |
388 | } |
389 | |
390 | static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, |
391 | int src_idx, struct mem_cgroup *dst_memcg) |
392 | { |
393 | struct list_lru_node *nlru = &lru->node[nid]; |
394 | int dst_idx = dst_memcg->kmemcg_id; |
395 | struct list_lru_one *src, *dst; |
396 | |
397 | /* |
398 | * Since list_lru_{add,del} may be called under an IRQ-safe lock, |
399 | * we have to use IRQ-safe primitives here to avoid deadlock. |
400 | */ |
401 | spin_lock_irq(&nlru->lock); |
402 | |
403 | src = list_lru_from_memcg_idx(lru, nid, src_idx); |
404 | if (!src) |
405 | goto out; |
406 | dst = list_lru_from_memcg_idx(lru, nid, dst_idx); |
407 | |
408 | list_splice_init(&src->list, &dst->list); |
409 | |
410 | if (src->nr_items) { |
411 | dst->nr_items += src->nr_items; |
412 | set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); |
413 | src->nr_items = 0; |
414 | } |
415 | out: |
416 | spin_unlock_irq(&nlru->lock); |
417 | } |
418 | |
419 | static void memcg_reparent_list_lru(struct list_lru *lru, |
420 | int src_idx, struct mem_cgroup *dst_memcg) |
421 | { |
422 | int i; |
423 | |
424 | for_each_node(i) |
425 | memcg_reparent_list_lru_node(lru, i, src_idx, dst_memcg); |
426 | |
427 | memcg_list_lru_free(lru, src_idx); |
428 | } |
429 | |
430 | void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) |
431 | { |
432 | struct cgroup_subsys_state *css; |
433 | struct list_lru *lru; |
434 | int src_idx = memcg->kmemcg_id; |
435 | |
436 | /* |
437 | * Change kmemcg_id of this cgroup and all its descendants to the |
438 | * parent's id, and then move all entries from this cgroup's list_lrus |
439 | * to ones of the parent. |
440 | * |
441 | * After we have finished, all list_lrus corresponding to this cgroup |
442 | * are guaranteed to remain empty. So we can safely free this cgroup's |
443 | * list lrus in memcg_list_lru_free(). |
444 | * |
445 | * Changing ->kmemcg_id to the parent can prevent memcg_list_lru_alloc() |
446 | * from allocating list lrus for this cgroup after memcg_list_lru_free() |
447 | * call. |
448 | */ |
449 | rcu_read_lock(); |
450 | css_for_each_descendant_pre(css, &memcg->css) { |
451 | struct mem_cgroup *child; |
452 | |
453 | child = mem_cgroup_from_css(css); |
454 | WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id); |
455 | } |
456 | rcu_read_unlock(); |
457 | |
458 | mutex_lock(&list_lrus_mutex); |
459 | list_for_each_entry(lru, &memcg_list_lrus, list) |
460 | memcg_reparent_list_lru(lru, src_idx, parent); |
461 | mutex_unlock(&list_lrus_mutex); |
462 | } |
463 | |
464 | static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, |
465 | struct list_lru *lru) |
466 | { |
467 | int idx = memcg->kmemcg_id; |
468 | |
469 | return idx < 0 || xa_load(&lru->xa, idx); |
470 | } |
471 | |
472 | int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, |
473 | gfp_t gfp) |
474 | { |
475 | int i; |
476 | unsigned long flags; |
477 | struct list_lru_memcg_table { |
478 | struct list_lru_memcg *mlru; |
479 | struct mem_cgroup *memcg; |
480 | } *table; |
481 | XA_STATE(xas, &lru->xa, 0); |
482 | |
483 | if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) |
484 | return 0; |
485 | |
486 | gfp &= GFP_RECLAIM_MASK; |
487 | table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp); |
488 | if (!table) |
489 | return -ENOMEM; |
490 | |
491 | /* |
492 | * Because the list_lru can be reparented to the parent cgroup's |
493 | * list_lru, we should make sure that this cgroup and all its |
494 | * ancestors have allocated list_lru_memcg. |
495 | */ |
496 | for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) { |
497 | if (memcg_list_lru_allocated(memcg, lru)) |
498 | break; |
499 | |
500 | table[i].memcg = memcg; |
501 | table[i].mlru = memcg_init_list_lru_one(gfp); |
502 | if (!table[i].mlru) { |
503 | while (i--) |
504 | kfree(table[i].mlru); |
505 | kfree(table); |
506 | return -ENOMEM; |
507 | } |
508 | } |
509 | |
510 | xas_lock_irqsave(&xas, flags); |
511 | while (i--) { |
512 | int index = READ_ONCE(table[i].memcg->kmemcg_id); |
513 | struct list_lru_memcg *mlru = table[i].mlru; |
514 | |
515 | xas_set(&xas, index); |
516 | retry: |
517 | if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) { |
518 | kfree(mlru); |
519 | } else { |
520 | xas_store(&xas, mlru); |
521 | if (xas_error(&xas) == -ENOMEM) { |
522 | xas_unlock_irqrestore(&xas, flags); |
523 | if (xas_nomem(&xas, gfp)) |
524 | xas_set_err(&xas, 0); |
525 | xas_lock_irqsave(&xas, flags); |
526 | /* |
527 | * The xas lock has been released, this memcg |
528 | * can be reparented before us. So reload |
529 | * memcg id. More details see the comments |
530 | * in memcg_reparent_list_lrus(). |
531 | */ |
532 | index = READ_ONCE(table[i].memcg->kmemcg_id); |
533 | if (index < 0) |
534 | xas_set_err(&xas, 0); |
535 | else if (!xas_error(&xas) && index != xas.xa_index) |
536 | xas_set(&xas, index); |
537 | goto retry; |
538 | } |
539 | } |
540 | } |
541 | /* xas_nomem() is used to free memory instead of memory allocation. */ |
542 | if (xas.xa_alloc) |
543 | xas_nomem(&xas, gfp); |
544 | xas_unlock_irqrestore(&xas, flags); |
545 | kfree(table); |
546 | |
547 | return xas_error(&xas); |
548 | } |
549 | #else |
550 | static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) |
551 | { |
552 | } |
553 | |
554 | static void memcg_destroy_list_lru(struct list_lru *lru) |
555 | { |
556 | } |
557 | #endif /* CONFIG_MEMCG_KMEM */ |
558 | |
559 | int __list_lru_init(struct list_lru *lru, bool memcg_aware, |
560 | struct lock_class_key *key, struct shrinker *shrinker) |
561 | { |
562 | int i; |
563 | |
564 | #ifdef CONFIG_MEMCG_KMEM |
565 | if (shrinker) |
566 | lru->shrinker_id = shrinker->id; |
567 | else |
568 | lru->shrinker_id = -1; |
569 | #endif |
570 | |
571 | lru->node = kcalloc(nr_node_ids, size: sizeof(*lru->node), GFP_KERNEL); |
572 | if (!lru->node) |
573 | return -ENOMEM; |
574 | |
575 | for_each_node(i) { |
576 | spin_lock_init(&lru->node[i].lock); |
577 | if (key) |
578 | lockdep_set_class(&lru->node[i].lock, key); |
579 | init_one_lru(l: &lru->node[i].lru); |
580 | } |
581 | |
582 | memcg_init_list_lru(lru, memcg_aware); |
583 | list_lru_register(lru); |
584 | |
585 | return 0; |
586 | } |
587 | EXPORT_SYMBOL_GPL(__list_lru_init); |
588 | |
589 | void list_lru_destroy(struct list_lru *lru) |
590 | { |
591 | /* Already destroyed or not yet initialized? */ |
592 | if (!lru->node) |
593 | return; |
594 | |
595 | list_lru_unregister(lru); |
596 | |
597 | memcg_destroy_list_lru(lru); |
598 | kfree(objp: lru->node); |
599 | lru->node = NULL; |
600 | |
601 | #ifdef CONFIG_MEMCG_KMEM |
602 | lru->shrinker_id = -1; |
603 | #endif |
604 | } |
605 | EXPORT_SYMBOL_GPL(list_lru_destroy); |
606 | |