1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Copyright (C) 2009-2011 Red Hat, Inc. |
4 | * |
5 | * Author: Mikulas Patocka <mpatocka@redhat.com> |
6 | * |
7 | * This file is released under the GPL. |
8 | */ |
9 | |
10 | #include <linux/dm-bufio.h> |
11 | |
12 | #include <linux/device-mapper.h> |
13 | #include <linux/dm-io.h> |
14 | #include <linux/slab.h> |
15 | #include <linux/sched/mm.h> |
16 | #include <linux/jiffies.h> |
17 | #include <linux/vmalloc.h> |
18 | #include <linux/shrinker.h> |
19 | #include <linux/module.h> |
20 | #include <linux/rbtree.h> |
21 | #include <linux/stacktrace.h> |
22 | #include <linux/jump_label.h> |
23 | |
24 | #include "dm.h" |
25 | |
26 | #define DM_MSG_PREFIX "bufio" |
27 | |
/*
 * Memory management policy:
 *	The number of buffers is limited to DM_BUFIO_MEMORY_PERCENT of main
 *	memory or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is
 *	lower).
 *	At least DM_BUFIO_MIN_BUFFERS buffers are always allocated.
 *	Background writeback of dirty buffers is governed by
 *	DM_BUFIO_WRITEBACK_RATIO.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_RATIO	3
#define DM_BUFIO_LOW_WATERMARK_RATIO	16

/* Interval, in seconds, between checks of buffer ages. */
#define DM_BUFIO_WORK_TIMER_SECS	30

/* Buffers older than this many seconds are freed. */
#define DM_BUFIO_DEFAULT_AGE_SECS	300

/* Number of bytes of cached data to keep around. */
#define DM_BUFIO_DEFAULT_RETAIN_BYTES	(256 * 1024)

/*
 * Buffer writes are aligned to this boundary.
 * Tests show that SSDs have the highest IOPS when using 4k writes.
 */
#define DM_BUFIO_WRITE_ALIGN		4096
63 | |
/*
 * Values for dm_buffer->list_mode.  LIST_SIZE is the number of lists and
 * is used to size the per-mode lru array.
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2
70 | |
71 | /*--------------------------------------------------------------*/ |
72 | |
73 | /* |
74 | * Rather than use an LRU list, we use a clock algorithm where entries |
75 | * are held in a circular list. When an entry is 'hit' a reference bit |
76 | * is set. The least recently used entry is approximated by running a |
77 | * cursor around the list selecting unreferenced entries. Referenced |
78 | * entries have their reference bit cleared as the cursor passes them. |
79 | */ |
80 | struct lru_entry { |
81 | struct list_head list; |
82 | atomic_t referenced; |
83 | }; |
84 | |
85 | struct lru_iter { |
86 | struct lru *lru; |
87 | struct list_head list; |
88 | struct lru_entry *stop; |
89 | struct lru_entry *e; |
90 | }; |
91 | |
92 | struct lru { |
93 | struct list_head *cursor; |
94 | unsigned long count; |
95 | |
96 | struct list_head iterators; |
97 | }; |
98 | |
99 | /*--------------*/ |
100 | |
101 | static void lru_init(struct lru *lru) |
102 | { |
103 | lru->cursor = NULL; |
104 | lru->count = 0; |
105 | INIT_LIST_HEAD(list: &lru->iterators); |
106 | } |
107 | |
108 | static void lru_destroy(struct lru *lru) |
109 | { |
110 | WARN_ON_ONCE(lru->cursor); |
111 | WARN_ON_ONCE(!list_empty(&lru->iterators)); |
112 | } |
113 | |
114 | /* |
115 | * Insert a new entry into the lru. |
116 | */ |
117 | static void lru_insert(struct lru *lru, struct lru_entry *le) |
118 | { |
119 | /* |
120 | * Don't be tempted to set to 1, makes the lru aspect |
121 | * perform poorly. |
122 | */ |
123 | atomic_set(v: &le->referenced, i: 0); |
124 | |
125 | if (lru->cursor) { |
126 | list_add_tail(new: &le->list, head: lru->cursor); |
127 | } else { |
128 | INIT_LIST_HEAD(list: &le->list); |
129 | lru->cursor = &le->list; |
130 | } |
131 | lru->count++; |
132 | } |
133 | |
134 | /*--------------*/ |
135 | |
136 | /* |
137 | * Convert a list_head pointer to an lru_entry pointer. |
138 | */ |
139 | static inline struct lru_entry *to_le(struct list_head *l) |
140 | { |
141 | return container_of(l, struct lru_entry, list); |
142 | } |
143 | |
144 | /* |
145 | * Initialize an lru_iter and add it to the list of cursors in the lru. |
146 | */ |
147 | static void lru_iter_begin(struct lru *lru, struct lru_iter *it) |
148 | { |
149 | it->lru = lru; |
150 | it->stop = lru->cursor ? to_le(l: lru->cursor->prev) : NULL; |
151 | it->e = lru->cursor ? to_le(l: lru->cursor) : NULL; |
152 | list_add(new: &it->list, head: &lru->iterators); |
153 | } |
154 | |
155 | /* |
156 | * Remove an lru_iter from the list of cursors in the lru. |
157 | */ |
158 | static inline void lru_iter_end(struct lru_iter *it) |
159 | { |
160 | list_del(entry: &it->list); |
161 | } |
162 | |
163 | /* Predicate function type to be used with lru_iter_next */ |
164 | typedef bool (*iter_predicate)(struct lru_entry *le, void *context); |
165 | |
166 | /* |
167 | * Advance the cursor to the next entry that passes the |
168 | * predicate, and return that entry. Returns NULL if the |
169 | * iteration is complete. |
170 | */ |
171 | static struct lru_entry *lru_iter_next(struct lru_iter *it, |
172 | iter_predicate pred, void *context) |
173 | { |
174 | struct lru_entry *e; |
175 | |
176 | while (it->e) { |
177 | e = it->e; |
178 | |
179 | /* advance the cursor */ |
180 | if (it->e == it->stop) |
181 | it->e = NULL; |
182 | else |
183 | it->e = to_le(l: it->e->list.next); |
184 | |
185 | if (pred(e, context)) |
186 | return e; |
187 | } |
188 | |
189 | return NULL; |
190 | } |
191 | |
192 | /* |
193 | * Invalidate a specific lru_entry and update all cursors in |
194 | * the lru accordingly. |
195 | */ |
196 | static void lru_iter_invalidate(struct lru *lru, struct lru_entry *e) |
197 | { |
198 | struct lru_iter *it; |
199 | |
200 | list_for_each_entry(it, &lru->iterators, list) { |
201 | /* Move c->e forwards if necc. */ |
202 | if (it->e == e) { |
203 | it->e = to_le(l: it->e->list.next); |
204 | if (it->e == e) |
205 | it->e = NULL; |
206 | } |
207 | |
208 | /* Move it->stop backwards if necc. */ |
209 | if (it->stop == e) { |
210 | it->stop = to_le(l: it->stop->list.prev); |
211 | if (it->stop == e) |
212 | it->stop = NULL; |
213 | } |
214 | } |
215 | } |
216 | |
217 | /*--------------*/ |
218 | |
219 | /* |
220 | * Remove a specific entry from the lru. |
221 | */ |
222 | static void lru_remove(struct lru *lru, struct lru_entry *le) |
223 | { |
224 | lru_iter_invalidate(lru, e: le); |
225 | if (lru->count == 1) { |
226 | lru->cursor = NULL; |
227 | } else { |
228 | if (lru->cursor == &le->list) |
229 | lru->cursor = lru->cursor->next; |
230 | list_del(entry: &le->list); |
231 | } |
232 | lru->count--; |
233 | } |
234 | |
235 | /* |
236 | * Mark as referenced. |
237 | */ |
238 | static inline void lru_reference(struct lru_entry *le) |
239 | { |
240 | atomic_set(v: &le->referenced, i: 1); |
241 | } |
242 | |
243 | /*--------------*/ |
244 | |
245 | /* |
246 | * Remove the least recently used entry (approx), that passes the predicate. |
247 | * Returns NULL on failure. |
248 | */ |
249 | enum evict_result { |
250 | ER_EVICT, |
251 | ER_DONT_EVICT, |
252 | ER_STOP, /* stop looking for something to evict */ |
253 | }; |
254 | |
255 | typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context); |
256 | |
257 | static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context) |
258 | { |
259 | unsigned long tested = 0; |
260 | struct list_head *h = lru->cursor; |
261 | struct lru_entry *le; |
262 | |
263 | if (!h) |
264 | return NULL; |
265 | /* |
266 | * In the worst case we have to loop around twice. Once to clear |
267 | * the reference flags, and then again to discover the predicate |
268 | * fails for all entries. |
269 | */ |
270 | while (tested < lru->count) { |
271 | le = container_of(h, struct lru_entry, list); |
272 | |
273 | if (atomic_read(v: &le->referenced)) { |
274 | atomic_set(v: &le->referenced, i: 0); |
275 | } else { |
276 | tested++; |
277 | switch (pred(le, context)) { |
278 | case ER_EVICT: |
279 | /* |
280 | * Adjust the cursor, so we start the next |
281 | * search from here. |
282 | */ |
283 | lru->cursor = le->list.next; |
284 | lru_remove(lru, le); |
285 | return le; |
286 | |
287 | case ER_DONT_EVICT: |
288 | break; |
289 | |
290 | case ER_STOP: |
291 | lru->cursor = le->list.next; |
292 | return NULL; |
293 | } |
294 | } |
295 | |
296 | h = h->next; |
297 | |
298 | cond_resched(); |
299 | } |
300 | |
301 | return NULL; |
302 | } |
303 | |
304 | /*--------------------------------------------------------------*/ |
305 | |
/*
 * Buffer state bits (bit numbers; see dm_buffer->state).
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2
312 | |
/*
 * Records how a buffer's data block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 *
 * DATA_MODE_LIMIT is the number of modes.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};
324 | |
325 | struct dm_buffer { |
326 | /* protected by the locks in dm_buffer_cache */ |
327 | struct rb_node node; |
328 | |
329 | /* immutable, so don't need protecting */ |
330 | sector_t block; |
331 | void *data; |
332 | unsigned char data_mode; /* DATA_MODE_* */ |
333 | |
334 | /* |
335 | * These two fields are used in isolation, so do not need |
336 | * a surrounding lock. |
337 | */ |
338 | atomic_t hold_count; |
339 | unsigned long last_accessed; |
340 | |
341 | /* |
342 | * Everything else is protected by the mutex in |
343 | * dm_bufio_client |
344 | */ |
345 | unsigned long state; |
346 | struct lru_entry lru; |
347 | unsigned char list_mode; /* LIST_* */ |
348 | blk_status_t read_error; |
349 | blk_status_t write_error; |
350 | unsigned int dirty_start; |
351 | unsigned int dirty_end; |
352 | unsigned int write_start; |
353 | unsigned int write_end; |
354 | struct list_head write_list; |
355 | struct dm_bufio_client *c; |
356 | void (*end_io)(struct dm_buffer *b, blk_status_t bs); |
357 | #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING |
358 | #define MAX_STACK 10 |
359 | unsigned int stack_len; |
360 | unsigned long stack_entries[MAX_STACK]; |
361 | #endif |
362 | }; |
363 | |
364 | /*--------------------------------------------------------------*/ |
365 | |
366 | /* |
367 | * The buffer cache manages buffers, particularly: |
368 | * - inc/dec of holder count |
369 | * - setting the last_accessed field |
370 | * - maintains clean/dirty state along with lru |
371 | * - selecting buffers that match predicates |
372 | * |
373 | * It does *not* handle: |
374 | * - allocation/freeing of buffers. |
375 | * - IO |
376 | * - Eviction or cache sizing. |
377 | * |
378 | * cache_get() and cache_put() are threadsafe, you do not need to |
379 | * protect these calls with a surrounding mutex. All the other |
380 | * methods are not threadsafe; they do use locking primitives, but |
381 | * only enough to ensure get/put are threadsafe. |
382 | */ |
383 | |
384 | struct buffer_tree { |
385 | struct rw_semaphore lock; |
386 | struct rb_root root; |
387 | } ____cacheline_aligned_in_smp; |
388 | |
389 | struct dm_buffer_cache { |
390 | struct lru lru[LIST_SIZE]; |
391 | /* |
392 | * We spread entries across multiple trees to reduce contention |
393 | * on the locks. |
394 | */ |
395 | unsigned int num_locks; |
396 | struct buffer_tree trees[]; |
397 | }; |
398 | |
399 | static inline unsigned int cache_index(sector_t block, unsigned int num_locks) |
400 | { |
401 | return dm_hash_locks_index(block, num_locks); |
402 | } |
403 | |
404 | static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block) |
405 | { |
406 | down_read(sem: &bc->trees[cache_index(block, num_locks: bc->num_locks)].lock); |
407 | } |
408 | |
409 | static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block) |
410 | { |
411 | up_read(sem: &bc->trees[cache_index(block, num_locks: bc->num_locks)].lock); |
412 | } |
413 | |
414 | static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block) |
415 | { |
416 | down_write(sem: &bc->trees[cache_index(block, num_locks: bc->num_locks)].lock); |
417 | } |
418 | |
419 | static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block) |
420 | { |
421 | up_write(sem: &bc->trees[cache_index(block, num_locks: bc->num_locks)].lock); |
422 | } |
423 | |
424 | /* |
425 | * Sometimes we want to repeatedly get and drop locks as part of an iteration. |
426 | * This struct helps avoid redundant drop and gets of the same lock. |
427 | */ |
428 | struct lock_history { |
429 | struct dm_buffer_cache *cache; |
430 | bool write; |
431 | unsigned int previous; |
432 | unsigned int no_previous; |
433 | }; |
434 | |
435 | static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool write) |
436 | { |
437 | lh->cache = cache; |
438 | lh->write = write; |
439 | lh->no_previous = cache->num_locks; |
440 | lh->previous = lh->no_previous; |
441 | } |
442 | |
443 | static void __lh_lock(struct lock_history *lh, unsigned int index) |
444 | { |
445 | if (lh->write) |
446 | down_write(sem: &lh->cache->trees[index].lock); |
447 | else |
448 | down_read(sem: &lh->cache->trees[index].lock); |
449 | } |
450 | |
451 | static void __lh_unlock(struct lock_history *lh, unsigned int index) |
452 | { |
453 | if (lh->write) |
454 | up_write(sem: &lh->cache->trees[index].lock); |
455 | else |
456 | up_read(sem: &lh->cache->trees[index].lock); |
457 | } |
458 | |
459 | /* |
460 | * Make sure you call this since it will unlock the final lock. |
461 | */ |
462 | static void lh_exit(struct lock_history *lh) |
463 | { |
464 | if (lh->previous != lh->no_previous) { |
465 | __lh_unlock(lh, index: lh->previous); |
466 | lh->previous = lh->no_previous; |
467 | } |
468 | } |
469 | |
470 | /* |
471 | * Named 'next' because there is no corresponding |
472 | * 'up/unlock' call since it's done automatically. |
473 | */ |
474 | static void lh_next(struct lock_history *lh, sector_t b) |
475 | { |
476 | unsigned int index = cache_index(block: b, num_locks: lh->no_previous); /* no_previous is num_locks */ |
477 | |
478 | if (lh->previous != lh->no_previous) { |
479 | if (lh->previous != index) { |
480 | __lh_unlock(lh, index: lh->previous); |
481 | __lh_lock(lh, index); |
482 | lh->previous = index; |
483 | } |
484 | } else { |
485 | __lh_lock(lh, index); |
486 | lh->previous = index; |
487 | } |
488 | } |
489 | |
490 | static inline struct dm_buffer *le_to_buffer(struct lru_entry *le) |
491 | { |
492 | return container_of(le, struct dm_buffer, lru); |
493 | } |
494 | |
495 | static struct dm_buffer *list_to_buffer(struct list_head *l) |
496 | { |
497 | struct lru_entry *le = list_entry(l, struct lru_entry, list); |
498 | |
499 | if (!le) |
500 | return NULL; |
501 | |
502 | return le_to_buffer(le); |
503 | } |
504 | |
505 | static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks) |
506 | { |
507 | unsigned int i; |
508 | |
509 | bc->num_locks = num_locks; |
510 | |
511 | for (i = 0; i < bc->num_locks; i++) { |
512 | init_rwsem(&bc->trees[i].lock); |
513 | bc->trees[i].root = RB_ROOT; |
514 | } |
515 | |
516 | lru_init(lru: &bc->lru[LIST_CLEAN]); |
517 | lru_init(lru: &bc->lru[LIST_DIRTY]); |
518 | } |
519 | |
520 | static void cache_destroy(struct dm_buffer_cache *bc) |
521 | { |
522 | unsigned int i; |
523 | |
524 | for (i = 0; i < bc->num_locks; i++) |
525 | WARN_ON_ONCE(!RB_EMPTY_ROOT(&bc->trees[i].root)); |
526 | |
527 | lru_destroy(lru: &bc->lru[LIST_CLEAN]); |
528 | lru_destroy(lru: &bc->lru[LIST_DIRTY]); |
529 | } |
530 | |
531 | /*--------------*/ |
532 | |
533 | /* |
534 | * not threadsafe, or racey depending how you look at it |
535 | */ |
536 | static inline unsigned long cache_count(struct dm_buffer_cache *bc, int list_mode) |
537 | { |
538 | return bc->lru[list_mode].count; |
539 | } |
540 | |
541 | static inline unsigned long cache_total(struct dm_buffer_cache *bc) |
542 | { |
543 | return cache_count(bc, LIST_CLEAN) + cache_count(bc, LIST_DIRTY); |
544 | } |
545 | |
546 | /*--------------*/ |
547 | |
548 | /* |
549 | * Gets a specific buffer, indexed by block. |
550 | * If the buffer is found then its holder count will be incremented and |
551 | * lru_reference will be called. |
552 | * |
553 | * threadsafe |
554 | */ |
555 | static struct dm_buffer *__cache_get(const struct rb_root *root, sector_t block) |
556 | { |
557 | struct rb_node *n = root->rb_node; |
558 | struct dm_buffer *b; |
559 | |
560 | while (n) { |
561 | b = container_of(n, struct dm_buffer, node); |
562 | |
563 | if (b->block == block) |
564 | return b; |
565 | |
566 | n = block < b->block ? n->rb_left : n->rb_right; |
567 | } |
568 | |
569 | return NULL; |
570 | } |
571 | |
572 | static void __cache_inc_buffer(struct dm_buffer *b) |
573 | { |
574 | atomic_inc(v: &b->hold_count); |
575 | WRITE_ONCE(b->last_accessed, jiffies); |
576 | } |
577 | |
578 | static struct dm_buffer *cache_get(struct dm_buffer_cache *bc, sector_t block) |
579 | { |
580 | struct dm_buffer *b; |
581 | |
582 | cache_read_lock(bc, block); |
583 | b = __cache_get(root: &bc->trees[cache_index(block, num_locks: bc->num_locks)].root, block); |
584 | if (b) { |
585 | lru_reference(le: &b->lru); |
586 | __cache_inc_buffer(b); |
587 | } |
588 | cache_read_unlock(bc, block); |
589 | |
590 | return b; |
591 | } |
592 | |
593 | /*--------------*/ |
594 | |
595 | /* |
596 | * Returns true if the hold count hits zero. |
597 | * threadsafe |
598 | */ |
599 | static bool cache_put(struct dm_buffer_cache *bc, struct dm_buffer *b) |
600 | { |
601 | bool r; |
602 | |
603 | cache_read_lock(bc, block: b->block); |
604 | BUG_ON(!atomic_read(&b->hold_count)); |
605 | r = atomic_dec_and_test(v: &b->hold_count); |
606 | cache_read_unlock(bc, block: b->block); |
607 | |
608 | return r; |
609 | } |
610 | |
611 | /*--------------*/ |
612 | |
/* Eviction predicate operating on buffers rather than lru entries. */
typedef enum evict_result (*b_predicate)(struct dm_buffer *, void *);

/*
 * Evicts a buffer based on a predicate.  The oldest buffer that matches
 * the predicate is selected; on top of the predicate, the selected
 * buffer is required to have a hold_count of zero.
 *
 * This struct is the context handed to __evict_pred.
 */
struct evict_wrapper {
	struct lock_history *lh;	/* tree locking state for the walk */
	b_predicate pred;		/* caller's buffer predicate */
	void *context;			/* opaque argument passed to pred */
};
625 | |
626 | /* |
627 | * Wraps the buffer predicate turning it into an lru predicate. Adds |
628 | * extra test for hold_count. |
629 | */ |
630 | static enum evict_result __evict_pred(struct lru_entry *le, void *context) |
631 | { |
632 | struct evict_wrapper *w = context; |
633 | struct dm_buffer *b = le_to_buffer(le); |
634 | |
635 | lh_next(lh: w->lh, b: b->block); |
636 | |
637 | if (atomic_read(v: &b->hold_count)) |
638 | return ER_DONT_EVICT; |
639 | |
640 | return w->pred(b, w->context); |
641 | } |
642 | |
643 | static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode, |
644 | b_predicate pred, void *context, |
645 | struct lock_history *lh) |
646 | { |
647 | struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context}; |
648 | struct lru_entry *le; |
649 | struct dm_buffer *b; |
650 | |
651 | le = lru_evict(lru: &bc->lru[list_mode], pred: __evict_pred, context: &w); |
652 | if (!le) |
653 | return NULL; |
654 | |
655 | b = le_to_buffer(le); |
656 | /* __evict_pred will have locked the appropriate tree. */ |
657 | rb_erase(&b->node, &bc->trees[cache_index(block: b->block, num_locks: bc->num_locks)].root); |
658 | |
659 | return b; |
660 | } |
661 | |
662 | static struct dm_buffer *cache_evict(struct dm_buffer_cache *bc, int list_mode, |
663 | b_predicate pred, void *context) |
664 | { |
665 | struct dm_buffer *b; |
666 | struct lock_history lh; |
667 | |
668 | lh_init(lh: &lh, cache: bc, write: true); |
669 | b = __cache_evict(bc, list_mode, pred, context, lh: &lh); |
670 | lh_exit(lh: &lh); |
671 | |
672 | return b; |
673 | } |
674 | |
675 | /*--------------*/ |
676 | |
677 | /* |
678 | * Mark a buffer as clean or dirty. Not threadsafe. |
679 | */ |
680 | static void cache_mark(struct dm_buffer_cache *bc, struct dm_buffer *b, int list_mode) |
681 | { |
682 | cache_write_lock(bc, block: b->block); |
683 | if (list_mode != b->list_mode) { |
684 | lru_remove(lru: &bc->lru[b->list_mode], le: &b->lru); |
685 | b->list_mode = list_mode; |
686 | lru_insert(lru: &bc->lru[b->list_mode], le: &b->lru); |
687 | } |
688 | cache_write_unlock(bc, block: b->block); |
689 | } |
690 | |
691 | /*--------------*/ |
692 | |
693 | /* |
694 | * Runs through the lru associated with 'old_mode', if the predicate matches then |
695 | * it moves them to 'new_mode'. Not threadsafe. |
696 | */ |
697 | static void __cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode, |
698 | b_predicate pred, void *context, struct lock_history *lh) |
699 | { |
700 | struct lru_entry *le; |
701 | struct dm_buffer *b; |
702 | struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context}; |
703 | |
704 | while (true) { |
705 | le = lru_evict(lru: &bc->lru[old_mode], pred: __evict_pred, context: &w); |
706 | if (!le) |
707 | break; |
708 | |
709 | b = le_to_buffer(le); |
710 | b->list_mode = new_mode; |
711 | lru_insert(lru: &bc->lru[b->list_mode], le: &b->lru); |
712 | } |
713 | } |
714 | |
715 | static void cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode, |
716 | b_predicate pred, void *context) |
717 | { |
718 | struct lock_history lh; |
719 | |
720 | lh_init(lh: &lh, cache: bc, write: true); |
721 | __cache_mark_many(bc, old_mode, new_mode, pred, context, lh: &lh); |
722 | lh_exit(lh: &lh); |
723 | } |
724 | |
725 | /*--------------*/ |
726 | |
/*
 * Iteration over all clean or all dirty entries: a callback is invoked
 * for each entry and may end the iteration early.  Not threadsafe.
 */

/*
 * Value returned by an iteration callback to say how the iteration
 * should proceed.
 */
enum it_action {
	IT_NEXT,	/* continue with the next entry */
	IT_COMPLETE,	/* stop iterating */
};

typedef enum it_action (*iter_fn)(struct dm_buffer *b, void *context);
742 | |
743 | static void __cache_iterate(struct dm_buffer_cache *bc, int list_mode, |
744 | iter_fn fn, void *context, struct lock_history *lh) |
745 | { |
746 | struct lru *lru = &bc->lru[list_mode]; |
747 | struct lru_entry *le, *first; |
748 | |
749 | if (!lru->cursor) |
750 | return; |
751 | |
752 | first = le = to_le(l: lru->cursor); |
753 | do { |
754 | struct dm_buffer *b = le_to_buffer(le); |
755 | |
756 | lh_next(lh, b: b->block); |
757 | |
758 | switch (fn(b, context)) { |
759 | case IT_NEXT: |
760 | break; |
761 | |
762 | case IT_COMPLETE: |
763 | return; |
764 | } |
765 | cond_resched(); |
766 | |
767 | le = to_le(l: le->list.next); |
768 | } while (le != first); |
769 | } |
770 | |
771 | static void cache_iterate(struct dm_buffer_cache *bc, int list_mode, |
772 | iter_fn fn, void *context) |
773 | { |
774 | struct lock_history lh; |
775 | |
776 | lh_init(lh: &lh, cache: bc, write: false); |
777 | __cache_iterate(bc, list_mode, fn, context, lh: &lh); |
778 | lh_exit(lh: &lh); |
779 | } |
780 | |
781 | /*--------------*/ |
782 | |
783 | /* |
784 | * Passes ownership of the buffer to the cache. Returns false if the |
785 | * buffer was already present (in which case ownership does not pass). |
786 | * eg, a race with another thread. |
787 | * |
788 | * Holder count should be 1 on insertion. |
789 | * |
790 | * Not threadsafe. |
791 | */ |
792 | static bool __cache_insert(struct rb_root *root, struct dm_buffer *b) |
793 | { |
794 | struct rb_node **new = &root->rb_node, *parent = NULL; |
795 | struct dm_buffer *found; |
796 | |
797 | while (*new) { |
798 | found = container_of(*new, struct dm_buffer, node); |
799 | |
800 | if (found->block == b->block) |
801 | return false; |
802 | |
803 | parent = *new; |
804 | new = b->block < found->block ? |
805 | &found->node.rb_left : &found->node.rb_right; |
806 | } |
807 | |
808 | rb_link_node(node: &b->node, parent, rb_link: new); |
809 | rb_insert_color(&b->node, root); |
810 | |
811 | return true; |
812 | } |
813 | |
814 | static bool cache_insert(struct dm_buffer_cache *bc, struct dm_buffer *b) |
815 | { |
816 | bool r; |
817 | |
818 | if (WARN_ON_ONCE(b->list_mode >= LIST_SIZE)) |
819 | return false; |
820 | |
821 | cache_write_lock(bc, block: b->block); |
822 | BUG_ON(atomic_read(&b->hold_count) != 1); |
823 | r = __cache_insert(root: &bc->trees[cache_index(block: b->block, num_locks: bc->num_locks)].root, b); |
824 | if (r) |
825 | lru_insert(lru: &bc->lru[b->list_mode], le: &b->lru); |
826 | cache_write_unlock(bc, block: b->block); |
827 | |
828 | return r; |
829 | } |
830 | |
831 | /*--------------*/ |
832 | |
833 | /* |
834 | * Removes buffer from cache, ownership of the buffer passes back to the caller. |
835 | * Fails if the hold_count is not one (ie. the caller holds the only reference). |
836 | * |
837 | * Not threadsafe. |
838 | */ |
839 | static bool cache_remove(struct dm_buffer_cache *bc, struct dm_buffer *b) |
840 | { |
841 | bool r; |
842 | |
843 | cache_write_lock(bc, block: b->block); |
844 | |
845 | if (atomic_read(v: &b->hold_count) != 1) { |
846 | r = false; |
847 | } else { |
848 | r = true; |
849 | rb_erase(&b->node, &bc->trees[cache_index(block: b->block, num_locks: bc->num_locks)].root); |
850 | lru_remove(lru: &bc->lru[b->list_mode], le: &b->lru); |
851 | } |
852 | |
853 | cache_write_unlock(bc, block: b->block); |
854 | |
855 | return r; |
856 | } |
857 | |
858 | /*--------------*/ |
859 | |
860 | typedef void (*b_release)(struct dm_buffer *); |
861 | |
862 | static struct dm_buffer *__find_next(struct rb_root *root, sector_t block) |
863 | { |
864 | struct rb_node *n = root->rb_node; |
865 | struct dm_buffer *b; |
866 | struct dm_buffer *best = NULL; |
867 | |
868 | while (n) { |
869 | b = container_of(n, struct dm_buffer, node); |
870 | |
871 | if (b->block == block) |
872 | return b; |
873 | |
874 | if (block <= b->block) { |
875 | n = n->rb_left; |
876 | best = b; |
877 | } else { |
878 | n = n->rb_right; |
879 | } |
880 | } |
881 | |
882 | return best; |
883 | } |
884 | |
885 | static void __remove_range(struct dm_buffer_cache *bc, |
886 | struct rb_root *root, |
887 | sector_t begin, sector_t end, |
888 | b_predicate pred, b_release release) |
889 | { |
890 | struct dm_buffer *b; |
891 | |
892 | while (true) { |
893 | cond_resched(); |
894 | |
895 | b = __find_next(root, block: begin); |
896 | if (!b || (b->block >= end)) |
897 | break; |
898 | |
899 | begin = b->block + 1; |
900 | |
901 | if (atomic_read(v: &b->hold_count)) |
902 | continue; |
903 | |
904 | if (pred(b, NULL) == ER_EVICT) { |
905 | rb_erase(&b->node, root); |
906 | lru_remove(lru: &bc->lru[b->list_mode], le: &b->lru); |
907 | release(b); |
908 | } |
909 | } |
910 | } |
911 | |
912 | static void cache_remove_range(struct dm_buffer_cache *bc, |
913 | sector_t begin, sector_t end, |
914 | b_predicate pred, b_release release) |
915 | { |
916 | unsigned int i; |
917 | |
918 | for (i = 0; i < bc->num_locks; i++) { |
919 | down_write(sem: &bc->trees[i].lock); |
920 | __remove_range(bc, root: &bc->trees[i].root, begin, end, pred, release); |
921 | up_write(sem: &bc->trees[i].lock); |
922 | } |
923 | } |
924 | |
925 | /*----------------------------------------------------------------*/ |
926 | |
927 | /* |
928 | * Linking of buffers: |
929 | * All buffers are linked to buffer_cache with their node field. |
930 | * |
931 | * Clean buffers that are not being written (B_WRITING not set) |
932 | * are linked to lru[LIST_CLEAN] with their lru_list field. |
933 | * |
934 | * Dirty and clean buffers that are being written are linked to |
935 | * lru[LIST_DIRTY] with their lru_list field. When the write |
936 | * finishes, the buffer cannot be relinked immediately (because we |
937 | * are in an interrupt context and relinking requires process |
938 | * context), so some clean-not-writing buffers can be held on |
939 | * dirty_lru too. They are later added to lru in the process |
940 | * context. |
941 | */ |
942 | struct dm_bufio_client { |
943 | struct block_device *bdev; |
944 | unsigned int block_size; |
945 | s8 sectors_per_block_bits; |
946 | |
947 | bool no_sleep; |
948 | struct mutex lock; |
949 | spinlock_t spinlock; |
950 | |
951 | int async_write_error; |
952 | |
953 | void (*alloc_callback)(struct dm_buffer *buf); |
954 | void (*write_callback)(struct dm_buffer *buf); |
955 | struct kmem_cache *slab_buffer; |
956 | struct kmem_cache *slab_cache; |
957 | struct dm_io_client *dm_io; |
958 | |
959 | struct list_head reserved_buffers; |
960 | unsigned int need_reserved_buffers; |
961 | |
962 | unsigned int minimum_buffers; |
963 | |
964 | sector_t start; |
965 | |
966 | struct shrinker *shrinker; |
967 | struct work_struct shrink_work; |
968 | atomic_long_t need_shrink; |
969 | |
970 | wait_queue_head_t free_buffer_wait; |
971 | |
972 | struct list_head client_list; |
973 | |
974 | /* |
975 | * Used by global_cleanup to sort the clients list. |
976 | */ |
977 | unsigned long oldest_buffer; |
978 | |
979 | struct dm_buffer_cache cache; /* must be last member */ |
980 | }; |
981 | |
/* Static key consulted by dm_bufio_lock()/dm_bufio_unlock(); off by default. */
static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);

/*----------------------------------------------------------------*/

/* Evaluates to 1 when current->bio_list is non-NULL, 0 otherwise. */
#define dm_bufio_in_request()	(current->bio_list != NULL)
987 | |
988 | static void dm_bufio_lock(struct dm_bufio_client *c) |
989 | { |
990 | if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) |
991 | spin_lock_bh(lock: &c->spinlock); |
992 | else |
993 | mutex_lock_nested(lock: &c->lock, dm_bufio_in_request()); |
994 | } |
995 | |
996 | static void dm_bufio_unlock(struct dm_bufio_client *c) |
997 | { |
998 | if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) |
999 | spin_unlock_bh(lock: &c->spinlock); |
1000 | else |
1001 | mutex_unlock(lock: &c->lock); |
1002 | } |
1003 | |
1004 | /*----------------------------------------------------------------*/ |
1005 | |
1006 | /* |
1007 | * Default cache size: available memory divided by the ratio. |
1008 | */ |
1009 | static unsigned long dm_bufio_default_cache_size; |
1010 | |
1011 | /* |
1012 | * Total cache size set by the user. |
1013 | */ |
1014 | static unsigned long dm_bufio_cache_size; |
1015 | |
1016 | /* |
1017 | * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change |
1018 | * at any time. If it disagrees, the user has changed cache size. |
1019 | */ |
1020 | static unsigned long dm_bufio_cache_size_latch; |
1021 | |
1022 | static DEFINE_SPINLOCK(global_spinlock); |
1023 | |
1024 | /* |
1025 | * Buffers are freed after this timeout |
1026 | */ |
1027 | static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; |
1028 | static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; |
1029 | |
1030 | static unsigned long dm_bufio_peak_allocated; |
1031 | static unsigned long dm_bufio_allocated_kmem_cache; |
1032 | static unsigned long dm_bufio_allocated_get_free_pages; |
1033 | static unsigned long dm_bufio_allocated_vmalloc; |
1034 | static unsigned long dm_bufio_current_allocated; |
1035 | |
1036 | /*----------------------------------------------------------------*/ |
1037 | |
1038 | /* |
1039 | * The current number of clients. |
1040 | */ |
1041 | static int dm_bufio_client_count; |
1042 | |
1043 | /* |
1044 | * The list of all clients. |
1045 | */ |
1046 | static LIST_HEAD(dm_bufio_all_clients); |
1047 | |
1048 | /* |
1049 | * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count |
1050 | */ |
1051 | static DEFINE_MUTEX(dm_bufio_clients_lock); |
1052 | |
1053 | static struct workqueue_struct *dm_bufio_wq; |
1054 | static struct delayed_work dm_bufio_cleanup_old_work; |
1055 | static struct work_struct dm_bufio_replacement_work; |
1056 | |
1057 | |
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
/*
 * Save the current call stack into b->stack_entries (at most MAX_STACK
 * entries, with a skip count of 2) and store the number of saved entries
 * in b->stack_len.
 */
static void buffer_record_stack(struct dm_buffer *b)
{
	b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
}
#endif
1064 | |
1065 | /*----------------------------------------------------------------*/ |
1066 | |
1067 | static void adjust_total_allocated(struct dm_buffer *b, bool unlink) |
1068 | { |
1069 | unsigned char data_mode; |
1070 | long diff; |
1071 | |
1072 | static unsigned long * const class_ptr[DATA_MODE_LIMIT] = { |
1073 | &dm_bufio_allocated_kmem_cache, |
1074 | &dm_bufio_allocated_get_free_pages, |
1075 | &dm_bufio_allocated_vmalloc, |
1076 | }; |
1077 | |
1078 | data_mode = b->data_mode; |
1079 | diff = (long)b->c->block_size; |
1080 | if (unlink) |
1081 | diff = -diff; |
1082 | |
1083 | spin_lock(lock: &global_spinlock); |
1084 | |
1085 | *class_ptr[data_mode] += diff; |
1086 | |
1087 | dm_bufio_current_allocated += diff; |
1088 | |
1089 | if (dm_bufio_current_allocated > dm_bufio_peak_allocated) |
1090 | dm_bufio_peak_allocated = dm_bufio_current_allocated; |
1091 | |
1092 | if (!unlink) { |
1093 | if (dm_bufio_current_allocated > dm_bufio_cache_size) |
1094 | queue_work(wq: dm_bufio_wq, work: &dm_bufio_replacement_work); |
1095 | } |
1096 | |
1097 | spin_unlock(lock: &global_spinlock); |
1098 | } |
1099 | |
1100 | /* |
1101 | * Change the number of clients and recalculate per-client limit. |
1102 | */ |
1103 | static void __cache_size_refresh(void) |
1104 | { |
1105 | if (WARN_ON(!mutex_is_locked(&dm_bufio_clients_lock))) |
1106 | return; |
1107 | if (WARN_ON(dm_bufio_client_count < 0)) |
1108 | return; |
1109 | |
1110 | dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size); |
1111 | |
1112 | /* |
1113 | * Use default if set to 0 and report the actual cache size used. |
1114 | */ |
1115 | if (!dm_bufio_cache_size_latch) { |
1116 | (void)cmpxchg(&dm_bufio_cache_size, 0, |
1117 | dm_bufio_default_cache_size); |
1118 | dm_bufio_cache_size_latch = dm_bufio_default_cache_size; |
1119 | } |
1120 | } |
1121 | |
1122 | /* |
1123 | * Allocating buffer data. |
1124 | * |
1125 | * Small buffers are allocated with kmem_cache, to use space optimally. |
1126 | * |
1127 | * For large buffers, we choose between get_free_pages and vmalloc. |
1128 | * Each has advantages and disadvantages. |
1129 | * |
1130 | * __get_free_pages can randomly fail if the memory is fragmented. |
1131 | * __vmalloc won't randomly fail, but vmalloc space is limited (it may be |
1132 | * as low as 128M) so using it for caching is not appropriate. |
1133 | * |
1134 | * If the allocation may fail we use __get_free_pages. Memory fragmentation |
1135 | * won't have a fatal effect here, but it just causes flushes of some other |
1136 | * buffers and more I/O will be performed. Don't use __get_free_pages if it |
1137 | * always fails (i.e. order > MAX_ORDER). |
1138 | * |
1139 | * If the allocation shouldn't fail we use __vmalloc. This is only for the |
1140 | * initial reserve allocation, so there's no risk of wasting all vmalloc |
1141 | * space. |
1142 | */ |
1143 | static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, |
1144 | unsigned char *data_mode) |
1145 | { |
1146 | if (unlikely(c->slab_cache != NULL)) { |
1147 | *data_mode = DATA_MODE_SLAB; |
1148 | return kmem_cache_alloc(cachep: c->slab_cache, flags: gfp_mask); |
1149 | } |
1150 | |
1151 | if (c->block_size <= KMALLOC_MAX_SIZE && |
1152 | gfp_mask & __GFP_NORETRY) { |
1153 | *data_mode = DATA_MODE_GET_FREE_PAGES; |
1154 | return (void *)__get_free_pages(gfp_mask, |
1155 | order: c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT)); |
1156 | } |
1157 | |
1158 | *data_mode = DATA_MODE_VMALLOC; |
1159 | |
1160 | return __vmalloc(size: c->block_size, gfp_mask); |
1161 | } |
1162 | |
1163 | /* |
1164 | * Free buffer's data. |
1165 | */ |
1166 | static void free_buffer_data(struct dm_bufio_client *c, |
1167 | void *data, unsigned char data_mode) |
1168 | { |
1169 | switch (data_mode) { |
1170 | case DATA_MODE_SLAB: |
1171 | kmem_cache_free(s: c->slab_cache, objp: data); |
1172 | break; |
1173 | |
1174 | case DATA_MODE_GET_FREE_PAGES: |
1175 | free_pages(addr: (unsigned long)data, |
1176 | order: c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT)); |
1177 | break; |
1178 | |
1179 | case DATA_MODE_VMALLOC: |
1180 | vfree(addr: data); |
1181 | break; |
1182 | |
1183 | default: |
1184 | DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d" , |
1185 | data_mode); |
1186 | BUG(); |
1187 | } |
1188 | } |
1189 | |
1190 | /* |
1191 | * Allocate buffer and its data. |
1192 | */ |
1193 | static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) |
1194 | { |
1195 | struct dm_buffer *b = kmem_cache_alloc(cachep: c->slab_buffer, flags: gfp_mask); |
1196 | |
1197 | if (!b) |
1198 | return NULL; |
1199 | |
1200 | b->c = c; |
1201 | |
1202 | b->data = alloc_buffer_data(c, gfp_mask, data_mode: &b->data_mode); |
1203 | if (!b->data) { |
1204 | kmem_cache_free(s: c->slab_buffer, objp: b); |
1205 | return NULL; |
1206 | } |
1207 | adjust_total_allocated(b, unlink: false); |
1208 | |
1209 | #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING |
1210 | b->stack_len = 0; |
1211 | #endif |
1212 | return b; |
1213 | } |
1214 | |
1215 | /* |
1216 | * Free buffer and its data. |
1217 | */ |
1218 | static void free_buffer(struct dm_buffer *b) |
1219 | { |
1220 | struct dm_bufio_client *c = b->c; |
1221 | |
1222 | adjust_total_allocated(b, unlink: true); |
1223 | free_buffer_data(c, data: b->data, data_mode: b->data_mode); |
1224 | kmem_cache_free(s: c->slab_buffer, objp: b); |
1225 | } |
1226 | |
1227 | /* |
1228 | *-------------------------------------------------------------------------- |
1229 | * Submit I/O on the buffer. |
1230 | * |
1231 | * Bio interface is faster but it has some problems: |
1232 | * the vector list is limited (increasing this limit increases |
1233 | * memory-consumption per buffer, so it is not viable); |
1234 | * |
1235 | * the memory must be direct-mapped, not vmalloced; |
1236 | * |
1237 | * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and |
1238 | * it is not vmalloced, try using the bio interface. |
1239 | * |
1240 | * If the buffer is big, if it is vmalloced or if the underlying device |
1241 | * rejects the bio because it is too large, use dm-io layer to do the I/O. |
1242 | * The dm-io layer splits the I/O into multiple requests, avoiding the above |
1243 | * shortcomings. |
1244 | *-------------------------------------------------------------------------- |
1245 | */ |
1246 | |
1247 | /* |
1248 | * dm-io completion routine. It just calls b->bio.bi_end_io, pretending |
1249 | * that the request was handled directly with bio interface. |
1250 | */ |
1251 | static void dmio_complete(unsigned long error, void *context) |
1252 | { |
1253 | struct dm_buffer *b = context; |
1254 | |
1255 | b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0); |
1256 | } |
1257 | |
1258 | static void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector, |
1259 | unsigned int n_sectors, unsigned int offset) |
1260 | { |
1261 | int r; |
1262 | struct dm_io_request io_req = { |
1263 | .bi_opf = op, |
1264 | .notify.fn = dmio_complete, |
1265 | .notify.context = b, |
1266 | .client = b->c->dm_io, |
1267 | }; |
1268 | struct dm_io_region region = { |
1269 | .bdev = b->c->bdev, |
1270 | .sector = sector, |
1271 | .count = n_sectors, |
1272 | }; |
1273 | |
1274 | if (b->data_mode != DATA_MODE_VMALLOC) { |
1275 | io_req.mem.type = DM_IO_KMEM; |
1276 | io_req.mem.ptr.addr = (char *)b->data + offset; |
1277 | } else { |
1278 | io_req.mem.type = DM_IO_VMA; |
1279 | io_req.mem.ptr.vma = (char *)b->data + offset; |
1280 | } |
1281 | |
1282 | r = dm_io(io_req: &io_req, num_regions: 1, region: ®ion, NULL); |
1283 | if (unlikely(r)) |
1284 | b->end_io(b, errno_to_blk_status(errno: r)); |
1285 | } |
1286 | |
1287 | static void bio_complete(struct bio *bio) |
1288 | { |
1289 | struct dm_buffer *b = bio->bi_private; |
1290 | blk_status_t status = bio->bi_status; |
1291 | |
1292 | bio_uninit(bio); |
1293 | kfree(objp: bio); |
1294 | b->end_io(b, status); |
1295 | } |
1296 | |
1297 | static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, |
1298 | unsigned int n_sectors, unsigned int offset) |
1299 | { |
1300 | struct bio *bio; |
1301 | char *ptr; |
1302 | unsigned int len; |
1303 | |
1304 | bio = bio_kmalloc(nr_vecs: 1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN); |
1305 | if (!bio) { |
1306 | use_dmio(b, op, sector, n_sectors, offset); |
1307 | return; |
1308 | } |
1309 | bio_init(bio, bdev: b->c->bdev, table: bio->bi_inline_vecs, max_vecs: 1, opf: op); |
1310 | bio->bi_iter.bi_sector = sector; |
1311 | bio->bi_end_io = bio_complete; |
1312 | bio->bi_private = b; |
1313 | |
1314 | ptr = (char *)b->data + offset; |
1315 | len = n_sectors << SECTOR_SHIFT; |
1316 | |
1317 | __bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr)); |
1318 | |
1319 | submit_bio(bio); |
1320 | } |
1321 | |
1322 | static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block) |
1323 | { |
1324 | sector_t sector; |
1325 | |
1326 | if (likely(c->sectors_per_block_bits >= 0)) |
1327 | sector = block << c->sectors_per_block_bits; |
1328 | else |
1329 | sector = block * (c->block_size >> SECTOR_SHIFT); |
1330 | sector += c->start; |
1331 | |
1332 | return sector; |
1333 | } |
1334 | |
1335 | static void submit_io(struct dm_buffer *b, enum req_op op, |
1336 | void (*end_io)(struct dm_buffer *, blk_status_t)) |
1337 | { |
1338 | unsigned int n_sectors; |
1339 | sector_t sector; |
1340 | unsigned int offset, end; |
1341 | |
1342 | b->end_io = end_io; |
1343 | |
1344 | sector = block_to_sector(c: b->c, block: b->block); |
1345 | |
1346 | if (op != REQ_OP_WRITE) { |
1347 | n_sectors = b->c->block_size >> SECTOR_SHIFT; |
1348 | offset = 0; |
1349 | } else { |
1350 | if (b->c->write_callback) |
1351 | b->c->write_callback(b); |
1352 | offset = b->write_start; |
1353 | end = b->write_end; |
1354 | offset &= -DM_BUFIO_WRITE_ALIGN; |
1355 | end += DM_BUFIO_WRITE_ALIGN - 1; |
1356 | end &= -DM_BUFIO_WRITE_ALIGN; |
1357 | if (unlikely(end > b->c->block_size)) |
1358 | end = b->c->block_size; |
1359 | |
1360 | sector += offset >> SECTOR_SHIFT; |
1361 | n_sectors = (end - offset) >> SECTOR_SHIFT; |
1362 | } |
1363 | |
1364 | if (b->data_mode != DATA_MODE_VMALLOC) |
1365 | use_bio(b, op, sector, n_sectors, offset); |
1366 | else |
1367 | use_dmio(b, op, sector, n_sectors, offset); |
1368 | } |
1369 | |
1370 | /* |
1371 | *-------------------------------------------------------------- |
1372 | * Writing dirty buffers |
1373 | *-------------------------------------------------------------- |
1374 | */ |
1375 | |
1376 | /* |
1377 | * The endio routine for write. |
1378 | * |
1379 | * Set the error, clear B_WRITING bit and wake anyone who was waiting on |
1380 | * it. |
1381 | */ |
1382 | static void write_endio(struct dm_buffer *b, blk_status_t status) |
1383 | { |
1384 | b->write_error = status; |
1385 | if (unlikely(status)) { |
1386 | struct dm_bufio_client *c = b->c; |
1387 | |
1388 | (void)cmpxchg(&c->async_write_error, 0, |
1389 | blk_status_to_errno(status)); |
1390 | } |
1391 | |
1392 | BUG_ON(!test_bit(B_WRITING, &b->state)); |
1393 | |
1394 | smp_mb__before_atomic(); |
1395 | clear_bit(B_WRITING, addr: &b->state); |
1396 | smp_mb__after_atomic(); |
1397 | |
1398 | wake_up_bit(word: &b->state, B_WRITING); |
1399 | } |
1400 | |
1401 | /* |
1402 | * Initiate a write on a dirty buffer, but don't wait for it. |
1403 | * |
1404 | * - If the buffer is not dirty, exit. |
1405 | * - If there some previous write going on, wait for it to finish (we can't |
1406 | * have two writes on the same buffer simultaneously). |
1407 | * - Submit our write and don't wait on it. We set B_WRITING indicating |
1408 | * that there is a write in progress. |
1409 | */ |
1410 | static void __write_dirty_buffer(struct dm_buffer *b, |
1411 | struct list_head *write_list) |
1412 | { |
1413 | if (!test_bit(B_DIRTY, &b->state)) |
1414 | return; |
1415 | |
1416 | clear_bit(B_DIRTY, addr: &b->state); |
1417 | wait_on_bit_lock_io(word: &b->state, B_WRITING, TASK_UNINTERRUPTIBLE); |
1418 | |
1419 | b->write_start = b->dirty_start; |
1420 | b->write_end = b->dirty_end; |
1421 | |
1422 | if (!write_list) |
1423 | submit_io(b, op: REQ_OP_WRITE, end_io: write_endio); |
1424 | else |
1425 | list_add_tail(new: &b->write_list, head: write_list); |
1426 | } |
1427 | |
1428 | static void __flush_write_list(struct list_head *write_list) |
1429 | { |
1430 | struct blk_plug plug; |
1431 | |
1432 | blk_start_plug(&plug); |
1433 | while (!list_empty(head: write_list)) { |
1434 | struct dm_buffer *b = |
1435 | list_entry(write_list->next, struct dm_buffer, write_list); |
1436 | list_del(entry: &b->write_list); |
1437 | submit_io(b, op: REQ_OP_WRITE, end_io: write_endio); |
1438 | cond_resched(); |
1439 | } |
1440 | blk_finish_plug(&plug); |
1441 | } |
1442 | |
1443 | /* |
1444 | * Wait until any activity on the buffer finishes. Possibly write the |
1445 | * buffer if it is dirty. When this function finishes, there is no I/O |
1446 | * running on the buffer and the buffer is not dirty. |
1447 | */ |
1448 | static void __make_buffer_clean(struct dm_buffer *b) |
1449 | { |
1450 | BUG_ON(atomic_read(&b->hold_count)); |
1451 | |
1452 | /* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */ |
1453 | if (!smp_load_acquire(&b->state)) /* fast case */ |
1454 | return; |
1455 | |
1456 | wait_on_bit_io(word: &b->state, B_READING, TASK_UNINTERRUPTIBLE); |
1457 | __write_dirty_buffer(b, NULL); |
1458 | wait_on_bit_io(word: &b->state, B_WRITING, TASK_UNINTERRUPTIBLE); |
1459 | } |
1460 | |
1461 | static enum evict_result is_clean(struct dm_buffer *b, void *context) |
1462 | { |
1463 | struct dm_bufio_client *c = context; |
1464 | |
1465 | /* These should never happen */ |
1466 | if (WARN_ON_ONCE(test_bit(B_WRITING, &b->state))) |
1467 | return ER_DONT_EVICT; |
1468 | if (WARN_ON_ONCE(test_bit(B_DIRTY, &b->state))) |
1469 | return ER_DONT_EVICT; |
1470 | if (WARN_ON_ONCE(b->list_mode != LIST_CLEAN)) |
1471 | return ER_DONT_EVICT; |
1472 | |
1473 | if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep && |
1474 | unlikely(test_bit(B_READING, &b->state))) |
1475 | return ER_DONT_EVICT; |
1476 | |
1477 | return ER_EVICT; |
1478 | } |
1479 | |
1480 | static enum evict_result is_dirty(struct dm_buffer *b, void *context) |
1481 | { |
1482 | /* These should never happen */ |
1483 | if (WARN_ON_ONCE(test_bit(B_READING, &b->state))) |
1484 | return ER_DONT_EVICT; |
1485 | if (WARN_ON_ONCE(b->list_mode != LIST_DIRTY)) |
1486 | return ER_DONT_EVICT; |
1487 | |
1488 | return ER_EVICT; |
1489 | } |
1490 | |
1491 | /* |
1492 | * Find some buffer that is not held by anybody, clean it, unlink it and |
1493 | * return it. |
1494 | */ |
1495 | static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) |
1496 | { |
1497 | struct dm_buffer *b; |
1498 | |
1499 | b = cache_evict(bc: &c->cache, LIST_CLEAN, pred: is_clean, context: c); |
1500 | if (b) { |
1501 | /* this also waits for pending reads */ |
1502 | __make_buffer_clean(b); |
1503 | return b; |
1504 | } |
1505 | |
1506 | if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) |
1507 | return NULL; |
1508 | |
1509 | b = cache_evict(bc: &c->cache, LIST_DIRTY, pred: is_dirty, NULL); |
1510 | if (b) { |
1511 | __make_buffer_clean(b); |
1512 | return b; |
1513 | } |
1514 | |
1515 | return NULL; |
1516 | } |
1517 | |
1518 | /* |
1519 | * Wait until some other threads free some buffer or release hold count on |
1520 | * some buffer. |
1521 | * |
1522 | * This function is entered with c->lock held, drops it and regains it |
1523 | * before exiting. |
1524 | */ |
1525 | static void __wait_for_free_buffer(struct dm_bufio_client *c) |
1526 | { |
1527 | DECLARE_WAITQUEUE(wait, current); |
1528 | |
1529 | add_wait_queue(wq_head: &c->free_buffer_wait, wq_entry: &wait); |
1530 | set_current_state(TASK_UNINTERRUPTIBLE); |
1531 | dm_bufio_unlock(c); |
1532 | |
1533 | /* |
1534 | * It's possible to miss a wake up event since we don't always |
1535 | * hold c->lock when wake_up is called. So we have a timeout here, |
1536 | * just in case. |
1537 | */ |
1538 | io_schedule_timeout(timeout: 5 * HZ); |
1539 | |
1540 | remove_wait_queue(wq_head: &c->free_buffer_wait, wq_entry: &wait); |
1541 | |
1542 | dm_bufio_lock(c); |
1543 | } |
1544 | |
/*
 * How __bufio_new()/new_read() should treat the requested block.
 */
enum new_flag {
	NF_FRESH,	/* new buffer, no read is issued */
	NF_READ,	/* buffer is read from disk and waited for */
	NF_GET,		/* only return a block that is already cached */
	NF_PREFETCH	/* start a read, never wait, never return the buffer */
};
1551 | |
1552 | /* |
1553 | * Allocate a new buffer. If the allocation is not possible, wait until |
1554 | * some other thread frees a buffer. |
1555 | * |
1556 | * May drop the lock and regain it. |
1557 | */ |
1558 | static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf) |
1559 | { |
1560 | struct dm_buffer *b; |
1561 | bool tried_noio_alloc = false; |
1562 | |
1563 | /* |
1564 | * dm-bufio is resistant to allocation failures (it just keeps |
1565 | * one buffer reserved in cases all the allocations fail). |
1566 | * So set flags to not try too hard: |
1567 | * GFP_NOWAIT: don't wait; if we need to sleep we'll release our |
1568 | * mutex and wait ourselves. |
1569 | * __GFP_NORETRY: don't retry and rather return failure |
1570 | * __GFP_NOMEMALLOC: don't use emergency reserves |
1571 | * __GFP_NOWARN: don't print a warning in case of failure |
1572 | * |
1573 | * For debugging, if we set the cache size to 1, no new buffers will |
1574 | * be allocated. |
1575 | */ |
1576 | while (1) { |
1577 | if (dm_bufio_cache_size_latch != 1) { |
1578 | b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); |
1579 | if (b) |
1580 | return b; |
1581 | } |
1582 | |
1583 | if (nf == NF_PREFETCH) |
1584 | return NULL; |
1585 | |
1586 | if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) { |
1587 | dm_bufio_unlock(c); |
1588 | b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); |
1589 | dm_bufio_lock(c); |
1590 | if (b) |
1591 | return b; |
1592 | tried_noio_alloc = true; |
1593 | } |
1594 | |
1595 | if (!list_empty(head: &c->reserved_buffers)) { |
1596 | b = list_to_buffer(l: c->reserved_buffers.next); |
1597 | list_del(entry: &b->lru.list); |
1598 | c->need_reserved_buffers++; |
1599 | |
1600 | return b; |
1601 | } |
1602 | |
1603 | b = __get_unclaimed_buffer(c); |
1604 | if (b) |
1605 | return b; |
1606 | |
1607 | __wait_for_free_buffer(c); |
1608 | } |
1609 | } |
1610 | |
1611 | static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf) |
1612 | { |
1613 | struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf); |
1614 | |
1615 | if (!b) |
1616 | return NULL; |
1617 | |
1618 | if (c->alloc_callback) |
1619 | c->alloc_callback(b); |
1620 | |
1621 | return b; |
1622 | } |
1623 | |
1624 | /* |
1625 | * Free a buffer and wake other threads waiting for free buffers. |
1626 | */ |
1627 | static void __free_buffer_wake(struct dm_buffer *b) |
1628 | { |
1629 | struct dm_bufio_client *c = b->c; |
1630 | |
1631 | b->block = -1; |
1632 | if (!c->need_reserved_buffers) |
1633 | free_buffer(b); |
1634 | else { |
1635 | list_add(new: &b->lru.list, head: &c->reserved_buffers); |
1636 | c->need_reserved_buffers--; |
1637 | } |
1638 | |
1639 | /* |
1640 | * We hold the bufio lock here, so no one can add entries to the |
1641 | * wait queue anyway. |
1642 | */ |
1643 | if (unlikely(waitqueue_active(&c->free_buffer_wait))) |
1644 | wake_up(&c->free_buffer_wait); |
1645 | } |
1646 | |
1647 | static enum evict_result cleaned(struct dm_buffer *b, void *context) |
1648 | { |
1649 | if (WARN_ON_ONCE(test_bit(B_READING, &b->state))) |
1650 | return ER_DONT_EVICT; /* should never happen */ |
1651 | |
1652 | if (test_bit(B_DIRTY, &b->state) || test_bit(B_WRITING, &b->state)) |
1653 | return ER_DONT_EVICT; |
1654 | else |
1655 | return ER_EVICT; |
1656 | } |
1657 | |
1658 | static void __move_clean_buffers(struct dm_bufio_client *c) |
1659 | { |
1660 | cache_mark_many(bc: &c->cache, LIST_DIRTY, LIST_CLEAN, pred: cleaned, NULL); |
1661 | } |
1662 | |
/*
 * Context handed to write_one() while iterating over dirty buffers.
 */
struct write_context {
	/* non-zero: stop at the first buffer that already has B_WRITING set */
	int no_wait;
	/* passed on to __write_dirty_buffer(); NULL means submit at once */
	struct list_head *write_list;
};
1667 | |
1668 | static enum it_action write_one(struct dm_buffer *b, void *context) |
1669 | { |
1670 | struct write_context *wc = context; |
1671 | |
1672 | if (wc->no_wait && test_bit(B_WRITING, &b->state)) |
1673 | return IT_COMPLETE; |
1674 | |
1675 | __write_dirty_buffer(b, write_list: wc->write_list); |
1676 | return IT_NEXT; |
1677 | } |
1678 | |
1679 | static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait, |
1680 | struct list_head *write_list) |
1681 | { |
1682 | struct write_context wc = {.no_wait = no_wait, .write_list = write_list}; |
1683 | |
1684 | __move_clean_buffers(c); |
1685 | cache_iterate(bc: &c->cache, LIST_DIRTY, fn: write_one, context: &wc); |
1686 | } |
1687 | |
1688 | /* |
1689 | * Check if we're over watermark. |
1690 | * If we are over threshold_buffers, start freeing buffers. |
1691 | * If we're over "limit_buffers", block until we get under the limit. |
1692 | */ |
1693 | static void __check_watermark(struct dm_bufio_client *c, |
1694 | struct list_head *write_list) |
1695 | { |
1696 | if (cache_count(bc: &c->cache, LIST_DIRTY) > |
1697 | cache_count(bc: &c->cache, LIST_CLEAN) * DM_BUFIO_WRITEBACK_RATIO) |
1698 | __write_dirty_buffers_async(c, no_wait: 1, write_list); |
1699 | } |
1700 | |
1701 | /* |
1702 | *-------------------------------------------------------------- |
1703 | * Getting a buffer |
1704 | *-------------------------------------------------------------- |
1705 | */ |
1706 | |
1707 | static void cache_put_and_wake(struct dm_bufio_client *c, struct dm_buffer *b) |
1708 | { |
1709 | /* |
1710 | * Relying on waitqueue_active() is racey, but we sleep |
1711 | * with schedule_timeout anyway. |
1712 | */ |
1713 | if (cache_put(bc: &c->cache, b) && |
1714 | unlikely(waitqueue_active(&c->free_buffer_wait))) |
1715 | wake_up(&c->free_buffer_wait); |
1716 | } |
1717 | |
1718 | /* |
1719 | * This assumes you have already checked the cache to see if the buffer |
1720 | * is already present (it will recheck after dropping the lock for allocation). |
1721 | */ |
1722 | static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, |
1723 | enum new_flag nf, int *need_submit, |
1724 | struct list_head *write_list) |
1725 | { |
1726 | struct dm_buffer *b, *new_b = NULL; |
1727 | |
1728 | *need_submit = 0; |
1729 | |
1730 | /* This can't be called with NF_GET */ |
1731 | if (WARN_ON_ONCE(nf == NF_GET)) |
1732 | return NULL; |
1733 | |
1734 | new_b = __alloc_buffer_wait(c, nf); |
1735 | if (!new_b) |
1736 | return NULL; |
1737 | |
1738 | /* |
1739 | * We've had a period where the mutex was unlocked, so need to |
1740 | * recheck the buffer tree. |
1741 | */ |
1742 | b = cache_get(bc: &c->cache, block); |
1743 | if (b) { |
1744 | __free_buffer_wake(b: new_b); |
1745 | goto found_buffer; |
1746 | } |
1747 | |
1748 | __check_watermark(c, write_list); |
1749 | |
1750 | b = new_b; |
1751 | atomic_set(v: &b->hold_count, i: 1); |
1752 | WRITE_ONCE(b->last_accessed, jiffies); |
1753 | b->block = block; |
1754 | b->read_error = 0; |
1755 | b->write_error = 0; |
1756 | b->list_mode = LIST_CLEAN; |
1757 | |
1758 | if (nf == NF_FRESH) |
1759 | b->state = 0; |
1760 | else { |
1761 | b->state = 1 << B_READING; |
1762 | *need_submit = 1; |
1763 | } |
1764 | |
1765 | /* |
1766 | * We mustn't insert into the cache until the B_READING state |
1767 | * is set. Otherwise another thread could get it and use |
1768 | * it before it had been read. |
1769 | */ |
1770 | cache_insert(bc: &c->cache, b); |
1771 | |
1772 | return b; |
1773 | |
1774 | found_buffer: |
1775 | if (nf == NF_PREFETCH) { |
1776 | cache_put_and_wake(c, b); |
1777 | return NULL; |
1778 | } |
1779 | |
1780 | /* |
1781 | * Note: it is essential that we don't wait for the buffer to be |
1782 | * read if dm_bufio_get function is used. Both dm_bufio_get and |
1783 | * dm_bufio_prefetch can be used in the driver request routine. |
1784 | * If the user called both dm_bufio_prefetch and dm_bufio_get on |
1785 | * the same buffer, it would deadlock if we waited. |
1786 | */ |
1787 | if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) { |
1788 | cache_put_and_wake(c, b); |
1789 | return NULL; |
1790 | } |
1791 | |
1792 | return b; |
1793 | } |
1794 | |
1795 | /* |
1796 | * The endio routine for reading: set the error, clear the bit and wake up |
1797 | * anyone waiting on the buffer. |
1798 | */ |
1799 | static void read_endio(struct dm_buffer *b, blk_status_t status) |
1800 | { |
1801 | b->read_error = status; |
1802 | |
1803 | BUG_ON(!test_bit(B_READING, &b->state)); |
1804 | |
1805 | smp_mb__before_atomic(); |
1806 | clear_bit(B_READING, addr: &b->state); |
1807 | smp_mb__after_atomic(); |
1808 | |
1809 | wake_up_bit(word: &b->state, B_READING); |
1810 | } |
1811 | |
1812 | /* |
1813 | * A common routine for dm_bufio_new and dm_bufio_read. Operation of these |
1814 | * functions is similar except that dm_bufio_new doesn't read the |
1815 | * buffer from the disk (assuming that the caller overwrites all the data |
1816 | * and uses dm_bufio_mark_buffer_dirty to write new data back). |
1817 | */ |
1818 | static void *new_read(struct dm_bufio_client *c, sector_t block, |
1819 | enum new_flag nf, struct dm_buffer **bp) |
1820 | { |
1821 | int need_submit = 0; |
1822 | struct dm_buffer *b; |
1823 | |
1824 | LIST_HEAD(write_list); |
1825 | |
1826 | *bp = NULL; |
1827 | |
1828 | /* |
1829 | * Fast path, hopefully the block is already in the cache. No need |
1830 | * to get the client lock for this. |
1831 | */ |
1832 | b = cache_get(bc: &c->cache, block); |
1833 | if (b) { |
1834 | if (nf == NF_PREFETCH) { |
1835 | cache_put_and_wake(c, b); |
1836 | return NULL; |
1837 | } |
1838 | |
1839 | /* |
1840 | * Note: it is essential that we don't wait for the buffer to be |
1841 | * read if dm_bufio_get function is used. Both dm_bufio_get and |
1842 | * dm_bufio_prefetch can be used in the driver request routine. |
1843 | * If the user called both dm_bufio_prefetch and dm_bufio_get on |
1844 | * the same buffer, it would deadlock if we waited. |
1845 | */ |
1846 | if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) { |
1847 | cache_put_and_wake(c, b); |
1848 | return NULL; |
1849 | } |
1850 | } |
1851 | |
1852 | if (!b) { |
1853 | if (nf == NF_GET) |
1854 | return NULL; |
1855 | |
1856 | dm_bufio_lock(c); |
1857 | b = __bufio_new(c, block, nf, need_submit: &need_submit, write_list: &write_list); |
1858 | dm_bufio_unlock(c); |
1859 | } |
1860 | |
1861 | #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING |
1862 | if (b && (atomic_read(v: &b->hold_count) == 1)) |
1863 | buffer_record_stack(b); |
1864 | #endif |
1865 | |
1866 | __flush_write_list(write_list: &write_list); |
1867 | |
1868 | if (!b) |
1869 | return NULL; |
1870 | |
1871 | if (need_submit) |
1872 | submit_io(b, op: REQ_OP_READ, end_io: read_endio); |
1873 | |
1874 | wait_on_bit_io(word: &b->state, B_READING, TASK_UNINTERRUPTIBLE); |
1875 | |
1876 | if (b->read_error) { |
1877 | int error = blk_status_to_errno(status: b->read_error); |
1878 | |
1879 | dm_bufio_release(b); |
1880 | |
1881 | return ERR_PTR(error); |
1882 | } |
1883 | |
1884 | *bp = b; |
1885 | |
1886 | return b->data; |
1887 | } |
1888 | |
1889 | void *dm_bufio_get(struct dm_bufio_client *c, sector_t block, |
1890 | struct dm_buffer **bp) |
1891 | { |
1892 | return new_read(c, block, nf: NF_GET, bp); |
1893 | } |
1894 | EXPORT_SYMBOL_GPL(dm_bufio_get); |
1895 | |
1896 | void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, |
1897 | struct dm_buffer **bp) |
1898 | { |
1899 | if (WARN_ON_ONCE(dm_bufio_in_request())) |
1900 | return ERR_PTR(error: -EINVAL); |
1901 | |
1902 | return new_read(c, block, nf: NF_READ, bp); |
1903 | } |
1904 | EXPORT_SYMBOL_GPL(dm_bufio_read); |
1905 | |
1906 | void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, |
1907 | struct dm_buffer **bp) |
1908 | { |
1909 | if (WARN_ON_ONCE(dm_bufio_in_request())) |
1910 | return ERR_PTR(error: -EINVAL); |
1911 | |
1912 | return new_read(c, block, nf: NF_FRESH, bp); |
1913 | } |
1914 | EXPORT_SYMBOL_GPL(dm_bufio_new); |
1915 | |
1916 | void dm_bufio_prefetch(struct dm_bufio_client *c, |
1917 | sector_t block, unsigned int n_blocks) |
1918 | { |
1919 | struct blk_plug plug; |
1920 | |
1921 | LIST_HEAD(write_list); |
1922 | |
1923 | if (WARN_ON_ONCE(dm_bufio_in_request())) |
1924 | return; /* should never happen */ |
1925 | |
1926 | blk_start_plug(&plug); |
1927 | |
1928 | for (; n_blocks--; block++) { |
1929 | int need_submit; |
1930 | struct dm_buffer *b; |
1931 | |
1932 | b = cache_get(bc: &c->cache, block); |
1933 | if (b) { |
1934 | /* already in cache */ |
1935 | cache_put_and_wake(c, b); |
1936 | continue; |
1937 | } |
1938 | |
1939 | dm_bufio_lock(c); |
1940 | b = __bufio_new(c, block, nf: NF_PREFETCH, need_submit: &need_submit, |
1941 | write_list: &write_list); |
1942 | if (unlikely(!list_empty(&write_list))) { |
1943 | dm_bufio_unlock(c); |
1944 | blk_finish_plug(&plug); |
1945 | __flush_write_list(write_list: &write_list); |
1946 | blk_start_plug(&plug); |
1947 | dm_bufio_lock(c); |
1948 | } |
1949 | if (unlikely(b != NULL)) { |
1950 | dm_bufio_unlock(c); |
1951 | |
1952 | if (need_submit) |
1953 | submit_io(b, op: REQ_OP_READ, end_io: read_endio); |
1954 | dm_bufio_release(b); |
1955 | |
1956 | cond_resched(); |
1957 | |
1958 | if (!n_blocks) |
1959 | goto flush_plug; |
1960 | dm_bufio_lock(c); |
1961 | } |
1962 | dm_bufio_unlock(c); |
1963 | } |
1964 | |
1965 | flush_plug: |
1966 | blk_finish_plug(&plug); |
1967 | } |
1968 | EXPORT_SYMBOL_GPL(dm_bufio_prefetch); |
1969 | |
1970 | void dm_bufio_release(struct dm_buffer *b) |
1971 | { |
1972 | struct dm_bufio_client *c = b->c; |
1973 | |
1974 | /* |
1975 | * If there were errors on the buffer, and the buffer is not |
1976 | * to be written, free the buffer. There is no point in caching |
1977 | * invalid buffer. |
1978 | */ |
1979 | if ((b->read_error || b->write_error) && |
1980 | !test_bit_acquire(B_READING, &b->state) && |
1981 | !test_bit(B_WRITING, &b->state) && |
1982 | !test_bit(B_DIRTY, &b->state)) { |
1983 | dm_bufio_lock(c); |
1984 | |
1985 | /* cache remove can fail if there are other holders */ |
1986 | if (cache_remove(bc: &c->cache, b)) { |
1987 | __free_buffer_wake(b); |
1988 | dm_bufio_unlock(c); |
1989 | return; |
1990 | } |
1991 | |
1992 | dm_bufio_unlock(c); |
1993 | } |
1994 | |
1995 | cache_put_and_wake(c, b); |
1996 | } |
1997 | EXPORT_SYMBOL_GPL(dm_bufio_release); |
1998 | |
1999 | void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b, |
2000 | unsigned int start, unsigned int end) |
2001 | { |
2002 | struct dm_bufio_client *c = b->c; |
2003 | |
2004 | BUG_ON(start >= end); |
2005 | BUG_ON(end > b->c->block_size); |
2006 | |
2007 | dm_bufio_lock(c); |
2008 | |
2009 | BUG_ON(test_bit(B_READING, &b->state)); |
2010 | |
2011 | if (!test_and_set_bit(B_DIRTY, addr: &b->state)) { |
2012 | b->dirty_start = start; |
2013 | b->dirty_end = end; |
2014 | cache_mark(bc: &c->cache, b, LIST_DIRTY); |
2015 | } else { |
2016 | if (start < b->dirty_start) |
2017 | b->dirty_start = start; |
2018 | if (end > b->dirty_end) |
2019 | b->dirty_end = end; |
2020 | } |
2021 | |
2022 | dm_bufio_unlock(c); |
2023 | } |
2024 | EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty); |
2025 | |
2026 | void dm_bufio_mark_buffer_dirty(struct dm_buffer *b) |
2027 | { |
2028 | dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size); |
2029 | } |
2030 | EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty); |
2031 | |
2032 | void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c) |
2033 | { |
2034 | LIST_HEAD(write_list); |
2035 | |
2036 | if (WARN_ON_ONCE(dm_bufio_in_request())) |
2037 | return; /* should never happen */ |
2038 | |
2039 | dm_bufio_lock(c); |
2040 | __write_dirty_buffers_async(c, no_wait: 0, write_list: &write_list); |
2041 | dm_bufio_unlock(c); |
2042 | __flush_write_list(write_list: &write_list); |
2043 | } |
2044 | EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); |
2045 | |
2046 | /* |
2047 | * For performance, it is essential that the buffers are written asynchronously |
2048 | * and simultaneously (so that the block layer can merge the writes) and then |
2049 | * waited upon. |
2050 | * |
2051 | * Finally, we flush hardware disk cache. |
2052 | */ |
2053 | static bool is_writing(struct lru_entry *e, void *context) |
2054 | { |
2055 | struct dm_buffer *b = le_to_buffer(le: e); |
2056 | |
2057 | return test_bit(B_WRITING, &b->state); |
2058 | } |
2059 | |
2060 | int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) |
2061 | { |
2062 | int a, f; |
2063 | unsigned long nr_buffers; |
2064 | struct lru_entry *e; |
2065 | struct lru_iter it; |
2066 | |
2067 | LIST_HEAD(write_list); |
2068 | |
2069 | dm_bufio_lock(c); |
2070 | __write_dirty_buffers_async(c, no_wait: 0, write_list: &write_list); |
2071 | dm_bufio_unlock(c); |
2072 | __flush_write_list(write_list: &write_list); |
2073 | dm_bufio_lock(c); |
2074 | |
2075 | nr_buffers = cache_count(bc: &c->cache, LIST_DIRTY); |
2076 | lru_iter_begin(lru: &c->cache.lru[LIST_DIRTY], it: &it); |
2077 | while ((e = lru_iter_next(it: &it, pred: is_writing, context: c))) { |
2078 | struct dm_buffer *b = le_to_buffer(le: e); |
2079 | __cache_inc_buffer(b); |
2080 | |
2081 | BUG_ON(test_bit(B_READING, &b->state)); |
2082 | |
2083 | if (nr_buffers) { |
2084 | nr_buffers--; |
2085 | dm_bufio_unlock(c); |
2086 | wait_on_bit_io(word: &b->state, B_WRITING, TASK_UNINTERRUPTIBLE); |
2087 | dm_bufio_lock(c); |
2088 | } else { |
2089 | wait_on_bit_io(word: &b->state, B_WRITING, TASK_UNINTERRUPTIBLE); |
2090 | } |
2091 | |
2092 | if (!test_bit(B_DIRTY, &b->state) && !test_bit(B_WRITING, &b->state)) |
2093 | cache_mark(bc: &c->cache, b, LIST_CLEAN); |
2094 | |
2095 | cache_put_and_wake(c, b); |
2096 | |
2097 | cond_resched(); |
2098 | } |
2099 | lru_iter_end(it: &it); |
2100 | |
2101 | wake_up(&c->free_buffer_wait); |
2102 | dm_bufio_unlock(c); |
2103 | |
2104 | a = xchg(&c->async_write_error, 0); |
2105 | f = dm_bufio_issue_flush(c); |
2106 | if (a) |
2107 | return a; |
2108 | |
2109 | return f; |
2110 | } |
2111 | EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers); |
2112 | |
2113 | /* |
2114 | * Use dm-io to send an empty barrier to flush the device. |
2115 | */ |
2116 | int dm_bufio_issue_flush(struct dm_bufio_client *c) |
2117 | { |
2118 | struct dm_io_request io_req = { |
2119 | .bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC, |
2120 | .mem.type = DM_IO_KMEM, |
2121 | .mem.ptr.addr = NULL, |
2122 | .client = c->dm_io, |
2123 | }; |
2124 | struct dm_io_region io_reg = { |
2125 | .bdev = c->bdev, |
2126 | .sector = 0, |
2127 | .count = 0, |
2128 | }; |
2129 | |
2130 | if (WARN_ON_ONCE(dm_bufio_in_request())) |
2131 | return -EINVAL; |
2132 | |
2133 | return dm_io(io_req: &io_req, num_regions: 1, region: &io_reg, NULL); |
2134 | } |
2135 | EXPORT_SYMBOL_GPL(dm_bufio_issue_flush); |
2136 | |
2137 | /* |
2138 | * Use dm-io to send a discard request to flush the device. |
2139 | */ |
2140 | int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count) |
2141 | { |
2142 | struct dm_io_request io_req = { |
2143 | .bi_opf = REQ_OP_DISCARD | REQ_SYNC, |
2144 | .mem.type = DM_IO_KMEM, |
2145 | .mem.ptr.addr = NULL, |
2146 | .client = c->dm_io, |
2147 | }; |
2148 | struct dm_io_region io_reg = { |
2149 | .bdev = c->bdev, |
2150 | .sector = block_to_sector(c, block), |
2151 | .count = block_to_sector(c, block: count), |
2152 | }; |
2153 | |
2154 | if (WARN_ON_ONCE(dm_bufio_in_request())) |
2155 | return -EINVAL; /* discards are optional */ |
2156 | |
2157 | return dm_io(io_req: &io_req, num_regions: 1, region: &io_reg, NULL); |
2158 | } |
2159 | EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); |
2160 | |
2161 | static bool forget_buffer(struct dm_bufio_client *c, sector_t block) |
2162 | { |
2163 | struct dm_buffer *b; |
2164 | |
2165 | b = cache_get(bc: &c->cache, block); |
2166 | if (b) { |
2167 | if (likely(!smp_load_acquire(&b->state))) { |
2168 | if (cache_remove(bc: &c->cache, b)) |
2169 | __free_buffer_wake(b); |
2170 | else |
2171 | cache_put_and_wake(c, b); |
2172 | } else { |
2173 | cache_put_and_wake(c, b); |
2174 | } |
2175 | } |
2176 | |
2177 | return b ? true : false; |
2178 | } |
2179 | |
2180 | /* |
2181 | * Free the given buffer. |
2182 | * |
2183 | * This is just a hint, if the buffer is in use or dirty, this function |
2184 | * does nothing. |
2185 | */ |
2186 | void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) |
2187 | { |
2188 | dm_bufio_lock(c); |
2189 | forget_buffer(c, block); |
2190 | dm_bufio_unlock(c); |
2191 | } |
2192 | EXPORT_SYMBOL_GPL(dm_bufio_forget); |
2193 | |
2194 | static enum evict_result idle(struct dm_buffer *b, void *context) |
2195 | { |
2196 | return b->state ? ER_DONT_EVICT : ER_EVICT; |
2197 | } |
2198 | |
2199 | void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks) |
2200 | { |
2201 | dm_bufio_lock(c); |
2202 | cache_remove_range(bc: &c->cache, begin: block, end: block + n_blocks, pred: idle, release: __free_buffer_wake); |
2203 | dm_bufio_unlock(c); |
2204 | } |
2205 | EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers); |
2206 | |
2207 | void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned int n) |
2208 | { |
2209 | c->minimum_buffers = n; |
2210 | } |
2211 | EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers); |
2212 | |
2213 | unsigned int dm_bufio_get_block_size(struct dm_bufio_client *c) |
2214 | { |
2215 | return c->block_size; |
2216 | } |
2217 | EXPORT_SYMBOL_GPL(dm_bufio_get_block_size); |
2218 | |
2219 | sector_t dm_bufio_get_device_size(struct dm_bufio_client *c) |
2220 | { |
2221 | sector_t s = bdev_nr_sectors(bdev: c->bdev); |
2222 | |
2223 | if (s >= c->start) |
2224 | s -= c->start; |
2225 | else |
2226 | s = 0; |
2227 | if (likely(c->sectors_per_block_bits >= 0)) |
2228 | s >>= c->sectors_per_block_bits; |
2229 | else |
2230 | sector_div(s, c->block_size >> SECTOR_SHIFT); |
2231 | return s; |
2232 | } |
2233 | EXPORT_SYMBOL_GPL(dm_bufio_get_device_size); |
2234 | |
2235 | struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c) |
2236 | { |
2237 | return c->dm_io; |
2238 | } |
2239 | EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client); |
2240 | |
2241 | sector_t dm_bufio_get_block_number(struct dm_buffer *b) |
2242 | { |
2243 | return b->block; |
2244 | } |
2245 | EXPORT_SYMBOL_GPL(dm_bufio_get_block_number); |
2246 | |
2247 | void *dm_bufio_get_block_data(struct dm_buffer *b) |
2248 | { |
2249 | return b->data; |
2250 | } |
2251 | EXPORT_SYMBOL_GPL(dm_bufio_get_block_data); |
2252 | |
2253 | void *dm_bufio_get_aux_data(struct dm_buffer *b) |
2254 | { |
2255 | return b + 1; |
2256 | } |
2257 | EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data); |
2258 | |
2259 | struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b) |
2260 | { |
2261 | return b->c; |
2262 | } |
2263 | EXPORT_SYMBOL_GPL(dm_bufio_get_client); |
2264 | |
2265 | static enum it_action warn_leak(struct dm_buffer *b, void *context) |
2266 | { |
2267 | bool *warned = context; |
2268 | |
2269 | WARN_ON(!(*warned)); |
2270 | *warned = true; |
2271 | DMERR("leaked buffer %llx, hold count %u, list %d" , |
2272 | (unsigned long long)b->block, atomic_read(&b->hold_count), b->list_mode); |
2273 | #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING |
2274 | stack_trace_print(trace: b->stack_entries, nr_entries: b->stack_len, spaces: 1); |
2275 | /* mark unclaimed to avoid WARN_ON at end of drop_buffers() */ |
2276 | atomic_set(v: &b->hold_count, i: 0); |
2277 | #endif |
2278 | return IT_NEXT; |
2279 | } |
2280 | |
2281 | static void drop_buffers(struct dm_bufio_client *c) |
2282 | { |
2283 | int i; |
2284 | struct dm_buffer *b; |
2285 | |
2286 | if (WARN_ON(dm_bufio_in_request())) |
2287 | return; /* should never happen */ |
2288 | |
2289 | /* |
2290 | * An optimization so that the buffers are not written one-by-one. |
2291 | */ |
2292 | dm_bufio_write_dirty_buffers_async(c); |
2293 | |
2294 | dm_bufio_lock(c); |
2295 | |
2296 | while ((b = __get_unclaimed_buffer(c))) |
2297 | __free_buffer_wake(b); |
2298 | |
2299 | for (i = 0; i < LIST_SIZE; i++) { |
2300 | bool warned = false; |
2301 | |
2302 | cache_iterate(bc: &c->cache, list_mode: i, fn: warn_leak, context: &warned); |
2303 | } |
2304 | |
2305 | #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING |
2306 | while ((b = __get_unclaimed_buffer(c))) |
2307 | __free_buffer_wake(b); |
2308 | #endif |
2309 | |
2310 | for (i = 0; i < LIST_SIZE; i++) |
2311 | WARN_ON(cache_count(&c->cache, i)); |
2312 | |
2313 | dm_bufio_unlock(c); |
2314 | } |
2315 | |
2316 | static unsigned long get_retain_buffers(struct dm_bufio_client *c) |
2317 | { |
2318 | unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes); |
2319 | |
2320 | if (likely(c->sectors_per_block_bits >= 0)) |
2321 | retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT; |
2322 | else |
2323 | retain_bytes /= c->block_size; |
2324 | |
2325 | return retain_bytes; |
2326 | } |
2327 | |
2328 | static void __scan(struct dm_bufio_client *c) |
2329 | { |
2330 | int l; |
2331 | struct dm_buffer *b; |
2332 | unsigned long freed = 0; |
2333 | unsigned long retain_target = get_retain_buffers(c); |
2334 | unsigned long count = cache_total(bc: &c->cache); |
2335 | |
2336 | for (l = 0; l < LIST_SIZE; l++) { |
2337 | while (true) { |
2338 | if (count - freed <= retain_target) |
2339 | atomic_long_set(v: &c->need_shrink, i: 0); |
2340 | if (!atomic_long_read(v: &c->need_shrink)) |
2341 | break; |
2342 | |
2343 | b = cache_evict(bc: &c->cache, list_mode: l, |
2344 | pred: l == LIST_CLEAN ? is_clean : is_dirty, context: c); |
2345 | if (!b) |
2346 | break; |
2347 | |
2348 | __make_buffer_clean(b); |
2349 | __free_buffer_wake(b); |
2350 | |
2351 | atomic_long_dec(v: &c->need_shrink); |
2352 | freed++; |
2353 | cond_resched(); |
2354 | } |
2355 | } |
2356 | } |
2357 | |
2358 | static void shrink_work(struct work_struct *w) |
2359 | { |
2360 | struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work); |
2361 | |
2362 | dm_bufio_lock(c); |
2363 | __scan(c); |
2364 | dm_bufio_unlock(c); |
2365 | } |
2366 | |
2367 | static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) |
2368 | { |
2369 | struct dm_bufio_client *c; |
2370 | |
2371 | c = shrink->private_data; |
2372 | atomic_long_add(i: sc->nr_to_scan, v: &c->need_shrink); |
2373 | queue_work(wq: dm_bufio_wq, work: &c->shrink_work); |
2374 | |
2375 | return sc->nr_to_scan; |
2376 | } |
2377 | |
2378 | static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) |
2379 | { |
2380 | struct dm_bufio_client *c = shrink->private_data; |
2381 | unsigned long count = cache_total(bc: &c->cache); |
2382 | unsigned long retain_target = get_retain_buffers(c); |
2383 | unsigned long queued_for_cleanup = atomic_long_read(v: &c->need_shrink); |
2384 | |
2385 | if (unlikely(count < retain_target)) |
2386 | count = 0; |
2387 | else |
2388 | count -= retain_target; |
2389 | |
2390 | if (unlikely(count < queued_for_cleanup)) |
2391 | count = 0; |
2392 | else |
2393 | count -= queued_for_cleanup; |
2394 | |
2395 | return count; |
2396 | } |
2397 | |
2398 | /* |
2399 | * Create the buffering interface |
2400 | */ |
2401 | struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned int block_size, |
2402 | unsigned int reserved_buffers, unsigned int aux_size, |
2403 | void (*alloc_callback)(struct dm_buffer *), |
2404 | void (*write_callback)(struct dm_buffer *), |
2405 | unsigned int flags) |
2406 | { |
2407 | int r; |
2408 | unsigned int num_locks; |
2409 | struct dm_bufio_client *c; |
2410 | char slab_name[27]; |
2411 | |
2412 | if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) { |
2413 | DMERR("%s: block size not specified or is not multiple of 512b" , __func__); |
2414 | r = -EINVAL; |
2415 | goto bad_client; |
2416 | } |
2417 | |
2418 | num_locks = dm_num_hash_locks(); |
2419 | c = kzalloc(size: sizeof(*c) + (num_locks * sizeof(struct buffer_tree)), GFP_KERNEL); |
2420 | if (!c) { |
2421 | r = -ENOMEM; |
2422 | goto bad_client; |
2423 | } |
2424 | cache_init(bc: &c->cache, num_locks); |
2425 | |
2426 | c->bdev = bdev; |
2427 | c->block_size = block_size; |
2428 | if (is_power_of_2(n: block_size)) |
2429 | c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT; |
2430 | else |
2431 | c->sectors_per_block_bits = -1; |
2432 | |
2433 | c->alloc_callback = alloc_callback; |
2434 | c->write_callback = write_callback; |
2435 | |
2436 | if (flags & DM_BUFIO_CLIENT_NO_SLEEP) { |
2437 | c->no_sleep = true; |
2438 | static_branch_inc(&no_sleep_enabled); |
2439 | } |
2440 | |
2441 | mutex_init(&c->lock); |
2442 | spin_lock_init(&c->spinlock); |
2443 | INIT_LIST_HEAD(list: &c->reserved_buffers); |
2444 | c->need_reserved_buffers = reserved_buffers; |
2445 | |
2446 | dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS); |
2447 | |
2448 | init_waitqueue_head(&c->free_buffer_wait); |
2449 | c->async_write_error = 0; |
2450 | |
2451 | c->dm_io = dm_io_client_create(); |
2452 | if (IS_ERR(ptr: c->dm_io)) { |
2453 | r = PTR_ERR(ptr: c->dm_io); |
2454 | goto bad_dm_io; |
2455 | } |
2456 | |
2457 | if (block_size <= KMALLOC_MAX_SIZE && |
2458 | (block_size < PAGE_SIZE || !is_power_of_2(n: block_size))) { |
2459 | unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE); |
2460 | |
2461 | snprintf(buf: slab_name, size: sizeof(slab_name), fmt: "dm_bufio_cache-%u" , block_size); |
2462 | c->slab_cache = kmem_cache_create(name: slab_name, size: block_size, align, |
2463 | SLAB_RECLAIM_ACCOUNT, NULL); |
2464 | if (!c->slab_cache) { |
2465 | r = -ENOMEM; |
2466 | goto bad; |
2467 | } |
2468 | } |
2469 | if (aux_size) |
2470 | snprintf(buf: slab_name, size: sizeof(slab_name), fmt: "dm_bufio_buffer-%u" , aux_size); |
2471 | else |
2472 | snprintf(buf: slab_name, size: sizeof(slab_name), fmt: "dm_bufio_buffer" ); |
2473 | c->slab_buffer = kmem_cache_create(name: slab_name, size: sizeof(struct dm_buffer) + aux_size, |
2474 | align: 0, SLAB_RECLAIM_ACCOUNT, NULL); |
2475 | if (!c->slab_buffer) { |
2476 | r = -ENOMEM; |
2477 | goto bad; |
2478 | } |
2479 | |
2480 | while (c->need_reserved_buffers) { |
2481 | struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL); |
2482 | |
2483 | if (!b) { |
2484 | r = -ENOMEM; |
2485 | goto bad; |
2486 | } |
2487 | __free_buffer_wake(b); |
2488 | } |
2489 | |
2490 | INIT_WORK(&c->shrink_work, shrink_work); |
2491 | atomic_long_set(v: &c->need_shrink, i: 0); |
2492 | |
2493 | c->shrinker = shrinker_alloc(flags: 0, fmt: "dm-bufio:(%u:%u)" , |
2494 | MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); |
2495 | if (!c->shrinker) { |
2496 | r = -ENOMEM; |
2497 | goto bad; |
2498 | } |
2499 | |
2500 | c->shrinker->count_objects = dm_bufio_shrink_count; |
2501 | c->shrinker->scan_objects = dm_bufio_shrink_scan; |
2502 | c->shrinker->seeks = 1; |
2503 | c->shrinker->batch = 0; |
2504 | c->shrinker->private_data = c; |
2505 | |
2506 | shrinker_register(shrinker: c->shrinker); |
2507 | |
2508 | mutex_lock(&dm_bufio_clients_lock); |
2509 | dm_bufio_client_count++; |
2510 | list_add(new: &c->client_list, head: &dm_bufio_all_clients); |
2511 | __cache_size_refresh(); |
2512 | mutex_unlock(lock: &dm_bufio_clients_lock); |
2513 | |
2514 | return c; |
2515 | |
2516 | bad: |
2517 | while (!list_empty(head: &c->reserved_buffers)) { |
2518 | struct dm_buffer *b = list_to_buffer(l: c->reserved_buffers.next); |
2519 | |
2520 | list_del(entry: &b->lru.list); |
2521 | free_buffer(b); |
2522 | } |
2523 | kmem_cache_destroy(s: c->slab_cache); |
2524 | kmem_cache_destroy(s: c->slab_buffer); |
2525 | dm_io_client_destroy(client: c->dm_io); |
2526 | bad_dm_io: |
2527 | mutex_destroy(lock: &c->lock); |
2528 | if (c->no_sleep) |
2529 | static_branch_dec(&no_sleep_enabled); |
2530 | kfree(objp: c); |
2531 | bad_client: |
2532 | return ERR_PTR(error: r); |
2533 | } |
2534 | EXPORT_SYMBOL_GPL(dm_bufio_client_create); |
2535 | |
2536 | /* |
2537 | * Free the buffering interface. |
2538 | * It is required that there are no references on any buffers. |
2539 | */ |
2540 | void dm_bufio_client_destroy(struct dm_bufio_client *c) |
2541 | { |
2542 | unsigned int i; |
2543 | |
2544 | drop_buffers(c); |
2545 | |
2546 | shrinker_free(shrinker: c->shrinker); |
2547 | flush_work(work: &c->shrink_work); |
2548 | |
2549 | mutex_lock(&dm_bufio_clients_lock); |
2550 | |
2551 | list_del(entry: &c->client_list); |
2552 | dm_bufio_client_count--; |
2553 | __cache_size_refresh(); |
2554 | |
2555 | mutex_unlock(lock: &dm_bufio_clients_lock); |
2556 | |
2557 | WARN_ON(c->need_reserved_buffers); |
2558 | |
2559 | while (!list_empty(head: &c->reserved_buffers)) { |
2560 | struct dm_buffer *b = list_to_buffer(l: c->reserved_buffers.next); |
2561 | |
2562 | list_del(entry: &b->lru.list); |
2563 | free_buffer(b); |
2564 | } |
2565 | |
2566 | for (i = 0; i < LIST_SIZE; i++) |
2567 | if (cache_count(bc: &c->cache, list_mode: i)) |
2568 | DMERR("leaked buffer count %d: %lu" , i, cache_count(&c->cache, i)); |
2569 | |
2570 | for (i = 0; i < LIST_SIZE; i++) |
2571 | WARN_ON(cache_count(&c->cache, i)); |
2572 | |
2573 | cache_destroy(bc: &c->cache); |
2574 | kmem_cache_destroy(s: c->slab_cache); |
2575 | kmem_cache_destroy(s: c->slab_buffer); |
2576 | dm_io_client_destroy(client: c->dm_io); |
2577 | mutex_destroy(lock: &c->lock); |
2578 | if (c->no_sleep) |
2579 | static_branch_dec(&no_sleep_enabled); |
2580 | kfree(objp: c); |
2581 | } |
2582 | EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); |
2583 | |
2584 | void dm_bufio_client_reset(struct dm_bufio_client *c) |
2585 | { |
2586 | drop_buffers(c); |
2587 | flush_work(work: &c->shrink_work); |
2588 | } |
2589 | EXPORT_SYMBOL_GPL(dm_bufio_client_reset); |
2590 | |
2591 | void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start) |
2592 | { |
2593 | c->start = start; |
2594 | } |
2595 | EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset); |
2596 | |
2597 | /*--------------------------------------------------------------*/ |
2598 | |
2599 | static unsigned int get_max_age_hz(void) |
2600 | { |
2601 | unsigned int max_age = READ_ONCE(dm_bufio_max_age); |
2602 | |
2603 | if (max_age > UINT_MAX / HZ) |
2604 | max_age = UINT_MAX / HZ; |
2605 | |
2606 | return max_age * HZ; |
2607 | } |
2608 | |
2609 | static bool older_than(struct dm_buffer *b, unsigned long age_hz) |
2610 | { |
2611 | return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz); |
2612 | } |
2613 | |
/*
 * Parameters handed to select_for_evict() through the eviction predicate's
 * context pointer, and updated by __evict_many().
 */
struct evict_params {
	/* allocation context; select_for_evict() only tests __GFP_FS in it */
	gfp_t gfp;
	/* minimum age, in jiffies, a buffer must have reached to be evicted */
	unsigned long age_hz;

	/*
	 * This gets updated with the largest last_accessed (ie. most
	 * recently used) of the evicted buffers. It will not be reinitialised
	 * by __evict_many(), so you can use it across multiple invocations.
	 *
	 * NOTE(review): __evict_many() overwrites this field only when the
	 * stored value is at or after the evicted buffer's last_accessed, so
	 * it appears to track the earliest timestamp seen rather than the
	 * largest - confirm which of the two is intended.
	 */
	unsigned long last_accessed;
};
2625 | |
2626 | /* |
2627 | * We may not be able to evict this buffer if IO pending or the client |
2628 | * is still using it. |
2629 | * |
2630 | * And if GFP_NOFS is used, we must not do any I/O because we hold |
2631 | * dm_bufio_clients_lock and we would risk deadlock if the I/O gets |
2632 | * rerouted to different bufio client. |
2633 | */ |
2634 | static enum evict_result select_for_evict(struct dm_buffer *b, void *context) |
2635 | { |
2636 | struct evict_params *params = context; |
2637 | |
2638 | if (!(params->gfp & __GFP_FS) || |
2639 | (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) { |
2640 | if (test_bit_acquire(B_READING, &b->state) || |
2641 | test_bit(B_WRITING, &b->state) || |
2642 | test_bit(B_DIRTY, &b->state)) |
2643 | return ER_DONT_EVICT; |
2644 | } |
2645 | |
2646 | return older_than(b, age_hz: params->age_hz) ? ER_EVICT : ER_STOP; |
2647 | } |
2648 | |
2649 | static unsigned long __evict_many(struct dm_bufio_client *c, |
2650 | struct evict_params *params, |
2651 | int list_mode, unsigned long max_count) |
2652 | { |
2653 | unsigned long count; |
2654 | unsigned long last_accessed; |
2655 | struct dm_buffer *b; |
2656 | |
2657 | for (count = 0; count < max_count; count++) { |
2658 | b = cache_evict(bc: &c->cache, list_mode, pred: select_for_evict, context: params); |
2659 | if (!b) |
2660 | break; |
2661 | |
2662 | last_accessed = READ_ONCE(b->last_accessed); |
2663 | if (time_after_eq(params->last_accessed, last_accessed)) |
2664 | params->last_accessed = last_accessed; |
2665 | |
2666 | __make_buffer_clean(b); |
2667 | __free_buffer_wake(b); |
2668 | |
2669 | cond_resched(); |
2670 | } |
2671 | |
2672 | return count; |
2673 | } |
2674 | |
2675 | static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) |
2676 | { |
2677 | struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0}; |
2678 | unsigned long retain = get_retain_buffers(c); |
2679 | unsigned long count; |
2680 | LIST_HEAD(write_list); |
2681 | |
2682 | dm_bufio_lock(c); |
2683 | |
2684 | __check_watermark(c, write_list: &write_list); |
2685 | if (unlikely(!list_empty(&write_list))) { |
2686 | dm_bufio_unlock(c); |
2687 | __flush_write_list(write_list: &write_list); |
2688 | dm_bufio_lock(c); |
2689 | } |
2690 | |
2691 | count = cache_total(bc: &c->cache); |
2692 | if (count > retain) |
2693 | __evict_many(c, params: ¶ms, LIST_CLEAN, max_count: count - retain); |
2694 | |
2695 | dm_bufio_unlock(c); |
2696 | } |
2697 | |
2698 | static void cleanup_old_buffers(void) |
2699 | { |
2700 | unsigned long max_age_hz = get_max_age_hz(); |
2701 | struct dm_bufio_client *c; |
2702 | |
2703 | mutex_lock(&dm_bufio_clients_lock); |
2704 | |
2705 | __cache_size_refresh(); |
2706 | |
2707 | list_for_each_entry(c, &dm_bufio_all_clients, client_list) |
2708 | evict_old_buffers(c, age_hz: max_age_hz); |
2709 | |
2710 | mutex_unlock(lock: &dm_bufio_clients_lock); |
2711 | } |
2712 | |
2713 | static void work_fn(struct work_struct *w) |
2714 | { |
2715 | cleanup_old_buffers(); |
2716 | |
2717 | queue_delayed_work(wq: dm_bufio_wq, dwork: &dm_bufio_cleanup_old_work, |
2718 | DM_BUFIO_WORK_TIMER_SECS * HZ); |
2719 | } |
2720 | |
2721 | /*--------------------------------------------------------------*/ |
2722 | |
2723 | /* |
2724 | * Global cleanup tries to evict the oldest buffers from across _all_ |
2725 | * the clients. It does this by repeatedly evicting a few buffers from |
2726 | * the client that holds the oldest buffer. It's approximate, but hopefully |
2727 | * good enough. |
2728 | */ |
2729 | static struct dm_bufio_client *__pop_client(void) |
2730 | { |
2731 | struct list_head *h; |
2732 | |
2733 | if (list_empty(head: &dm_bufio_all_clients)) |
2734 | return NULL; |
2735 | |
2736 | h = dm_bufio_all_clients.next; |
2737 | list_del(entry: h); |
2738 | return container_of(h, struct dm_bufio_client, client_list); |
2739 | } |
2740 | |
2741 | /* |
2742 | * Inserts the client in the global client list based on its |
2743 | * 'oldest_buffer' field. |
2744 | */ |
2745 | static void __insert_client(struct dm_bufio_client *new_client) |
2746 | { |
2747 | struct dm_bufio_client *c; |
2748 | struct list_head *h = dm_bufio_all_clients.next; |
2749 | |
2750 | while (h != &dm_bufio_all_clients) { |
2751 | c = container_of(h, struct dm_bufio_client, client_list); |
2752 | if (time_after_eq(c->oldest_buffer, new_client->oldest_buffer)) |
2753 | break; |
2754 | h = h->next; |
2755 | } |
2756 | |
2757 | list_add_tail(new: &new_client->client_list, head: h); |
2758 | } |
2759 | |
2760 | static unsigned long __evict_a_few(unsigned long nr_buffers) |
2761 | { |
2762 | unsigned long count; |
2763 | struct dm_bufio_client *c; |
2764 | struct evict_params params = { |
2765 | .gfp = GFP_KERNEL, |
2766 | .age_hz = 0, |
2767 | /* set to jiffies in case there are no buffers in this client */ |
2768 | .last_accessed = jiffies |
2769 | }; |
2770 | |
2771 | c = __pop_client(); |
2772 | if (!c) |
2773 | return 0; |
2774 | |
2775 | dm_bufio_lock(c); |
2776 | count = __evict_many(c, params: ¶ms, LIST_CLEAN, max_count: nr_buffers); |
2777 | dm_bufio_unlock(c); |
2778 | |
2779 | if (count) |
2780 | c->oldest_buffer = params.last_accessed; |
2781 | __insert_client(new_client: c); |
2782 | |
2783 | return count; |
2784 | } |
2785 | |
2786 | static void check_watermarks(void) |
2787 | { |
2788 | LIST_HEAD(write_list); |
2789 | struct dm_bufio_client *c; |
2790 | |
2791 | mutex_lock(&dm_bufio_clients_lock); |
2792 | list_for_each_entry(c, &dm_bufio_all_clients, client_list) { |
2793 | dm_bufio_lock(c); |
2794 | __check_watermark(c, write_list: &write_list); |
2795 | dm_bufio_unlock(c); |
2796 | } |
2797 | mutex_unlock(lock: &dm_bufio_clients_lock); |
2798 | |
2799 | __flush_write_list(write_list: &write_list); |
2800 | } |
2801 | |
2802 | static void evict_old(void) |
2803 | { |
2804 | unsigned long threshold = dm_bufio_cache_size - |
2805 | dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO; |
2806 | |
2807 | mutex_lock(&dm_bufio_clients_lock); |
2808 | while (dm_bufio_current_allocated > threshold) { |
2809 | if (!__evict_a_few(nr_buffers: 64)) |
2810 | break; |
2811 | cond_resched(); |
2812 | } |
2813 | mutex_unlock(lock: &dm_bufio_clients_lock); |
2814 | } |
2815 | |
/*
 * Work item handler for global cleanup: check the watermarks of all
 * clients first, then evict old buffers.
 */
static void do_global_cleanup(struct work_struct *w)
{
	check_watermarks();

	evict_old();
}
2821 | |
2822 | /* |
2823 | *-------------------------------------------------------------- |
2824 | * Module setup |
2825 | *-------------------------------------------------------------- |
2826 | */ |
2827 | |
2828 | /* |
2829 | * This is called only once for the whole dm_bufio module. |
2830 | * It initializes memory limit. |
2831 | */ |
2832 | static int __init dm_bufio_init(void) |
2833 | { |
2834 | __u64 mem; |
2835 | |
2836 | dm_bufio_allocated_kmem_cache = 0; |
2837 | dm_bufio_allocated_get_free_pages = 0; |
2838 | dm_bufio_allocated_vmalloc = 0; |
2839 | dm_bufio_current_allocated = 0; |
2840 | |
2841 | mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(), |
2842 | DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT; |
2843 | |
2844 | if (mem > ULONG_MAX) |
2845 | mem = ULONG_MAX; |
2846 | |
2847 | #ifdef CONFIG_MMU |
2848 | if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100)) |
2849 | mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100); |
2850 | #endif |
2851 | |
2852 | dm_bufio_default_cache_size = mem; |
2853 | |
2854 | mutex_lock(&dm_bufio_clients_lock); |
2855 | __cache_size_refresh(); |
2856 | mutex_unlock(lock: &dm_bufio_clients_lock); |
2857 | |
2858 | dm_bufio_wq = alloc_workqueue(fmt: "dm_bufio_cache" , flags: WQ_MEM_RECLAIM, max_active: 0); |
2859 | if (!dm_bufio_wq) |
2860 | return -ENOMEM; |
2861 | |
2862 | INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn); |
2863 | INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup); |
2864 | queue_delayed_work(wq: dm_bufio_wq, dwork: &dm_bufio_cleanup_old_work, |
2865 | DM_BUFIO_WORK_TIMER_SECS * HZ); |
2866 | |
2867 | return 0; |
2868 | } |
2869 | |
2870 | /* |
2871 | * This is called once when unloading the dm_bufio module. |
2872 | */ |
2873 | static void __exit dm_bufio_exit(void) |
2874 | { |
2875 | int bug = 0; |
2876 | |
2877 | cancel_delayed_work_sync(dwork: &dm_bufio_cleanup_old_work); |
2878 | destroy_workqueue(wq: dm_bufio_wq); |
2879 | |
2880 | if (dm_bufio_client_count) { |
2881 | DMCRIT("%s: dm_bufio_client_count leaked: %d" , |
2882 | __func__, dm_bufio_client_count); |
2883 | bug = 1; |
2884 | } |
2885 | |
2886 | if (dm_bufio_current_allocated) { |
2887 | DMCRIT("%s: dm_bufio_current_allocated leaked: %lu" , |
2888 | __func__, dm_bufio_current_allocated); |
2889 | bug = 1; |
2890 | } |
2891 | |
2892 | if (dm_bufio_allocated_get_free_pages) { |
2893 | DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu" , |
2894 | __func__, dm_bufio_allocated_get_free_pages); |
2895 | bug = 1; |
2896 | } |
2897 | |
2898 | if (dm_bufio_allocated_vmalloc) { |
2899 | DMCRIT("%s: dm_bufio_vmalloc leaked: %lu" , |
2900 | __func__, dm_bufio_allocated_vmalloc); |
2901 | bug = 1; |
2902 | } |
2903 | |
2904 | WARN_ON(bug); /* leaks are not worth crashing the system */ |
2905 | } |
2906 | |
/* Module entry and exit points. */
module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

/* Tunables, registered with mode 0644. */
module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

/* Read by get_max_age_hz(), which converts it to jiffies. */
module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

/* Read by get_retain_buffers(), which converts it to a buffer count. */
module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644);
MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, 0644);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

/* Allocation counters, registered with mode 0444 (read-only). */
module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, 0444);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, 0444);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, 0444);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, 0444);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

/* Module metadata. */
MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");
2937 | |