1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * background writeback - scan btree for dirty data and write it to the backing |
4 | * device |
5 | * |
6 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> |
7 | * Copyright 2012 Google, Inc. |
8 | */ |
9 | |
10 | #include "bcache.h" |
11 | #include "btree.h" |
12 | #include "debug.h" |
13 | #include "writeback.h" |
14 | |
15 | #include <linux/delay.h> |
16 | #include <linux/kthread.h> |
17 | #include <linux/sched/clock.h> |
18 | #include <trace/events/bcache.h> |
19 | |
20 | static void update_gc_after_writeback(struct cache_set *c) |
21 | { |
22 | if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) || |
23 | c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD) |
24 | return; |
25 | |
26 | c->gc_after_writeback |= BCH_DO_AUTO_GC; |
27 | } |
28 | |
29 | /* Rate limiting */ |
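/*
 * Worked example (illustrative, not from the original source): with
 * cache_sectors = 1,000,000, writeback_percent = 10 and a backing
 * device that accounts for 20% of c->cached_dev_sectors,
 * cache_dirty_target = div_u64(1,000,000 * 10, 100) = 100,000 sectors,
 * and the returned per-device target is 100,000 * 20% = 20,000 dirty
 * sectors (the WRITEBACK_SHARE_SHIFT scaling of bdev_share cancels out
 * in the final shift).
 */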
static uint64_t __calc_target_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;

	/*
	 * This is the size of the cache, minus the amount used for
	 * flash-only devices
	 */
	uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size -
				atomic_long_read(&c->flash_dev_dirty_sectors);

	/*
	 * Unfortunately there is no control of global dirty data. If the
	 * user states that they want 10% dirty data in the cache, and has,
	 * e.g., 5 backing volumes of equal size, we try and ensure each
	 * backing volume uses about 2% of the cache for dirty data.
	 */
	uint32_t bdev_share =
		div64_u64(bdev_nr_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
			  c->cached_dev_sectors);

	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	/* Ensure each backing dev gets at least one dirty share */
	if (bdev_share < 1)
		bdev_share = 1;

	return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
}
60 | |
61 | static void __update_writeback_rate(struct cached_dev *dc) |
62 | { |
63 | /* |
64 | * PI controller: |
65 | * Figures out the amount that should be written per second. |
66 | * |
67 | * First, the error (number of sectors that are dirty beyond our |
68 | * target) is calculated. The error is accumulated (numerically |
69 | * integrated). |
70 | * |
71 | * Then, the proportional value and integral value are scaled |
72 | * based on configured values. These are stored as inverses to |
73 | * avoid fixed point math and to make configuration easy-- e.g. |
74 | * the default value of 40 for writeback_rate_p_term_inverse |
75 | * attempts to write at a rate that would retire all the dirty |
76 | * blocks in 40 seconds. |
77 | * |
78 | * The writeback_rate_i_inverse value of 10000 means that 1/10000th |
79 | * of the error is accumulated in the integral term per second. |
80 | * This acts as a slow, long-term average that is not subject to |
81 | * variations in usage like the p term. |
82 | */ |
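	/*
	 * Worked example (illustrative, not from the original source):
	 * with the default writeback_rate_p_term_inverse of 40, an error
	 * of 40,000 dirty sectors above target gives proportional_scaled
	 * = 40,000 / 40 = 1,000 sectors per second, while with
	 * writeback_rate_i_term_inverse = 10000 the integral term only
	 * contributes 1/10000th of the accumulated error.
	 */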
	int64_t target = __calc_target_rate(dc);
	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	int64_t error = dirty - target;
	int64_t proportional_scaled =
		div_s64(error, dc->writeback_rate_p_term_inverse);
	int64_t integral_scaled;
	uint32_t new_rate;
90 | |
	/*
	 * We need to consider the number of dirty buckets as well when
	 * calculating proportional_scaled. Otherwise we might end up with
	 * an unreasonably small writeback rate when the cache is highly
	 * fragmented, i.e. when very few dirty sectors consume a lot of
	 * dirty buckets. The worst case is when the number of dirty
	 * buckets reaches cutoff_writeback_sync while the amount of dirty
	 * data has not even reached the writeback percent target: the
	 * rate then stays at the minimum value, and writes get stuck in
	 * non-writeback mode.
	 */
101 | struct cache_set *c = dc->disk.c; |
102 | |
103 | int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets; |
104 | |
	if (dc->writeback_consider_fragment &&
	    c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) {
		int64_t fragment =
			div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty);
		int64_t fp_term;
		int64_t fps;

		if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) {
			fp_term = (int64_t)dc->writeback_rate_fp_term_low *
			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW);
		} else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) {
			fp_term = (int64_t)dc->writeback_rate_fp_term_mid *
			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID);
		} else {
			fp_term = (int64_t)dc->writeback_rate_fp_term_high *
			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH);
		}
		fps = div_s64(dirty, dirty_buckets) * fp_term;
		if (fragment > 3 && fps > proportional_scaled) {
			/* Only overwrite the p term when fragment > 3 */
			proportional_scaled = fps;
		}
	}
128 | |
129 | if ((error < 0 && dc->writeback_rate_integral > 0) || |
130 | (error > 0 && time_before64(local_clock(), |
131 | dc->writeback_rate.next + NSEC_PER_MSEC))) { |
132 | /* |
133 | * Only decrease the integral term if it's more than |
134 | * zero. Only increase the integral term if the device |
135 | * is keeping up. (Don't wind up the integral |
136 | * ineffectively in either case). |
137 | * |
138 | * It's necessary to scale this by |
139 | * writeback_rate_update_seconds to keep the integral |
140 | * term dimensioned properly. |
141 | */ |
142 | dc->writeback_rate_integral += error * |
143 | dc->writeback_rate_update_seconds; |
144 | } |
145 | |
	integral_scaled = div_s64(dc->writeback_rate_integral,
			dc->writeback_rate_i_term_inverse);

	new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
			   dc->writeback_rate_minimum, NSEC_PER_SEC);

	dc->writeback_rate_proportional = proportional_scaled;
	dc->writeback_rate_integral_scaled = integral_scaled;
	dc->writeback_rate_change = new_rate -
			atomic_long_read(&dc->writeback_rate.rate);
	atomic_long_set(&dc->writeback_rate.rate, new_rate);
	dc->writeback_rate_target = target;
}
159 | |
static bool idle_counter_exceeded(struct cache_set *c)
{
	int counter, dev_nr;

	/*
	 * If c->idle_counter overflows (idle for a really long time),
	 * reset it to 0 and do not set the maximum rate this time, for
	 * code simplicity.
	 */
	counter = atomic_inc_return(&c->idle_counter);
	if (counter <= 0) {
		atomic_set(&c->idle_counter, 0);
		return false;
	}

	dev_nr = atomic_read(&c->attached_dev_nr);
	if (dev_nr == 0)
		return false;

	/*
	 * c->idle_counter is increased by the writeback threads of all
	 * attached backing devices; in order to represent a rough time
	 * period, the counter should be divided by dev_nr. Otherwise the
	 * idle time could not grow longer as more backing devices are
	 * attached.
	 * The following calculation is equivalent to checking
	 * (counter / dev_nr) < (dev_nr * 6)
	 */
	if (counter < (dev_nr * dev_nr * 6))
		return false;

	return true;
}
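/*
 * Worked example (illustrative, not from the original source): with
 * three attached backing devices, dev_nr * dev_nr * 6 = 54, so the
 * shared c->idle_counter must reach 54 increments before
 * idle_counter_exceeded() reports the cache set as idle.
 */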
193 | |
/*
 * Idle_counter is increased every time update_writeback_rate() is
 * called. If all backing devices attached to the same cache set have
 * identical dc->writeback_rate_update_seconds values, it takes about 6
 * rounds of update_writeback_rate() on each backing device before
 * c->at_max_writeback_rate is set to 1, and then the max writeback rate
 * is set to each dc->writeback_rate.rate.
 * In order to avoid the extra locking cost of counting the exact number
 * of dirty cached devices, c->attached_dev_nr is used to calculate the
 * idle threshold. It might be bigger if not all cached devices are in
 * write-back mode, but it still works well with limited extra rounds of
 * update_writeback_rate().
 */
static bool set_at_max_writeback_rate(struct cache_set *c,
				       struct cached_dev *dc)
{
	/* Don't set max writeback rate if it is disabled */
	if (!c->idle_max_writeback_rate_enabled)
		return false;

	/* Don't set max writeback rate if gc is running */
	if (!c->gc_mark_valid)
		return false;

	if (!idle_counter_exceeded(c))
		return false;

	if (atomic_read(&c->at_max_writeback_rate) != 1)
		atomic_set(&c->at_max_writeback_rate, 1);

	atomic_long_set(&dc->writeback_rate.rate, INT_MAX);

	/* keep writeback_rate_target as existing value */
	dc->writeback_rate_proportional = 0;
	dc->writeback_rate_integral_scaled = 0;
	dc->writeback_rate_change = 0;

	/*
	 * In case new I/O arrives during the time before
	 * set_at_max_writeback_rate() returns.
	 */
	if (!idle_counter_exceeded(c) ||
	    !atomic_read(&c->at_max_writeback_rate))
		return false;

	return true;
}
241 | |
static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);
	struct cache_set *c = dc->disk.c;

	/*
	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
	 * cancel_delayed_work_sync().
	 */
	set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
	smp_mb__after_atomic();

	/*
	 * CACHE_SET_IO_DISABLE might be set via sysfs interface,
	 * check it here too.
	 */
	if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) ||
	    test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
		clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
		/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
		smp_mb__after_atomic();
		return;
	}

	/*
	 * If the whole cache set is idle, set_at_max_writeback_rate()
	 * will set the writeback rate to a max number. Then it is
	 * unnecessary to update the writeback rate for an idle cache set
	 * that is already at the maximum writeback rate.
	 */
	if (atomic_read(&dc->has_dirty) && dc->writeback_percent &&
	    !set_at_max_writeback_rate(c, dc)) {
		do {
			if (!down_read_trylock(&dc->writeback_lock)) {
				dc->rate_update_retry++;
				if (dc->rate_update_retry <=
				    BCH_WBRATE_UPDATE_MAX_SKIPS)
					break;
				down_read(&dc->writeback_lock);
				dc->rate_update_retry = 0;
			}
			__update_writeback_rate(dc);
			update_gc_after_writeback(c);
			up_read(&dc->writeback_lock);
		} while (0);
	}


	/*
	 * CACHE_SET_IO_DISABLE might be set via sysfs interface,
	 * check it here too.
	 */
	if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) &&
	    !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
		schedule_delayed_work(&dc->writeback_rate_update,
				      dc->writeback_rate_update_seconds * HZ);
	}

	/*
	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
	 * cancel_delayed_work_sync().
	 */
	clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
	smp_mb__after_atomic();
}
311 | |
312 | static unsigned int writeback_delay(struct cached_dev *dc, |
313 | unsigned int sectors) |
314 | { |
315 | if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || |
316 | !dc->writeback_percent) |
317 | return 0; |
318 | |
319 | return bch_next_delay(d: &dc->writeback_rate, done: sectors); |
320 | } |
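/*
 * Note (illustrative, not from the original source): assuming the rate
 * is tracked in sectors per second (writeback_delay() passes sector
 * counts to bch_next_delay()), a rate of 1024 combined with a
 * 4096-sector writeback pass would yield a delay in the region of four
 * seconds.
 */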
321 | |
322 | struct dirty_io { |
323 | struct closure cl; |
324 | struct cached_dev *dc; |
325 | uint16_t sequence; |
326 | struct bio bio; |
327 | }; |
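/*
 * Lifecycle summary (added for clarity): a dirty_io is allocated per key
 * selected for writeback. read_dirty() reads the dirty extent from the
 * cache device, write_dirty() writes it to the backing device in
 * sequence order, and write_dirty_finish() clears the dirty bit in the
 * btree and releases the in_flight semaphore.
 */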
328 | |
329 | static void dirty_init(struct keybuf_key *w) |
330 | { |
331 | struct dirty_io *io = w->private; |
332 | struct bio *bio = &io->bio; |
333 | |
334 | bio_init(bio, NULL, table: bio->bi_inline_vecs, |
335 | DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), opf: 0); |
336 | if (!io->dc->writeback_percent) |
337 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); |
338 | |
339 | bio->bi_iter.bi_size = KEY_SIZE(k: &w->key) << 9; |
340 | bio->bi_private = w; |
341 | bch_bio_map(bio, NULL); |
342 | } |
343 | |
344 | static CLOSURE_CALLBACK(dirty_io_destructor) |
345 | { |
346 | closure_type(io, struct dirty_io, cl); |
347 | |
348 | kfree(objp: io); |
349 | } |
350 | |
351 | static CLOSURE_CALLBACK(write_dirty_finish) |
352 | { |
353 | closure_type(io, struct dirty_io, cl); |
354 | struct keybuf_key *w = io->bio.bi_private; |
355 | struct cached_dev *dc = io->dc; |
356 | |
357 | bio_free_pages(bio: &io->bio); |
358 | |
359 | /* This is kind of a dumb way of signalling errors. */ |
360 | if (KEY_DIRTY(k: &w->key)) { |
361 | int ret; |
362 | unsigned int i; |
363 | struct keylist keys; |
364 | |
365 | bch_keylist_init(l: &keys); |
366 | |
367 | bkey_copy(keys.top, &w->key); |
368 | SET_KEY_DIRTY(k: keys.top, v: false); |
369 | bch_keylist_push(l: &keys); |
370 | |
371 | for (i = 0; i < KEY_PTRS(k: &w->key); i++) |
372 | atomic_inc(v: &PTR_BUCKET(c: dc->disk.c, k: &w->key, ptr: i)->pin); |
373 | |
374 | ret = bch_btree_insert(c: dc->disk.c, keys: &keys, NULL, replace_key: &w->key); |
375 | |
376 | if (ret) |
377 | trace_bcache_writeback_collision(k: &w->key); |
378 | |
379 | atomic_long_inc(v: ret |
380 | ? &dc->disk.c->writeback_keys_failed |
381 | : &dc->disk.c->writeback_keys_done); |
382 | } |
383 | |
384 | bch_keybuf_del(buf: &dc->writeback_keys, w); |
385 | up(sem: &dc->in_flight); |
386 | |
387 | closure_return_with_destructor(cl, dirty_io_destructor); |
388 | } |
389 | |
390 | static void dirty_endio(struct bio *bio) |
391 | { |
392 | struct keybuf_key *w = bio->bi_private; |
393 | struct dirty_io *io = w->private; |
394 | |
395 | if (bio->bi_status) { |
396 | SET_KEY_DIRTY(k: &w->key, v: false); |
397 | bch_count_backing_io_errors(dc: io->dc, bio); |
398 | } |
399 | |
400 | closure_put(cl: &io->cl); |
401 | } |
402 | |
403 | static CLOSURE_CALLBACK(write_dirty) |
404 | { |
405 | closure_type(io, struct dirty_io, cl); |
406 | struct keybuf_key *w = io->bio.bi_private; |
407 | struct cached_dev *dc = io->dc; |
408 | |
409 | uint16_t next_sequence; |
410 | |
	if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
		/* Not our turn to write; wait for a write to complete */
		closure_wait(&dc->writeback_ordering_wait, cl);

		if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
			/*
			 * Edge case -- it happened in indeterminate order
			 * relative to when we were added to the wait list.
			 */
			closure_wake_up(&dc->writeback_ordering_wait);
		}

		continue_at(cl, write_dirty, io->dc->writeback_write_wq);
		return;
	}
426 | |
427 | next_sequence = io->sequence + 1; |
428 | |
429 | /* |
430 | * IO errors are signalled using the dirty bit on the key. |
431 | * If we failed to read, we should not attempt to write to the |
432 | * backing device. Instead, immediately go to write_dirty_finish |
433 | * to clean up. |
434 | */ |
435 | if (KEY_DIRTY(k: &w->key)) { |
436 | dirty_init(w); |
437 | io->bio.bi_opf = REQ_OP_WRITE; |
438 | io->bio.bi_iter.bi_sector = KEY_START(&w->key); |
439 | bio_set_dev(bio: &io->bio, bdev: io->dc->bdev); |
440 | io->bio.bi_end_io = dirty_endio; |
441 | |
442 | /* I/O request sent to backing device */ |
443 | closure_bio_submit(c: io->dc->disk.c, bio: &io->bio, cl); |
444 | } |
445 | |
446 | atomic_set(v: &dc->writeback_sequence_next, i: next_sequence); |
447 | closure_wake_up(list: &dc->writeback_ordering_wait); |
448 | |
449 | continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); |
450 | } |
451 | |
452 | static void read_dirty_endio(struct bio *bio) |
453 | { |
454 | struct keybuf_key *w = bio->bi_private; |
455 | struct dirty_io *io = w->private; |
456 | |
457 | /* is_read = 1 */ |
458 | bch_count_io_errors(ca: io->dc->disk.c->cache, |
459 | error: bio->bi_status, is_read: 1, |
460 | m: "reading dirty data from cache" ); |
461 | |
462 | dirty_endio(bio); |
463 | } |
464 | |
465 | static CLOSURE_CALLBACK(read_dirty_submit) |
466 | { |
467 | closure_type(io, struct dirty_io, cl); |
468 | |
469 | closure_bio_submit(c: io->dc->disk.c, bio: &io->bio, cl); |
470 | |
471 | continue_at(cl, write_dirty, io->dc->writeback_write_wq); |
472 | } |
473 | |
474 | static void read_dirty(struct cached_dev *dc) |
475 | { |
476 | unsigned int delay = 0; |
477 | struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w; |
478 | size_t size; |
479 | int nk, i; |
480 | struct dirty_io *io; |
481 | struct closure cl; |
482 | uint16_t sequence = 0; |
483 | |
484 | BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list)); |
485 | atomic_set(v: &dc->writeback_sequence_next, i: sequence); |
486 | closure_init_stack(cl: &cl); |
487 | |
488 | /* |
489 | * XXX: if we error, background writeback just spins. Should use some |
490 | * mempools. |
491 | */ |
492 | |
493 | next = bch_keybuf_next(buf: &dc->writeback_keys); |
494 | |
495 | while (!kthread_should_stop() && |
496 | !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && |
497 | next) { |
498 | size = 0; |
499 | nk = 0; |
500 | |
501 | do { |
502 | BUG_ON(ptr_stale(dc->disk.c, &next->key, 0)); |
503 | |
504 | /* |
505 | * Don't combine too many operations, even if they |
506 | * are all small. |
507 | */ |
508 | if (nk >= MAX_WRITEBACKS_IN_PASS) |
509 | break; |
510 | |
511 | /* |
512 | * If the current operation is very large, don't |
513 | * further combine operations. |
514 | */ |
515 | if (size >= MAX_WRITESIZE_IN_PASS) |
516 | break; |
517 | |
518 | /* |
519 | * Operations are only eligible to be combined |
520 | * if they are contiguous. |
521 | * |
522 | * TODO: add a heuristic willing to fire a |
523 | * certain amount of non-contiguous IO per pass, |
524 | * so that we can benefit from backing device |
525 | * command queueing. |
526 | */ |
527 | if ((nk != 0) && bkey_cmp(l: &keys[nk-1]->key, |
528 | r: &START_KEY(&next->key))) |
529 | break; |
530 | |
531 | size += KEY_SIZE(k: &next->key); |
532 | keys[nk++] = next; |
533 | } while ((next = bch_keybuf_next(buf: &dc->writeback_keys))); |
534 | |
535 | /* Now we have gathered a set of 1..5 keys to write back. */ |
536 | for (i = 0; i < nk; i++) { |
537 | w = keys[i]; |
538 | |
539 | io = kzalloc(struct_size(io, bio.bi_inline_vecs, |
540 | DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), |
541 | GFP_KERNEL); |
542 | if (!io) |
543 | goto err; |
544 | |
545 | w->private = io; |
546 | io->dc = dc; |
547 | io->sequence = sequence++; |
548 | |
549 | dirty_init(w); |
550 | io->bio.bi_opf = REQ_OP_READ; |
551 | io->bio.bi_iter.bi_sector = PTR_OFFSET(k: &w->key, i: 0); |
552 | bio_set_dev(bio: &io->bio, bdev: dc->disk.c->cache->bdev); |
553 | io->bio.bi_end_io = read_dirty_endio; |
554 | |
555 | if (bch_bio_alloc_pages(bio: &io->bio, GFP_KERNEL)) |
556 | goto err_free; |
557 | |
558 | trace_bcache_writeback(k: &w->key); |
559 | |
560 | down(sem: &dc->in_flight); |
561 | |
562 | /* |
563 | * We've acquired a semaphore for the maximum |
564 | * simultaneous number of writebacks; from here |
565 | * everything happens asynchronously. |
566 | */ |
567 | closure_call(cl: &io->cl, fn: read_dirty_submit, NULL, parent: &cl); |
568 | } |
569 | |
570 | delay = writeback_delay(dc, sectors: size); |
571 | |
572 | while (!kthread_should_stop() && |
573 | !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && |
574 | delay) { |
575 | schedule_timeout_interruptible(timeout: delay); |
576 | delay = writeback_delay(dc, sectors: 0); |
577 | } |
578 | } |
579 | |
580 | if (0) { |
581 | err_free: |
582 | kfree(objp: w->private); |
583 | err: |
584 | bch_keybuf_del(buf: &dc->writeback_keys, w); |
585 | } |
586 | |
587 | /* |
588 | * Wait for outstanding writeback IOs to finish (and keybuf slots to be |
589 | * freed) before refilling again |
590 | */ |
591 | closure_sync(cl: &cl); |
592 | } |
593 | |
594 | /* Scan for dirty data */ |
595 | |
596 | void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, |
597 | uint64_t offset, int nr_sectors) |
598 | { |
599 | struct bcache_device *d = c->devices[inode]; |
600 | unsigned int stripe_offset, sectors_dirty; |
601 | int stripe; |
602 | |
603 | if (!d) |
604 | return; |
605 | |
606 | stripe = offset_to_stripe(d, offset); |
607 | if (stripe < 0) |
608 | return; |
609 | |
610 | if (UUID_FLASH_ONLY(k: &c->uuids[inode])) |
611 | atomic_long_add(i: nr_sectors, v: &c->flash_dev_dirty_sectors); |
612 | |
613 | stripe_offset = offset & (d->stripe_size - 1); |
614 | |
615 | while (nr_sectors) { |
616 | int s = min_t(unsigned int, abs(nr_sectors), |
617 | d->stripe_size - stripe_offset); |
618 | |
619 | if (nr_sectors < 0) |
620 | s = -s; |
621 | |
622 | if (stripe >= d->nr_stripes) |
623 | return; |
624 | |
625 | sectors_dirty = atomic_add_return(i: s, |
626 | v: d->stripe_sectors_dirty + stripe); |
627 | if (sectors_dirty == d->stripe_size) { |
628 | if (!test_bit(stripe, d->full_dirty_stripes)) |
629 | set_bit(nr: stripe, addr: d->full_dirty_stripes); |
630 | } else { |
631 | if (test_bit(stripe, d->full_dirty_stripes)) |
632 | clear_bit(nr: stripe, addr: d->full_dirty_stripes); |
633 | } |
634 | |
635 | nr_sectors -= s; |
636 | stripe_offset = 0; |
637 | stripe++; |
638 | } |
639 | } |
640 | |
641 | static bool dirty_pred(struct keybuf *buf, struct bkey *k) |
642 | { |
643 | struct cached_dev *dc = container_of(buf, |
644 | struct cached_dev, |
645 | writeback_keys); |
646 | |
647 | BUG_ON(KEY_INODE(k) != dc->disk.id); |
648 | |
649 | return KEY_DIRTY(k); |
650 | } |
651 | |
652 | static void refill_full_stripes(struct cached_dev *dc) |
653 | { |
654 | struct keybuf *buf = &dc->writeback_keys; |
655 | unsigned int start_stripe, next_stripe; |
656 | int stripe; |
657 | bool wrapped = false; |
658 | |
659 | stripe = offset_to_stripe(d: &dc->disk, offset: KEY_OFFSET(k: &buf->last_scanned)); |
660 | if (stripe < 0) |
661 | stripe = 0; |
662 | |
663 | start_stripe = stripe; |
664 | |
665 | while (1) { |
666 | stripe = find_next_bit(addr: dc->disk.full_dirty_stripes, |
667 | size: dc->disk.nr_stripes, offset: stripe); |
668 | |
669 | if (stripe == dc->disk.nr_stripes) |
670 | goto next; |
671 | |
672 | next_stripe = find_next_zero_bit(addr: dc->disk.full_dirty_stripes, |
673 | size: dc->disk.nr_stripes, offset: stripe); |
674 | |
675 | buf->last_scanned = KEY(dc->disk.id, |
676 | stripe * dc->disk.stripe_size, 0); |
677 | |
678 | bch_refill_keybuf(c: dc->disk.c, buf, |
679 | end: &KEY(dc->disk.id, |
680 | next_stripe * dc->disk.stripe_size, 0), |
681 | pred: dirty_pred); |
682 | |
683 | if (array_freelist_empty(&buf->freelist)) |
684 | return; |
685 | |
686 | stripe = next_stripe; |
687 | next: |
688 | if (wrapped && stripe > start_stripe) |
689 | return; |
690 | |
691 | if (stripe == dc->disk.nr_stripes) { |
692 | stripe = 0; |
693 | wrapped = true; |
694 | } |
695 | } |
696 | } |
697 | |
698 | /* |
699 | * Returns true if we scanned the entire disk |
700 | */ |
701 | static bool refill_dirty(struct cached_dev *dc) |
702 | { |
703 | struct keybuf *buf = &dc->writeback_keys; |
704 | struct bkey start = KEY(dc->disk.id, 0, 0); |
705 | struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); |
706 | struct bkey start_pos; |
707 | |
708 | /* |
709 | * make sure keybuf pos is inside the range for this disk - at bringup |
710 | * we might not be attached yet so this disk's inode nr isn't |
711 | * initialized then |
712 | */ |
713 | if (bkey_cmp(l: &buf->last_scanned, r: &start) < 0 || |
714 | bkey_cmp(l: &buf->last_scanned, r: &end) > 0) |
715 | buf->last_scanned = start; |
716 | |
717 | if (dc->partial_stripes_expensive) { |
718 | refill_full_stripes(dc); |
719 | if (array_freelist_empty(&buf->freelist)) |
720 | return false; |
721 | } |
722 | |
723 | start_pos = buf->last_scanned; |
724 | bch_refill_keybuf(c: dc->disk.c, buf, end: &end, pred: dirty_pred); |
725 | |
726 | if (bkey_cmp(l: &buf->last_scanned, r: &end) < 0) |
727 | return false; |
728 | |
729 | /* |
730 | * If we get to the end start scanning again from the beginning, and |
731 | * only scan up to where we initially started scanning from: |
732 | */ |
733 | buf->last_scanned = start; |
734 | bch_refill_keybuf(c: dc->disk.c, buf, end: &start_pos, pred: dirty_pred); |
735 | |
736 | return bkey_cmp(l: &buf->last_scanned, r: &start_pos) >= 0; |
737 | } |
738 | |
739 | static int bch_writeback_thread(void *arg) |
740 | { |
741 | struct cached_dev *dc = arg; |
742 | struct cache_set *c = dc->disk.c; |
743 | bool searched_full_index; |
744 | |
745 | bch_ratelimit_reset(d: &dc->writeback_rate); |
746 | |
747 | while (!kthread_should_stop() && |
748 | !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { |
749 | down_write(sem: &dc->writeback_lock); |
750 | set_current_state(TASK_INTERRUPTIBLE); |
		/*
		 * If the bcache device is detaching, skip here and continue
		 * to perform writeback. Otherwise, if there is no dirty data
		 * on the cache, or there is dirty data but writeback is
		 * disabled, the writeback thread should sleep here and wait
		 * for others to wake it up.
		 */
758 | if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && |
759 | (!atomic_read(v: &dc->has_dirty) || !dc->writeback_running)) { |
760 | up_write(sem: &dc->writeback_lock); |
761 | |
762 | if (kthread_should_stop() || |
763 | test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { |
764 | set_current_state(TASK_RUNNING); |
765 | break; |
766 | } |
767 | |
768 | schedule(); |
769 | continue; |
770 | } |
771 | set_current_state(TASK_RUNNING); |
772 | |
773 | searched_full_index = refill_dirty(dc); |
774 | |
775 | if (searched_full_index && |
776 | RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { |
777 | atomic_set(v: &dc->has_dirty, i: 0); |
778 | SET_BDEV_STATE(k: &dc->sb, BDEV_STATE_CLEAN); |
779 | bch_write_bdev_super(dc, NULL); |
780 | /* |
781 | * If bcache device is detaching via sysfs interface, |
782 | * writeback thread should stop after there is no dirty |
783 | * data on cache. BCACHE_DEV_DETACHING flag is set in |
784 | * bch_cached_dev_detach(). |
785 | */ |
786 | if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) { |
787 | struct closure cl; |
788 | |
789 | closure_init_stack(cl: &cl); |
790 | memset(&dc->sb.set_uuid, 0, 16); |
791 | SET_BDEV_STATE(k: &dc->sb, BDEV_STATE_NONE); |
792 | |
793 | bch_write_bdev_super(dc, parent: &cl); |
794 | closure_sync(cl: &cl); |
795 | |
796 | up_write(sem: &dc->writeback_lock); |
797 | break; |
798 | } |
799 | |
			/*
			 * When the dirty data rate is high (e.g. 50%+), there
			 * might be heavy bucket fragmentation after writeback
			 * finishes, which hurts the following write
			 * performance. If users really care about write
			 * performance they may set BCH_ENABLE_AUTO_GC via
			 * sysfs; then when BCH_DO_AUTO_GC is set, the garbage
			 * collection thread will be woken up here. After the
			 * moving gc, the shrunk btree and the discarded free
			 * buckets' SSD space may help the following write
			 * requests.
			 */
811 | if (c->gc_after_writeback == |
812 | (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { |
813 | c->gc_after_writeback &= ~BCH_DO_AUTO_GC; |
814 | force_wake_up_gc(c); |
815 | } |
816 | } |
817 | |
818 | up_write(sem: &dc->writeback_lock); |
819 | |
820 | read_dirty(dc); |
821 | |
822 | if (searched_full_index) { |
823 | unsigned int delay = dc->writeback_delay * HZ; |
824 | |
825 | while (delay && |
826 | !kthread_should_stop() && |
827 | !test_bit(CACHE_SET_IO_DISABLE, &c->flags) && |
828 | !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) |
829 | delay = schedule_timeout_interruptible(timeout: delay); |
830 | |
831 | bch_ratelimit_reset(d: &dc->writeback_rate); |
832 | } |
833 | } |
834 | |
835 | if (dc->writeback_write_wq) |
836 | destroy_workqueue(wq: dc->writeback_write_wq); |
837 | |
838 | cached_dev_put(dc); |
839 | wait_for_kthread_stop(); |
840 | |
841 | return 0; |
842 | } |
843 | |
844 | /* Init */ |
845 | #define INIT_KEYS_EACH_TIME 500000 |
846 | |
847 | struct sectors_dirty_init { |
848 | struct btree_op op; |
849 | unsigned int inode; |
850 | size_t count; |
851 | }; |
852 | |
853 | static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, |
854 | struct bkey *k) |
855 | { |
856 | struct sectors_dirty_init *op = container_of(_op, |
857 | struct sectors_dirty_init, op); |
858 | if (KEY_INODE(k) > op->inode) |
859 | return MAP_DONE; |
860 | |
861 | if (KEY_DIRTY(k)) |
862 | bcache_dev_sectors_dirty_add(c: b->c, inode: KEY_INODE(k), |
863 | KEY_START(k), nr_sectors: KEY_SIZE(k)); |
864 | |
865 | op->count++; |
866 | if (!(op->count % INIT_KEYS_EACH_TIME)) |
867 | cond_resched(); |
868 | |
869 | return MAP_CONTINUE; |
870 | } |
871 | |
872 | static int bch_root_node_dirty_init(struct cache_set *c, |
873 | struct bcache_device *d, |
874 | struct bkey *k) |
875 | { |
876 | struct sectors_dirty_init op; |
877 | int ret; |
878 | |
879 | bch_btree_op_init(op: &op.op, write_lock_level: -1); |
880 | op.inode = d->id; |
881 | op.count = 0; |
882 | |
883 | ret = bcache_btree(map_keys_recurse, |
884 | k, |
885 | c->root, |
886 | &op.op, |
887 | &KEY(op.inode, 0, 0), |
888 | sectors_dirty_init_fn, |
889 | 0); |
890 | if (ret < 0) |
891 | pr_warn("sectors dirty init failed, ret=%d!\n" , ret); |
892 | |
	/*
	 * The op may be added to cache_set's btree_cache_wait
	 * in mca_cannibalize(); we must ensure it is removed from
	 * the list and that btree_cache_alloc_lock is released
	 * before freeing the op memory.
	 * Otherwise, btree_cache_wait will be corrupted.
	 */
900 | bch_cannibalize_unlock(c); |
901 | finish_wait(wq_head: &c->btree_cache_wait, wq_entry: &(&op.op)->wait); |
902 | |
903 | return ret; |
904 | } |
905 | |
906 | static int bch_dirty_init_thread(void *arg) |
907 | { |
908 | struct dirty_init_thrd_info *info = arg; |
909 | struct bch_dirty_init_state *state = info->state; |
910 | struct cache_set *c = state->c; |
911 | struct btree_iter iter; |
912 | struct bkey *k, *p; |
913 | int cur_idx, prev_idx, skip_nr; |
914 | |
915 | k = p = NULL; |
916 | prev_idx = 0; |
917 | |
918 | bch_btree_iter_init(b: &c->root->keys, iter: &iter, NULL); |
919 | k = bch_btree_iter_next_filter(iter: &iter, b: &c->root->keys, fn: bch_ptr_bad); |
920 | BUG_ON(!k); |
921 | |
922 | p = k; |
923 | |
924 | while (k) { |
925 | spin_lock(lock: &state->idx_lock); |
926 | cur_idx = state->key_idx; |
927 | state->key_idx++; |
928 | spin_unlock(lock: &state->idx_lock); |
929 | |
930 | skip_nr = cur_idx - prev_idx; |
931 | |
932 | while (skip_nr) { |
933 | k = bch_btree_iter_next_filter(iter: &iter, |
934 | b: &c->root->keys, |
935 | fn: bch_ptr_bad); |
936 | if (k) |
937 | p = k; |
938 | else { |
939 | atomic_set(v: &state->enough, i: 1); |
940 | /* Update state->enough earlier */ |
941 | smp_mb__after_atomic(); |
942 | goto out; |
943 | } |
944 | skip_nr--; |
945 | } |
946 | |
947 | if (p) { |
948 | if (bch_root_node_dirty_init(c, d: state->d, k: p) < 0) |
949 | goto out; |
950 | } |
951 | |
952 | p = NULL; |
953 | prev_idx = cur_idx; |
954 | } |
955 | |
956 | out: |
957 | /* In order to wake up state->wait in time */ |
958 | smp_mb__before_atomic(); |
959 | if (atomic_dec_and_test(v: &state->started)) |
960 | wake_up(&state->wait); |
961 | |
962 | return 0; |
963 | } |
964 | |
965 | static int bch_btre_dirty_init_thread_nr(void) |
966 | { |
967 | int n = num_online_cpus()/2; |
968 | |
969 | if (n == 0) |
970 | n = 1; |
971 | else if (n > BCH_DIRTY_INIT_THRD_MAX) |
972 | n = BCH_DIRTY_INIT_THRD_MAX; |
973 | |
974 | return n; |
975 | } |
976 | |
977 | void bch_sectors_dirty_init(struct bcache_device *d) |
978 | { |
979 | int i; |
980 | struct btree *b = NULL; |
981 | struct bkey *k = NULL; |
982 | struct btree_iter iter; |
983 | struct sectors_dirty_init op; |
984 | struct cache_set *c = d->c; |
985 | struct bch_dirty_init_state state; |
986 | |
987 | retry_lock: |
988 | b = c->root; |
989 | rw_lock(w: 0, b, level: b->level); |
990 | if (b != c->root) { |
991 | rw_unlock(w: 0, b); |
992 | goto retry_lock; |
993 | } |
994 | |
995 | /* Just count root keys if no leaf node */ |
996 | if (c->root->level == 0) { |
997 | bch_btree_op_init(op: &op.op, write_lock_level: -1); |
998 | op.inode = d->id; |
999 | op.count = 0; |
1000 | |
1001 | for_each_key_filter(&c->root->keys, |
1002 | k, &iter, bch_ptr_invalid) { |
1003 | if (KEY_INODE(k) != op.inode) |
1004 | continue; |
1005 | sectors_dirty_init_fn(op: &op.op, b: c->root, k); |
1006 | } |
1007 | |
1008 | rw_unlock(w: 0, b); |
1009 | return; |
1010 | } |
1011 | |
1012 | memset(&state, 0, sizeof(struct bch_dirty_init_state)); |
1013 | state.c = c; |
1014 | state.d = d; |
1015 | state.total_threads = bch_btre_dirty_init_thread_nr(); |
1016 | state.key_idx = 0; |
1017 | spin_lock_init(&state.idx_lock); |
1018 | atomic_set(v: &state.started, i: 0); |
1019 | atomic_set(v: &state.enough, i: 0); |
1020 | init_waitqueue_head(&state.wait); |
1021 | |
1022 | for (i = 0; i < state.total_threads; i++) { |
1023 | /* Fetch latest state.enough earlier */ |
1024 | smp_mb__before_atomic(); |
1025 | if (atomic_read(v: &state.enough)) |
1026 | break; |
1027 | |
1028 | atomic_inc(v: &state.started); |
1029 | state.infos[i].state = &state; |
1030 | state.infos[i].thread = |
1031 | kthread_run(bch_dirty_init_thread, &state.infos[i], |
1032 | "bch_dirtcnt[%d]" , i); |
1033 | if (IS_ERR(ptr: state.infos[i].thread)) { |
1034 | pr_err("fails to run thread bch_dirty_init[%d]\n" , i); |
1035 | atomic_dec(v: &state.started); |
1036 | for (--i; i >= 0; i--) |
1037 | kthread_stop(k: state.infos[i].thread); |
1038 | goto out; |
1039 | } |
1040 | } |
1041 | |
1042 | out: |
1043 | /* Must wait for all threads to stop. */ |
1044 | wait_event(state.wait, atomic_read(&state.started) == 0); |
1045 | rw_unlock(w: 0, b); |
1046 | } |
1047 | |
1048 | void bch_cached_dev_writeback_init(struct cached_dev *dc) |
1049 | { |
1050 | sema_init(sem: &dc->in_flight, val: 64); |
1051 | init_rwsem(&dc->writeback_lock); |
1052 | bch_keybuf_init(buf: &dc->writeback_keys); |
1053 | |
1054 | dc->writeback_metadata = true; |
1055 | dc->writeback_running = false; |
1056 | dc->writeback_consider_fragment = true; |
1057 | dc->writeback_percent = 10; |
1058 | dc->writeback_delay = 30; |
1059 | atomic_long_set(v: &dc->writeback_rate.rate, i: 1024); |
1060 | dc->writeback_rate_minimum = 8; |
1061 | |
1062 | dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; |
1063 | dc->writeback_rate_p_term_inverse = 40; |
1064 | dc->writeback_rate_fp_term_low = 1; |
1065 | dc->writeback_rate_fp_term_mid = 10; |
1066 | dc->writeback_rate_fp_term_high = 1000; |
1067 | dc->writeback_rate_i_term_inverse = 10000; |
1068 | |
1069 | /* For dc->writeback_lock contention in update_writeback_rate() */ |
1070 | dc->rate_update_retry = 0; |
1071 | |
1072 | WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); |
1073 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); |
1074 | } |
1075 | |
1076 | int bch_cached_dev_writeback_start(struct cached_dev *dc) |
1077 | { |
1078 | dc->writeback_write_wq = alloc_workqueue(fmt: "bcache_writeback_wq" , |
1079 | flags: WQ_MEM_RECLAIM, max_active: 0); |
1080 | if (!dc->writeback_write_wq) |
1081 | return -ENOMEM; |
1082 | |
1083 | cached_dev_get(dc); |
1084 | dc->writeback_thread = kthread_create(bch_writeback_thread, dc, |
1085 | "bcache_writeback" ); |
1086 | if (IS_ERR(ptr: dc->writeback_thread)) { |
1087 | cached_dev_put(dc); |
1088 | destroy_workqueue(wq: dc->writeback_write_wq); |
1089 | return PTR_ERR(ptr: dc->writeback_thread); |
1090 | } |
1091 | dc->writeback_running = true; |
1092 | |
1093 | WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); |
1094 | schedule_delayed_work(dwork: &dc->writeback_rate_update, |
1095 | delay: dc->writeback_rate_update_seconds * HZ); |
1096 | |
1097 | bch_writeback_queue(dc); |
1098 | |
1099 | return 0; |
1100 | } |
1101 | |