1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * background writeback - scan btree for dirty data and write it to the backing |
4 | * device |
5 | * |
6 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> |
7 | * Copyright 2012 Google, Inc. |
8 | */ |
9 | |
10 | #include "bcache.h" |
11 | #include "btree.h" |
12 | #include "debug.h" |
13 | #include "writeback.h" |
14 | |
15 | #include <linux/delay.h> |
16 | #include <linux/kthread.h> |
17 | #include <linux/sched/clock.h> |
18 | #include <trace/events/bcache.h> |
19 | |
20 | static void update_gc_after_writeback(struct cache_set *c) |
21 | { |
22 | if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) || |
23 | c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD) |
24 | return; |
25 | |
26 | c->gc_after_writeback |= BCH_DO_AUTO_GC; |
27 | } |
28 | |
29 | /* Rate limiting */ |
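/*
 * Worked example (illustrative, not from the original source): with
 * cache_sectors = 1,000,000, writeback_percent = 10 and a backing
 * device that accounts for 20% of c->cached_dev_sectors,
 * cache_dirty_target = div_u64(1,000,000 * 10, 100) = 100,000 sectors,
 * and the returned per-device target is 100,000 * 20% = 20,000 dirty
 * sectors (the WRITEBACK_SHARE_SHIFT scaling of bdev_share cancels out
 * in the final shift).
 */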
static uint64_t __calc_target_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;

	/*
	 * This is the size of the cache, minus the amount used for
	 * flash-only devices
	 */
	uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size -
				atomic_long_read(&c->flash_dev_dirty_sectors);

	/*
	 * Unfortunately there is no control of global dirty data. If the
	 * user states that they want 10% dirty data in the cache, and has,
	 * e.g., 5 backing volumes of equal size, we try and ensure each
	 * backing volume uses about 2% of the cache for dirty data.
	 */
	uint32_t bdev_share =
		div64_u64(bdev_nr_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
			  c->cached_dev_sectors);

	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	/* Ensure each backing dev gets at least one dirty share */
	if (bdev_share < 1)
		bdev_share = 1;

	return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
}
60 | |
61 | static void __update_writeback_rate(struct cached_dev *dc) |
62 | { |
63 | /* |
64 | * PI controller: |
65 | * Figures out the amount that should be written per second. |
66 | * |
67 | * First, the error (number of sectors that are dirty beyond our |
68 | * target) is calculated. The error is accumulated (numerically |
69 | * integrated). |
70 | * |
71 | * Then, the proportional value and integral value are scaled |
72 | * based on configured values. These are stored as inverses to |
73 | * avoid fixed point math and to make configuration easy-- e.g. |
74 | * the default value of 40 for writeback_rate_p_term_inverse |
75 | * attempts to write at a rate that would retire all the dirty |
76 | * blocks in 40 seconds. |
77 | * |
78 | * The writeback_rate_i_inverse value of 10000 means that 1/10000th |
79 | * of the error is accumulated in the integral term per second. |
80 | * This acts as a slow, long-term average that is not subject to |
81 | * variations in usage like the p term. |
82 | */ |
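	/*
	 * Worked example (illustrative, not from the original source):
	 * with the default writeback_rate_p_term_inverse of 40, an error
	 * of 40,000 dirty sectors above target gives proportional_scaled
	 * = 40,000 / 40 = 1,000 sectors per second, while with
	 * writeback_rate_i_term_inverse = 10000 the integral term only
	 * contributes 1/10000th of the accumulated error.
	 */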
	int64_t target = __calc_target_rate(dc);
	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	int64_t error = dirty - target;
	int64_t proportional_scaled =
		div_s64(error, dc->writeback_rate_p_term_inverse);
	int64_t integral_scaled;
	uint32_t new_rate;
90 | |
	/*
	 * We need to consider the number of dirty buckets as well when
	 * calculating proportional_scaled. Otherwise we might end up with
	 * an unreasonably small writeback rate when the cache is highly
	 * fragmented, i.e. when very few dirty sectors consume a lot of
	 * dirty buckets. The worst case is when the number of dirty
	 * buckets reaches cutoff_writeback_sync while the amount of dirty
	 * data has not even reached the writeback percent target: the
	 * rate then stays at the minimum value, and writes get stuck in
	 * non-writeback mode.
	 */
101 | struct cache_set *c = dc->disk.c; |
102 | |
103 | int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets; |
104 | |
	if (dc->writeback_consider_fragment &&
	    c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) {
		int64_t fragment =
			div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty);
		int64_t fp_term;
		int64_t fps;

		if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) {
			fp_term = (int64_t)dc->writeback_rate_fp_term_low *
			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW);
		} else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) {
			fp_term = (int64_t)dc->writeback_rate_fp_term_mid *
			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID);
		} else {
			fp_term = (int64_t)dc->writeback_rate_fp_term_high *
			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH);
		}
		fps = div_s64(dirty, dirty_buckets) * fp_term;
		if (fragment > 3 && fps > proportional_scaled) {
			/* Only overwrite the p term when fragment > 3 */
			proportional_scaled = fps;
		}
	}
128 | |
129 | if ((error < 0 && dc->writeback_rate_integral > 0) || |
130 | (error > 0 && time_before64(local_clock(), |
131 | dc->writeback_rate.next + NSEC_PER_MSEC))) { |
132 | /* |
133 | * Only decrease the integral term if it's more than |
134 | * zero. Only increase the integral term if the device |
135 | * is keeping up. (Don't wind up the integral |
136 | * ineffectively in either case). |
137 | * |
138 | * It's necessary to scale this by |
139 | * writeback_rate_update_seconds to keep the integral |
140 | * term dimensioned properly. |
141 | */ |
142 | dc->writeback_rate_integral += error * |
143 | dc->writeback_rate_update_seconds; |
144 | } |
145 | |
	integral_scaled = div_s64(dc->writeback_rate_integral,
			dc->writeback_rate_i_term_inverse);

	new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
			   dc->writeback_rate_minimum, NSEC_PER_SEC);

	dc->writeback_rate_proportional = proportional_scaled;
	dc->writeback_rate_integral_scaled = integral_scaled;
	dc->writeback_rate_change = new_rate -
			atomic_long_read(&dc->writeback_rate.rate);
	atomic_long_set(&dc->writeback_rate.rate, new_rate);
	dc->writeback_rate_target = target;
}
159 | |
static bool idle_counter_exceeded(struct cache_set *c)
{
	int counter, dev_nr;

	/*
	 * If c->idle_counter overflows (idle for a really long time),
	 * reset it to 0 and do not set the maximum rate this time, for
	 * code simplicity.
	 */
	counter = atomic_inc_return(&c->idle_counter);
	if (counter <= 0) {
		atomic_set(&c->idle_counter, 0);
		return false;
	}

	dev_nr = atomic_read(&c->attached_dev_nr);
	if (dev_nr == 0)
		return false;

	/*
	 * c->idle_counter is increased by the writeback threads of all
	 * attached backing devices; in order to represent a rough time
	 * period, the counter should be divided by dev_nr. Otherwise the
	 * idle time could not grow longer as more backing devices are
	 * attached.
	 * The following calculation is equivalent to checking
	 * (counter / dev_nr) < (dev_nr * 6)
	 */
	if (counter < (dev_nr * dev_nr * 6))
		return false;

	return true;
}
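/*
 * Worked example (illustrative, not from the original source): with
 * three attached backing devices, dev_nr * dev_nr * 6 = 54, so the
 * shared c->idle_counter must reach 54 increments before
 * idle_counter_exceeded() reports the cache set as idle.
 */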
193 | |
/*
 * Idle_counter is increased every time update_writeback_rate() is
 * called. If all backing devices attached to the same cache set have
 * identical dc->writeback_rate_update_seconds values, it takes about 6
 * rounds of update_writeback_rate() on each backing device before
 * c->at_max_writeback_rate is set to 1, and then the max writeback rate
 * is set to each dc->writeback_rate.rate.
 * In order to avoid the extra locking cost of counting the exact number
 * of dirty cached devices, c->attached_dev_nr is used to calculate the
 * idle threshold. It might be bigger if not all cached devices are in
 * write-back mode, but it still works well with limited extra rounds of
 * update_writeback_rate().
 */
static bool set_at_max_writeback_rate(struct cache_set *c,
				       struct cached_dev *dc)
{
	/* Don't set max writeback rate if it is disabled */
	if (!c->idle_max_writeback_rate_enabled)
		return false;

	/* Don't set max writeback rate if gc is running */
	if (!c->gc_mark_valid)
		return false;

	if (!idle_counter_exceeded(c))
		return false;

	if (atomic_read(&c->at_max_writeback_rate) != 1)
		atomic_set(&c->at_max_writeback_rate, 1);

	atomic_long_set(&dc->writeback_rate.rate, INT_MAX);

	/* keep writeback_rate_target as existing value */
	dc->writeback_rate_proportional = 0;
	dc->writeback_rate_integral_scaled = 0;
	dc->writeback_rate_change = 0;

	/*
	 * In case new I/O arrives during the time before
	 * set_at_max_writeback_rate() returns.
	 */
	if (!idle_counter_exceeded(c) ||
	    !atomic_read(&c->at_max_writeback_rate))
		return false;

	return true;
}
241 | |
static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);
	struct cache_set *c = dc->disk.c;

	/*
	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
	 * cancel_delayed_work_sync().
	 */
	set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
	smp_mb__after_atomic();

	/*
	 * CACHE_SET_IO_DISABLE might be set via sysfs interface,
	 * check it here too.
	 */
	if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) ||
	    test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
		clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
		/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
		smp_mb__after_atomic();
		return;
	}

	/*
	 * If the whole cache set is idle, set_at_max_writeback_rate()
	 * will set the writeback rate to a max number. Then it is
	 * unnecessary to update the writeback rate for an idle cache set
	 * that is already at the maximum writeback rate.
	 */
	if (atomic_read(&dc->has_dirty) && dc->writeback_percent &&
	    !set_at_max_writeback_rate(c, dc)) {
		do {
			if (!down_read_trylock(&dc->writeback_lock)) {
				dc->rate_update_retry++;
				if (dc->rate_update_retry <=
				    BCH_WBRATE_UPDATE_MAX_SKIPS)
					break;
				down_read(&dc->writeback_lock);
				dc->rate_update_retry = 0;
			}
			__update_writeback_rate(dc);
			update_gc_after_writeback(c);
			up_read(&dc->writeback_lock);
		} while (0);
	}


	/*
	 * CACHE_SET_IO_DISABLE might be set via sysfs interface,
	 * check it here too.
	 */
	if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) &&
	    !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
		schedule_delayed_work(&dc->writeback_rate_update,
				      dc->writeback_rate_update_seconds * HZ);
	}

	/*
	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
	 * cancel_delayed_work_sync().
	 */
	clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
	smp_mb__after_atomic();
}
311 | |
312 | static unsigned int writeback_delay(struct cached_dev *dc, |
313 | unsigned int sectors) |
314 | { |
315 | if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || |
316 | !dc->writeback_percent) |
317 | return 0; |
318 | |
319 | return bch_next_delay(d: &dc->writeback_rate, done: sectors); |
320 | } |
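/*
 * Note (illustrative, not from the original source): assuming the rate
 * is tracked in sectors per second (writeback_delay() passes sector
 * counts to bch_next_delay()), a rate of 1024 combined with a
 * 4096-sector writeback pass would yield a delay in the region of four
 * seconds.
 */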
321 | |
322 | struct dirty_io { |
323 | struct closure cl; |
324 | struct cached_dev *dc; |
325 | uint16_t sequence; |
326 | struct bio bio; |
327 | }; |
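/*
 * Lifecycle summary (added for clarity): a dirty_io is allocated per key
 * selected for writeback. read_dirty() reads the dirty extent from the
 * cache device, write_dirty() writes it to the backing device in
 * sequence order, and write_dirty_finish() clears the dirty bit in the
 * btree and releases the in_flight semaphore.
 */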
328 | |
329 | static void dirty_init(struct keybuf_key *w) |
330 | { |
331 | struct dirty_io *io = w->private; |
332 | struct bio *bio = &io->bio; |
333 | |
334 | bio_init(bio, NULL, table: bio->bi_inline_vecs, |
335 | DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), opf: 0); |
336 | if (!io->dc->writeback_percent) |
337 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); |
338 | |
339 | bio->bi_iter.bi_size = KEY_SIZE(k: &w->key) << 9; |
340 | bio->bi_private = w; |
341 | bch_bio_map(bio, NULL); |
342 | } |
343 | |
344 | static CLOSURE_CALLBACK(dirty_io_destructor) |
345 | { |
346 | closure_type(io, struct dirty_io, cl); |
347 | |
348 | kfree(objp: io); |
349 | } |
350 | |
351 | static CLOSURE_CALLBACK(write_dirty_finish) |
352 | { |
353 | closure_type(io, struct dirty_io, cl); |
354 | struct keybuf_key *w = io->bio.bi_private; |
355 | struct cached_dev *dc = io->dc; |
356 | |
357 | bio_free_pages(bio: &io->bio); |
358 | |
359 | /* This is kind of a dumb way of signalling errors. */ |
360 | if (KEY_DIRTY(k: &w->key)) { |
361 | int ret; |
362 | unsigned int i; |
363 | struct keylist keys; |
364 | |
365 | bch_keylist_init(l: &keys); |
366 | |
367 | bkey_copy(keys.top, &w->key); |
368 | SET_KEY_DIRTY(k: keys.top, v: false); |
369 | bch_keylist_push(l: &keys); |
370 | |
371 | for (i = 0; i < KEY_PTRS(k: &w->key); i++) |
372 | atomic_inc(v: &PTR_BUCKET(c: dc->disk.c, k: &w->key, ptr: i)->pin); |
373 | |
374 | ret = bch_btree_insert(c: dc->disk.c, keys: &keys, NULL, replace_key: &w->key); |
375 | |
376 | if (ret) |
377 | trace_bcache_writeback_collision(k: &w->key); |
378 | |
379 | atomic_long_inc(v: ret |
380 | ? &dc->disk.c->writeback_keys_failed |
381 | : &dc->disk.c->writeback_keys_done); |
382 | } |
383 | |
384 | bch_keybuf_del(buf: &dc->writeback_keys, w); |
385 | up(sem: &dc->in_flight); |
386 | |
387 | closure_return_with_destructor(cl, dirty_io_destructor); |
388 | } |
389 | |
390 | static void dirty_endio(struct bio *bio) |
391 | { |
392 | struct keybuf_key *w = bio->bi_private; |
393 | struct dirty_io *io = w->private; |
394 | |
395 | if (bio->bi_status) { |
396 | SET_KEY_DIRTY(k: &w->key, v: false); |
397 | bch_count_backing_io_errors(dc: io->dc, bio); |
398 | } |
399 | |
400 | closure_put(cl: &io->cl); |
401 | } |
402 | |
403 | static CLOSURE_CALLBACK(write_dirty) |
404 | { |
405 | closure_type(io, struct dirty_io, cl); |
406 | struct keybuf_key *w = io->bio.bi_private; |
407 | struct cached_dev *dc = io->dc; |
408 | |
409 | uint16_t next_sequence; |
410 | |
	if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
		/* Not our turn to write; wait for a write to complete */
		closure_wait(&dc->writeback_ordering_wait, cl);

		if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
			/*
			 * Edge case -- it happened in indeterminate order
			 * relative to when we were added to the wait list.
			 */
			closure_wake_up(&dc->writeback_ordering_wait);
		}

		continue_at(cl, write_dirty, io->dc->writeback_write_wq);
		return;
	}
426 | |
427 | next_sequence = io->sequence + 1; |
428 | |
429 | /* |
430 | * IO errors are signalled using the dirty bit on the key. |
431 | * If we failed to read, we should not attempt to write to the |
432 | * backing device. Instead, immediately go to write_dirty_finish |
433 | * to clean up. |
434 | */ |
435 | if (KEY_DIRTY(k: &w->key)) { |
436 | dirty_init(w); |
437 | io->bio.bi_opf = REQ_OP_WRITE; |
438 | io->bio.bi_iter.bi_sector = KEY_START(&w->key); |
439 | bio_set_dev(bio: &io->bio, bdev: io->dc->bdev); |
440 | io->bio.bi_end_io = dirty_endio; |
441 | |
442 | /* I/O request sent to backing device */ |
443 | closure_bio_submit(c: io->dc->disk.c, bio: &io->bio, cl); |
444 | } |
445 | |
446 | atomic_set(v: &dc->writeback_sequence_next, i: next_sequence); |
447 | closure_wake_up(list: &dc->writeback_ordering_wait); |
448 | |
449 | continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); |
450 | } |
451 | |
452 | static void read_dirty_endio(struct bio *bio) |
453 | { |
454 | struct keybuf_key *w = bio->bi_private; |
455 | struct dirty_io *io = w->private; |
456 | |
457 | /* is_read = 1 */ |
458 | bch_count_io_errors(ca: io->dc->disk.c->cache, |
459 | error: bio->bi_status, is_read: 1, |
460 | m: "reading dirty data from cache" ); |
461 | |
462 | dirty_endio(bio); |
463 | } |
464 | |
465 | static CLOSURE_CALLBACK(read_dirty_submit) |
466 | { |
467 | closure_type(io, struct dirty_io, cl); |
468 | |
469 | closure_bio_submit(c: io->dc->disk.c, bio: &io->bio, cl); |
470 | |
471 | continue_at(cl, write_dirty, io->dc->writeback_write_wq); |
472 | } |
473 | |
474 | static void read_dirty(struct cached_dev *dc) |
475 | { |
476 | unsigned int delay = 0; |
477 | struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w; |
478 | size_t size; |
479 | int nk, i; |
480 | struct dirty_io *io; |
481 | struct closure cl; |
482 | uint16_t sequence = 0; |
483 | |
484 | BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list)); |
485 | atomic_set(v: &dc->writeback_sequence_next, i: sequence); |
486 | closure_init_stack(cl: &cl); |
487 | |
488 | /* |
489 | * XXX: if we error, background writeback just spins. Should use some |
490 | * mempools. |
491 | */ |
492 | |
493 | next = bch_keybuf_next(buf: &dc->writeback_keys); |
494 | |
495 | while (!kthread_should_stop() && |
496 | !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && |
497 | next) { |
498 | size = 0; |
499 | nk = 0; |
500 | |
501 | do { |
502 | BUG_ON(ptr_stale(dc->disk.c, &next->key, 0)); |
503 | |
504 | /* |
505 | * Don't combine too many operations, even if they |
506 | * are all small. |
507 | */ |
508 | if (nk >= MAX_WRITEBACKS_IN_PASS) |
509 | break; |
510 | |
511 | /* |
512 | * If the current operation is very large, don't |
513 | * further combine operations. |
514 | */ |
515 | if (size >= MAX_WRITESIZE_IN_PASS) |
516 | break; |
517 | |
518 | /* |
519 | * Operations are only eligible to be combined |
520 | * if they are contiguous. |
521 | * |
522 | * TODO: add a heuristic willing to fire a |
523 | * certain amount of non-contiguous IO per pass, |
524 | * so that we can benefit from backing device |
525 | * command queueing. |
526 | */ |
527 | if ((nk != 0) && bkey_cmp(l: &keys[nk-1]->key, |
528 | r: &START_KEY(&next->key))) |
529 | break; |
530 | |
531 | size += KEY_SIZE(k: &next->key); |
532 | keys[nk++] = next; |
533 | } while ((next = bch_keybuf_next(buf: &dc->writeback_keys))); |
534 | |
535 | /* Now we have gathered a set of 1..5 keys to write back. */ |
536 | for (i = 0; i < nk; i++) { |
537 | w = keys[i]; |
538 | |
539 | io = kzalloc(struct_size(io, bio.bi_inline_vecs, |
540 | DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), |
541 | GFP_KERNEL); |
542 | if (!io) |
543 | goto err; |
544 | |
545 | w->private = io; |
546 | io->dc = dc; |
547 | io->sequence = sequence++; |
548 | |
549 | dirty_init(w); |
550 | io->bio.bi_opf = REQ_OP_READ; |
551 | io->bio.bi_iter.bi_sector = PTR_OFFSET(k: &w->key, i: 0); |
552 | bio_set_dev(bio: &io->bio, bdev: dc->disk.c->cache->bdev); |
553 | io->bio.bi_end_io = read_dirty_endio; |
554 | |
555 | if (bch_bio_alloc_pages(bio: &io->bio, GFP_KERNEL)) |
556 | goto err_free; |
557 | |
558 | trace_bcache_writeback(k: &w->key); |
559 | |
560 | down(sem: &dc->in_flight); |
561 | |
562 | /* |
563 | * We've acquired a semaphore for the maximum |
564 | * simultaneous number of writebacks; from here |
565 | * everything happens asynchronously. |
566 | */ |
567 | closure_call(cl: &io->cl, fn: read_dirty_submit, NULL, parent: &cl); |
568 | } |
569 | |
570 | delay = writeback_delay(dc, sectors: size); |
571 | |
572 | while (!kthread_should_stop() && |
573 | !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && |
574 | delay) { |
575 | schedule_timeout_interruptible(timeout: delay); |
576 | delay = writeback_delay(dc, sectors: 0); |
577 | } |
578 | } |
579 | |
580 | if (0) { |
581 | err_free: |
582 | kfree(objp: w->private); |
583 | err: |
584 | bch_keybuf_del(buf: &dc->writeback_keys, w); |
585 | } |
586 | |
587 | /* |
588 | * Wait for outstanding writeback IOs to finish (and keybuf slots to be |
589 | * freed) before refilling again |
590 | */ |
591 | closure_sync(cl: &cl); |
592 | } |
593 | |
594 | /* Scan for dirty data */ |
595 | |
596 | void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, |
597 | uint64_t offset, int nr_sectors) |
598 | { |
599 | struct bcache_device *d = c->devices[inode]; |
600 | unsigned int stripe_offset, sectors_dirty; |
601 | int stripe; |
602 | |
603 | if (!d) |
604 | return; |
605 | |
606 | stripe = offset_to_stripe(d, offset); |
607 | if (stripe < 0) |
608 | return; |
609 | |
610 | if (UUID_FLASH_ONLY(k: &c->uuids[inode])) |
611 | atomic_long_add(i: nr_sectors, v: &c->flash_dev_dirty_sectors); |
612 | |
613 | stripe_offset = offset & (d->stripe_size - 1); |
614 | |
615 | while (nr_sectors) { |
616 | int s = min_t(unsigned int, abs(nr_sectors), |
617 | d->stripe_size - stripe_offset); |
618 | |
619 | if (nr_sectors < 0) |
620 | s = -s; |
621 | |
622 | if (stripe >= d->nr_stripes) |
623 | return; |
624 | |
625 | sectors_dirty = atomic_add_return(i: s, |
626 | v: d->stripe_sectors_dirty + stripe); |
627 | if (sectors_dirty == d->stripe_size) { |
628 | if (!test_bit(stripe, d->full_dirty_stripes)) |
629 | set_bit(nr: stripe, addr: d->full_dirty_stripes); |
630 | } else { |
631 | if (test_bit(stripe, d->full_dirty_stripes)) |
632 | clear_bit(nr: stripe, addr: d->full_dirty_stripes); |
633 | } |
634 | |
635 | nr_sectors -= s; |
636 | stripe_offset = 0; |
637 | stripe++; |
638 | } |
639 | } |
640 | |
641 | static bool dirty_pred(struct keybuf *buf, struct bkey *k) |
642 | { |
643 | struct cached_dev *dc = container_of(buf, |
644 | struct cached_dev, |
645 | writeback_keys); |
646 | |
647 | BUG_ON(KEY_INODE(k) != dc->disk.id); |
648 | |
649 | return KEY_DIRTY(k); |
650 | } |
651 | |
652 | static void refill_full_stripes(struct cached_dev *dc) |
653 | { |
654 | struct keybuf *buf = &dc->writeback_keys; |
655 | unsigned int start_stripe, next_stripe; |
656 | int stripe; |
657 | bool wrapped = false; |
658 | |
659 | stripe = offset_to_stripe(d: &dc->disk, offset: KEY_OFFSET(k: &buf->last_scanned)); |
660 | if (stripe < 0) |
661 | stripe = 0; |
662 | |
663 | start_stripe = stripe; |
664 | |
665 | while (1) { |
666 | stripe = find_next_bit(addr: dc->disk.full_dirty_stripes, |
667 | size: dc->disk.nr_stripes, offset: stripe); |
668 | |
669 | if (stripe == dc->disk.nr_stripes) |
670 | goto next; |
671 | |
672 | next_stripe = find_next_zero_bit(addr: dc->disk.full_dirty_stripes, |
673 | size: dc->disk.nr_stripes, offset: stripe); |
674 | |
675 | buf->last_scanned = KEY(dc->disk.id, |
676 | stripe * dc->disk.stripe_size, 0); |
677 | |
678 | bch_refill_keybuf(c: dc->disk.c, buf, |
679 | end: &KEY(dc->disk.id, |
680 | next_stripe * dc->disk.stripe_size, 0), |
681 | pred: dirty_pred); |
682 | |
683 | if (array_freelist_empty(&buf->freelist)) |
684 | return; |
685 | |
686 | stripe = next_stripe; |
687 | next: |
688 | if (wrapped && stripe > start_stripe) |
689 | return; |
690 | |
691 | if (stripe == dc->disk.nr_stripes) { |
692 | stripe = 0; |
693 | wrapped = true; |
694 | } |
695 | } |
696 | } |
697 | |
698 | /* |
699 | * Returns true if we scanned the entire disk |
700 | */ |
701 | static bool refill_dirty(struct cached_dev *dc) |
702 | { |
703 | struct keybuf *buf = &dc->writeback_keys; |
704 | struct bkey start = KEY(dc->disk.id, 0, 0); |
705 | struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); |
706 | struct bkey start_pos; |
707 | |
708 | /* |
709 | * make sure keybuf pos is inside the range for this disk - at bringup |
710 | * we might not be attached yet so this disk's inode nr isn't |
711 | * initialized then |
712 | */ |
713 | if (bkey_cmp(l: &buf->last_scanned, r: &start) < 0 || |
714 | bkey_cmp(l: &buf->last_scanned, r: &end) > 0) |
715 | buf->last_scanned = start; |
716 | |
717 | if (dc->partial_stripes_expensive) { |
718 | refill_full_stripes(dc); |
719 | if (array_freelist_empty(&buf->freelist)) |
720 | return false; |
721 | } |
722 | |
723 | start_pos = buf->last_scanned; |
724 | bch_refill_keybuf(c: dc->disk.c, buf, end: &end, pred: dirty_pred); |
725 | |
726 | if (bkey_cmp(l: &buf->last_scanned, r: &end) < 0) |
727 | return false; |
728 | |
729 | /* |
730 | * If we get to the end start scanning again from the beginning, and |
731 | * only scan up to where we initially started scanning from: |
732 | */ |
733 | buf->last_scanned = start; |
734 | bch_refill_keybuf(c: dc->disk.c, buf, end: &start_pos, pred: dirty_pred); |
735 | |
736 | return bkey_cmp(l: &buf->last_scanned, r: &start_pos) >= 0; |
737 | } |
738 | |
739 | static int bch_writeback_thread(void *arg) |
740 | { |
741 | struct cached_dev *dc = arg; |
742 | struct cache_set *c = dc->disk.c; |
743 | bool searched_full_index; |
744 | |
745 | bch_ratelimit_reset(d: &dc->writeback_rate); |
746 | |
747 | while (!kthread_should_stop() && |
748 | !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { |
749 | down_write(sem: &dc->writeback_lock); |
750 | set_current_state(TASK_INTERRUPTIBLE); |
		/*
		 * If the bcache device is detaching, skip here and continue
		 * to perform writeback. Otherwise, if there is no dirty data
		 * on the cache, or there is dirty data but writeback is
		 * disabled, the writeback thread should sleep here and wait
		 * for others to wake it up.
		 */
758 | if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && |
759 | (!atomic_read(v: &dc->has_dirty) || !dc->writeback_running)) { |
760 | up_write(sem: &dc->writeback_lock); |
761 | |
762 | if (kthread_should_stop() || |
763 | test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { |
764 | set_current_state(TASK_RUNNING); |
765 | break; |
766 | } |
767 | |
768 | schedule(); |
769 | continue; |
770 | } |
771 | set_current_state(TASK_RUNNING); |
772 | |
773 | searched_full_index = refill_dirty(dc); |
774 | |
775 | if (searched_full_index && |
776 | RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { |
777 | atomic_set(v: &dc->has_dirty, i: 0); |
778 | SET_BDEV_STATE(k: &dc->sb, BDEV_STATE_CLEAN); |
779 | bch_write_bdev_super(dc, NULL); |
780 | /* |
781 | * If bcache device is detaching via sysfs interface, |
782 | * writeback thread should stop after there is no dirty |
783 | * data on cache. BCACHE_DEV_DETACHING flag is set in |
784 | * bch_cached_dev_detach(). |
785 | */ |
786 | if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) { |
787 | struct closure cl; |
788 | |
789 | closure_init_stack(cl: &cl); |
790 | memset(&dc->sb.set_uuid, 0, 16); |
791 | SET_BDEV_STATE(k: &dc->sb, BDEV_STATE_NONE); |
792 | |
793 | bch_write_bdev_super(dc, parent: &cl); |
794 | closure_sync(cl: &cl); |
795 | |
796 | up_write(sem: &dc->writeback_lock); |
797 | break; |
798 | } |
799 | |
			/*
			 * When the dirty data rate is high (e.g. 50%+), there
			 * might be heavy bucket fragmentation after writeback
			 * finishes, which hurts the following write
			 * performance. If users really care about write
			 * performance they may set BCH_ENABLE_AUTO_GC via
			 * sysfs; then when BCH_DO_AUTO_GC is set, the garbage
			 * collection thread will be woken up here. After the
			 * moving gc, the shrunk btree and the discarded free
			 * buckets' SSD space may help the following write
			 * requests.
			 */
811 | if (c->gc_after_writeback == |
812 | (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { |
813 | c->gc_after_writeback &= ~BCH_DO_AUTO_GC; |
814 | force_wake_up_gc(c); |
815 | } |
816 | } |
817 | |
818 | up_write(sem: &dc->writeback_lock); |
819 | |
820 | read_dirty(dc); |
821 | |
822 | if (searched_full_index) { |
823 | unsigned int delay = dc->writeback_delay * HZ; |
824 | |
825 | while (delay && |
826 | !kthread_should_stop() && |
827 | !test_bit(CACHE_SET_IO_DISABLE, &c->flags) && |
828 | !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) |
829 | delay = schedule_timeout_interruptible(timeout: delay); |
830 | |
831 | bch_ratelimit_reset(d: &dc->writeback_rate); |
832 | } |
833 | } |
834 | |
835 | if (dc->writeback_write_wq) |
836 | destroy_workqueue(wq: dc->writeback_write_wq); |
837 | |
838 | cached_dev_put(dc); |
839 | wait_for_kthread_stop(); |
840 | |
841 | return 0; |
842 | } |
843 | |
844 | /* Init */ |
845 | #define INIT_KEYS_EACH_TIME 500000 |
846 | |
847 | struct sectors_dirty_init { |
848 | struct btree_op op; |
849 | unsigned int inode; |
850 | size_t count; |
851 | }; |
852 | |
853 | static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, |
854 | struct bkey *k) |
855 | { |
856 | struct sectors_dirty_init *op = container_of(_op, |
857 | struct sectors_dirty_init, op); |
858 | if (KEY_INODE(k) > op->inode) |
859 | return MAP_DONE; |
860 | |
861 | if (KEY_DIRTY(k)) |
862 | bcache_dev_sectors_dirty_add(c: b->c, inode: KEY_INODE(k), |
863 | KEY_START(k), nr_sectors: KEY_SIZE(k)); |
864 | |
865 | op->count++; |
866 | if (!(op->count % INIT_KEYS_EACH_TIME)) |
867 | cond_resched(); |
868 | |
869 | return MAP_CONTINUE; |
870 | } |
871 | |
872 | static int bch_root_node_dirty_init(struct cache_set *c, |
873 | struct bcache_device *d, |
874 | struct bkey *k) |
875 | { |
876 | struct sectors_dirty_init op; |
877 | int ret; |
878 | |
879 | bch_btree_op_init(op: &op.op, write_lock_level: -1); |
880 | op.inode = d->id; |
881 | op.count = 0; |
882 | |
883 | ret = bcache_btree(map_keys_recurse, |
884 | k, |
885 | c->root, |
886 | &op.op, |
887 | &KEY(op.inode, 0, 0), |
888 | sectors_dirty_init_fn, |
889 | 0); |
890 | if (ret < 0) |
891 | pr_warn("sectors dirty init failed, ret=%d!\n" , ret); |
892 | |
	/*
	 * The op may be added to cache_set's btree_cache_wait
	 * in mca_cannibalize(); we must ensure it is removed from
	 * the list and that btree_cache_alloc_lock is released
	 * before freeing the op memory.
	 * Otherwise, btree_cache_wait will be corrupted.
	 */
900 | bch_cannibalize_unlock(c); |
901 | finish_wait(wq_head: &c->btree_cache_wait, wq_entry: &(&op.op)->wait); |
902 | |
903 | return ret; |
904 | } |
905 | |
906 | static int bch_dirty_init_thread(void *arg) |
907 | { |
908 | struct dirty_init_thrd_info *info = arg; |
909 | struct bch_dirty_init_state *state = info->state; |
910 | struct cache_set *c = state->c; |
911 | struct btree_iter iter; |
912 | struct bkey *k, *p; |
913 | int cur_idx, prev_idx, skip_nr; |
914 | |
915 | k = p = NULL; |
916 | prev_idx = 0; |
917 | |
918 | bch_btree_iter_init(b: &c->root->keys, iter: &iter, NULL); |
919 | k = bch_btree_iter_next_filter(iter: &iter, b: &c->root->keys, fn: bch_ptr_bad); |
920 | BUG_ON(!k); |
921 | |
922 | p = k; |
923 | |
924 | while (k) { |
925 | spin_lock(lock: &state->idx_lock); |
926 | cur_idx = state->key_idx; |
927 | state->key_idx++; |
928 | spin_unlock(lock: &state->idx_lock); |
929 | |
930 | skip_nr = cur_idx - prev_idx; |
931 | |
932 | while (skip_nr) { |
933 | k = bch_btree_iter_next_filter(iter: &iter, |
934 | b: &c->root->keys, |
935 | fn: bch_ptr_bad); |
936 | if (k) |
937 | p = k; |
938 | else { |
939 | atomic_set(v: &state->enough, i: 1); |
940 | /* Update state->enough earlier */ |
941 | smp_mb__after_atomic(); |
942 | goto out; |
943 | } |
944 | skip_nr--; |
945 | } |
946 | |
947 | if (p) { |
948 | if (bch_root_node_dirty_init(c, d: state->d, k: p) < 0) |
949 | goto out; |
950 | } |
951 | |
952 | p = NULL; |
953 | prev_idx = cur_idx; |
954 | } |
955 | |
956 | out: |
957 | /* In order to wake up state->wait in time */ |
958 | smp_mb__before_atomic(); |
959 | if (atomic_dec_and_test(v: &state->started)) |
960 | wake_up(&state->wait); |
961 | |
962 | return 0; |
963 | } |
964 | |
965 | static int bch_btre_dirty_init_thread_nr(void) |
966 | { |
967 | int n = num_online_cpus()/2; |
968 | |
969 | if (n == 0) |
970 | n = 1; |
971 | else if (n > BCH_DIRTY_INIT_THRD_MAX) |
972 | n = BCH_DIRTY_INIT_THRD_MAX; |
973 | |
974 | return n; |
975 | } |
976 | |
977 | void bch_sectors_dirty_init(struct bcache_device *d) |
978 | { |
979 | int i; |
980 | struct btree *b = NULL; |
981 | struct bkey *k = NULL; |
982 | struct btree_iter iter; |
983 | struct sectors_dirty_init op; |
984 | struct cache_set *c = d->c; |
985 | struct bch_dirty_init_state state; |
986 | |
987 | retry_lock: |
988 | b = c->root; |
989 | rw_lock(w: 0, b, level: b->level); |
990 | if (b != c->root) { |
991 | rw_unlock(w: 0, b); |
992 | goto retry_lock; |
993 | } |
994 | |
995 | /* Just count root keys if no leaf node */ |
996 | if (c->root->level == 0) { |
997 | bch_btree_op_init(op: &op.op, write_lock_level: -1); |
998 | op.inode = d->id; |
999 | op.count = 0; |
1000 | |
1001 | for_each_key_filter(&c->root->keys, |
1002 | k, &iter, bch_ptr_invalid) { |
1003 | if (KEY_INODE(k) != op.inode) |
1004 | continue; |
1005 | sectors_dirty_init_fn(op: &op.op, b: c->root, k); |
1006 | } |
1007 | |
1008 | rw_unlock(w: 0, b); |
1009 | return; |
1010 | } |
1011 | |
1012 | memset(&state, 0, sizeof(struct bch_dirty_init_state)); |
1013 | state.c = c; |
1014 | state.d = d; |
1015 | state.total_threads = bch_btre_dirty_init_thread_nr(); |
1016 | state.key_idx = 0; |
1017 | spin_lock_init(&state.idx_lock); |
1018 | atomic_set(v: &state.started, i: 0); |
1019 | atomic_set(v: &state.enough, i: 0); |
1020 | init_waitqueue_head(&state.wait); |
1021 | |
1022 | for (i = 0; i < state.total_threads; i++) { |
1023 | /* Fetch latest state.enough earlier */ |
1024 | smp_mb__before_atomic(); |
1025 | if (atomic_read(v: &state.enough)) |
1026 | break; |
1027 | |
1028 | atomic_inc(v: &state.started); |
1029 | state.infos[i].state = &state; |
1030 | state.infos[i].thread = |
1031 | kthread_run(bch_dirty_init_thread, &state.infos[i], |
1032 | "bch_dirtcnt[%d]" , i); |
1033 | if (IS_ERR(ptr: state.infos[i].thread)) { |
1034 | pr_err("fails to run thread bch_dirty_init[%d]\n" , i); |
1035 | atomic_dec(v: &state.started); |
1036 | for (--i; i >= 0; i--) |
1037 | kthread_stop(k: state.infos[i].thread); |
1038 | goto out; |
1039 | } |
1040 | } |
1041 | |
1042 | out: |
1043 | /* Must wait for all threads to stop. */ |
1044 | wait_event(state.wait, atomic_read(&state.started) == 0); |
1045 | rw_unlock(w: 0, b); |
1046 | } |
1047 | |
1048 | void bch_cached_dev_writeback_init(struct cached_dev *dc) |
1049 | { |
1050 | sema_init(sem: &dc->in_flight, val: 64); |
1051 | init_rwsem(&dc->writeback_lock); |
1052 | bch_keybuf_init(buf: &dc->writeback_keys); |
1053 | |
1054 | dc->writeback_metadata = true; |
1055 | dc->writeback_running = false; |
1056 | dc->writeback_consider_fragment = true; |
1057 | dc->writeback_percent = 10; |
1058 | dc->writeback_delay = 30; |
1059 | atomic_long_set(v: &dc->writeback_rate.rate, i: 1024); |
1060 | dc->writeback_rate_minimum = 8; |
1061 | |
1062 | dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; |
1063 | dc->writeback_rate_p_term_inverse = 40; |
1064 | dc->writeback_rate_fp_term_low = 1; |
1065 | dc->writeback_rate_fp_term_mid = 10; |
1066 | dc->writeback_rate_fp_term_high = 1000; |
1067 | dc->writeback_rate_i_term_inverse = 10000; |
1068 | |
1069 | /* For dc->writeback_lock contention in update_writeback_rate() */ |
1070 | dc->rate_update_retry = 0; |
1071 | |
1072 | WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); |
1073 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); |
1074 | } |
1075 | |
1076 | int bch_cached_dev_writeback_start(struct cached_dev *dc) |
1077 | { |
1078 | dc->writeback_write_wq = alloc_workqueue(fmt: "bcache_writeback_wq" , |
1079 | flags: WQ_MEM_RECLAIM, max_active: 0); |
1080 | if (!dc->writeback_write_wq) |
1081 | return -ENOMEM; |
1082 | |
1083 | cached_dev_get(dc); |
1084 | dc->writeback_thread = kthread_create(bch_writeback_thread, dc, |
1085 | "bcache_writeback" ); |
1086 | if (IS_ERR(ptr: dc->writeback_thread)) { |
1087 | cached_dev_put(dc); |
1088 | destroy_workqueue(wq: dc->writeback_write_wq); |
1089 | return PTR_ERR(ptr: dc->writeback_thread); |
1090 | } |
1091 | dc->writeback_running = true; |
1092 | |
1093 | WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); |
1094 | schedule_delayed_work(dwork: &dc->writeback_rate_update, |
1095 | delay: dc->writeback_rate_update_seconds * HZ); |
1096 | |
1097 | bch_writeback_queue(dc); |
1098 | |
1099 | return 0; |
1100 | } |
1101 | |