btree.c source code [linux/drivers/md/bcache/btree.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
4	*
5	* Uses a block device as cache for other block devices; optimized for SSDs.
6	* All allocation is done in buckets, which should match the erase block size
7	* of the device.
8	*
9	* Buckets containing cached data are kept on a heap sorted by priority;
10	* bucket priority is increased on cache hit, and periodically all the buckets
11	* on the heap have their priority scaled down. This currently is just used as
12	* an LRU but in the future should allow for more intelligent heuristics.
13	*
14	* Buckets have an 8 bit counter; freeing is accomplished by incrementing the
15	* counter. Garbage collection is used to remove stale pointers.
16	*
17	* Indexing is done via a btree; nodes are not necessarily fully sorted, rather
18	* as keys are inserted we only sort the pages that have not yet been written.
19	* When garbage collection is run, we resort the entire node.
20	*
21	* All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst.
22	*/
23
24	#include "bcache.h"
25	#include "btree.h"
26	#include "debug.h"
27	#include "extents.h"
28
29	#include <linux/slab.h>
30	#include <linux/bitops.h>
31	#include <linux/hash.h>
32	#include <linux/kthread.h>
33	#include <linux/prefetch.h>
34	#include <linux/random.h>
35	#include <linux/rcupdate.h>
36	#include <linux/sched/clock.h>
37	#include <linux/rculist.h>
38	#include <linux/delay.h>
39	#include <trace/events/bcache.h>
40
41	/*
42	* Todo:
43	* register_bcache: Return errors out to userspace correctly
44	*
45	* Writeback: don't undirty key until after a cache flush
46	*
47	* Create an iterator for key pointers
48	*
49	* On btree write error, mark bucket such that it won't be freed from the cache
50	*
51	* Journalling:
52	* Check for bad keys in replay
53	* Propagate barriers
54	* Refcount journal entries in journal_replay
55	*
56	* Garbage collection:
57	* Finish incremental gc
58	* Gc should free old UUIDs, data for invalid UUIDs
59	*
60	* Provide a way to list backing device UUIDs we have data cached for, and
61	* probably how long it's been since we've seen them, and a way to invalidate
62	* dirty data for devices that will never be attached again
63	*
64	* Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
65	* that based on that and how much dirty data we have we can keep writeback
66	* from being starved
67	*
68	* Add a tracepoint or somesuch to watch for writeback starvation
69	*
70	* When btree depth > 1 and splitting an interior node, we have to make sure
71	* alloc_bucket() cannot fail. This should be true but is not completely
72	* obvious.
73	*
74	* Plugging?
75	*
76	* If data write is less than hard sector size of ssd, round up offset in open
77	* bucket to the next whole sector
78	*
79	* Superblock needs to be fleshed out for multiple cache devices
80	*
81	* Add a sysfs tunable for the number of writeback IOs in flight
82	*
83	* Add a sysfs tunable for the number of open data buckets
84	*
85	* IO tracking: Can we track when one process is doing io on behalf of another?
86	* IO tracking: Don't use just an average, weigh more recent stuff higher
87	*
88	* Test module load/unload
89	*/
90
/* Tunables used by garbage collection and priority handling in this file. */
#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72
#define MAX_GC_TIMES		100
#define MIN_GC_NODES		100
#define GC_SLEEP_MS		100

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

/*
 * Hash input for the in-memory btree node cache: the first pointer of @k
 * shifted right by the cache set's bucket_bits, OR-ed with that pointer's
 * generation. Arguments are parenthesised so that any expression may be
 * passed for @c.
 */
#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> (c)->bucket_bits) | PTR_GEN(k, 0))

/* Workqueue on which delayed btree node writes (b->work) are queued. */
static struct workqueue_struct *btree_io_wq;

/* True when node @b is at or below the level that op @s locks for insert. */
#define insert_lock(s, b)	((b)->level <= (s)->lock)
105
106
107	static inline struct bset write_block(struct* btree *b)
108	{
109	return ((void ) btree_bset_first(b)) + b->written block_bytes(b->c->cache);
110	}
111
112	static void bch_btree_init_next(struct btree *b)
113	{
114	/ If not a leaf node, always sort /
115	if (b->level && b->keys.nsets)
116	bch_btree_sort(b: &b->keys, state: &b->c->sort);
117	else
118	bch_btree_sort_lazy(b: &b->keys, state: &b->c->sort);
119
120	if (b->written < btree_blocks(b))
121	bch_bset_init_next(b: &b->keys, i: write_block(b),
122	magic: bset_magic(sb: &b->c->cache->sb));
123
124	}
125
126	/ Btree key manipulation /
127
128	void bkey_put(struct cache_set c, struct* bkey *k)
129	{
130	unsigned int i;
131
132	for (i = `0`; i < KEY_PTRS(k); i++)
133	if (ptr_available(c, k, i))
134	atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
135	}
136
137	/ Btree IO /
138
139	static uint64_t btree_csum_set(struct btree b, struct* bset *i)
140	{
141	uint64_t crc = b->key.ptr[`0`];
142	void data = (void* ) i + `8`, end = bset_bkey_last(i);
143
144	crc = crc64_be(crc, p: data, len: end - data);
145	return crc ^ `0xffffffffffffffffULL`;
146	}
147
148	void bch_btree_node_read_done(struct btree *b)
149	{
150	const char *err = "bad btree header";
151	struct bset *i = btree_bset_first(b);
152	struct btree_iter *iter;
153
154	/*
155	* c->fill_iter can allocate an iterator with more memory space
156	* than static MAX_BSETS.
157	* See the comment arount cache_set->fill_iter.
158	*/
159	iter = mempool_alloc(pool: &b->c->fill_iter, GFP_NOIO);
160	iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
161	iter->used = `0`;
162
163	#ifdef CONFIG_BCACHE_DEBUG
164	iter->b = &b->keys;
165	#endif
166
167	if (!i->seq)
168	goto err;
169
170	for (;
171	b->written < btree_blocks(b) && i->seq == b->keys.set[`0`].data->seq;
172	i = write_block(b)) {
173	err = "unsupported bset version";
174	if (i->version > BCACHE_BSET_VERSION)
175	goto err;
176
177	err = "bad btree header";
178	if (b->written + set_blocks(i, block_bytes(b->c->cache)) >
179	btree_blocks(b))
180	goto err;
181
182	err = "bad magic";
183	if (i->magic != bset_magic(sb: &b->c->cache->sb))
184	goto err;
185
186	err = "bad checksum";
187	switch (i->version) {
188	case `0`:
189	if (i->csum != csum_set(i))
190	goto err;
191	break;
192	case BCACHE_BSET_VERSION:
193	if (i->csum != btree_csum_set(b, i))
194	goto err;
195	break;
196	}
197
198	err = "empty set";
199	if (i != b->keys.set[`0`].data && !i->keys)
200	goto err;
201
202	bch_btree_iter_push(iter, k: i->start, bset_bkey_last(i));
203
204	b->written += set_blocks(i, block_bytes(b->c->cache));
205	}
206
207	err = "corrupted btree";
208	for (i = write_block(b);
209	bset_sector_offset(b: &b->keys, i) < KEY_SIZE(k: &b->key);
210	i = ((void *) i) + block_bytes(b->c->cache))
211	if (i->seq == b->keys.set[`0`].data->seq)
212	goto err;
213
214	bch_btree_sort_and_fix_extents(b: &b->keys, iter, state: &b->c->sort);
215
216	i = b->keys.set[`0`].data;
217	err = "short btree key";
218	if (b->keys.set[`0`].size &&
219	bkey_cmp(l: &b->key, r: &b->keys.set[`0`].end) < `0`)
220	goto err;
221
222	if (b->written < btree_blocks(b))
223	bch_bset_init_next(b: &b->keys, i: write_block(b),
224	magic: bset_magic(sb: &b->c->cache->sb));
225	out:
226	mempool_free(element: iter, pool: &b->c->fill_iter);
227	return;
228	err:
229	set_btree_node_io_error(b);
230	bch_cache_set_error(c: b->c, fmt: "%s at bucket %zu, block %u, %u keys",
231	err, PTR_BUCKET_NR(c: b->c, k: &b->key, ptr: `0`),
232	bset_block_offset(b, i), i->keys);
233	goto out;
234	}
235
236	static void btree_node_read_endio(struct bio *bio)
237	{
238	struct closure *cl = bio->bi_private;
239
240	closure_put(cl);
241	}
242
243	static void bch_btree_node_read(struct btree *b)
244	{
245	uint64_t start_time = local_clock();
246	struct closure cl;
247	struct bio *bio;
248
249	trace_bcache_btree_read(b);
250
251	closure_init_stack(cl: &cl);
252
253	bio = bch_bbio_alloc(c: b->c);
254	bio->bi_iter.bi_size = KEY_SIZE(k: &b->key) << `9`;
255	bio->bi_end_io = btree_node_read_endio;
256	bio->bi_private = &cl;
257	bio->bi_opf = REQ_OP_READ \| REQ_META;
258
259	bch_bio_map(bio, base: b->keys.set[`0`].data);
260
261	bch_submit_bbio(bio, c: b->c, k: &b->key, ptr: `0`);
262	closure_sync(cl: &cl);
263
264	if (bio->bi_status)
265	set_btree_node_io_error(b);
266
267	bch_bbio_free(bio, c: b->c);
268
269	if (btree_node_io_error(b))
270	goto err;
271
272	bch_btree_node_read_done(b);
273	bch_time_stats_update(stats: &b->c->btree_read_time, time: start_time);
274
275	return;
276	err:
277	bch_cache_set_error(c: b->c, fmt: "io error reading bucket %zu",
278	PTR_BUCKET_NR(c: b->c, k: &b->key, ptr: `0`));
279	}
280
281	static void btree_complete_write(struct btree b, struct* btree_write *w)
282	{
283	if (w->prio_blocked &&
284	!atomic_sub_return(i: w->prio_blocked, v: &b->c->prio_blocked))
285	wake_up_allocators(c: b->c);
286
287	if (w->journal) {
288	atomic_dec_bug(w->journal);
289	__closure_wake_up(list: &b->c->journal.wait);
290	}
291
292	w->prio_blocked = `0`;
293	w->journal = NULL;
294	}
295
296	static CLOSURE_CALLBACK(btree_node_write_unlock)
297	{
298	closure_type(b, struct btree, io);
299
300	up(sem: &b->io_mutex);
301	}
302
303	static CLOSURE_CALLBACK(__btree_node_write_done)
304	{
305	closure_type(b, struct btree, io);
306	struct btree_write *w = btree_prev_write(b);
307
308	bch_bbio_free(bio: b->bio, c: b->c);
309	b->bio = NULL;
310	btree_complete_write(b, w);
311
312	if (btree_node_dirty(b))
313	queue_delayed_work(wq: btree_io_wq, dwork: &b->work, delay: `30` * HZ);
314
315	closure_return_with_destructor(cl, btree_node_write_unlock);
316	}
317
318	static CLOSURE_CALLBACK(btree_node_write_done)
319	{
320	closure_type(b, struct btree, io);
321
322	bio_free_pages(bio: b->bio);
323	__btree_node_write_done(ws: &cl->work);
324	}
325
326	static void btree_node_write_endio(struct bio *bio)
327	{
328	struct closure *cl = bio->bi_private;
329	struct btree b = container_of(cl, struct* btree, io);
330
331	if (bio->bi_status)
332	set_btree_node_io_error(b);
333
334	bch_bbio_count_io_errors(c: b->c, bio, error: bio->bi_status, m: "writing btree");
335	closure_put(cl);
336	}
337
338	static void do_btree_node_write(struct btree *b)
339	{
340	struct closure *cl = &b->io;
341	struct bset *i = btree_bset_last(b);
342	BKEY_PADDED(key) k;
343
344	i->version = BCACHE_BSET_VERSION;
345	i->csum = btree_csum_set(b, i);
346
347	BUG_ON(b->bio);
348	b->bio = bch_bbio_alloc(c: b->c);
349
350	b->bio->bi_end_io = btree_node_write_endio;
351	b->bio->bi_private = cl;
352	b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c->cache));
353	b->bio->bi_opf = REQ_OP_WRITE \| REQ_META \| REQ_FUA;
354	bch_bio_map(bio: b->bio, base: i);
355
356	/*
357	* If we're appending to a leaf node, we don't technically need FUA -
358	* this write just needs to be persisted before the next journal write,
359	* which will be marked FLUSH\|FUA.
360	*
361	* Similarly if we're writing a new btree root - the pointer is going to
362	* be in the next journal entry.
363	*
364	* But if we're writing a new btree node (that isn't a root) or
365	* appending to a non leaf btree node, we need either FUA or a flush
366	* when we write the parent with the new pointer. FUA is cheaper than a
367	* flush, and writes appending to leaf nodes aren't blocking anything so
368	* just make all btree node writes FUA to keep things sane.
369	*/
370
371	bkey_copy(&k.key, &b->key);
372	SET_PTR_OFFSET(k: &k.key, i: `0`, v: PTR_OFFSET(k: &k.key, i: `0`) +
373	bset_sector_offset(b: &b->keys, i));
374
375	if (!bch_bio_alloc_pages(bio: b->bio, __GFP_NOWARN\|GFP_NOWAIT)) {
376	struct bio_vec *bv;
377	void addr = (void* ) ((unsigned* long) i & ~(PAGE_SIZE - `1`));
378	struct bvec_iter_all iter_all;
379
380	bio_for_each_segment_all(bv, b->bio, iter_all) {
381	memcpy(page_address(bv->bv_page), addr, PAGE_SIZE);
382	addr += PAGE_SIZE;
383	}
384
385	bch_submit_bbio(bio: b->bio, c: b->c, k: &k.key, ptr: `0`);
386
387	continue_at(cl, btree_node_write_done, NULL);
388	} else {
389	/*
390	* No problem for multipage bvec since the bio is
391	* just allocated
392	*/
393	b->bio->bi_vcnt = `0`;
394	bch_bio_map(bio: b->bio, base: i);
395
396	bch_submit_bbio(bio: b->bio, c: b->c, k: &k.key, ptr: `0`);
397
398	closure_sync(cl);
399	continue_at_nobarrier(cl, __btree_node_write_done, NULL);
400	}
401	}
402
403	void __bch_btree_node_write(struct btree b, struct* closure *parent)
404	{
405	struct bset *i = btree_bset_last(b);
406
407	lockdep_assert_held(&b->write_lock);
408
409	trace_bcache_btree_write(b);
410
411	BUG_ON(current->bio_list);
412	BUG_ON(b->written >= btree_blocks(b));
413	BUG_ON(b->written && !i->keys);
414	BUG_ON(btree_bset_first(b)->seq != i->seq);
415	bch_check_keys(&b->keys, "writing");
416
417	cancel_delayed_work(dwork: &b->work);
418
419	/ If caller isn't waiting for write, parent refcount is cache set /
420	down(sem: &b->io_mutex);
421	closure_init(cl: &b->io, parent: parent ?: &b->c->cl);
422
423	clear_bit(nr: BTREE_NODE_dirty, addr: &b->flags);
424	change_bit(nr: BTREE_NODE_write_idx, addr: &b->flags);
425
426	do_btree_node_write(b);
427
428	atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size,
429	v: &b->c->cache->btree_sectors_written);
430
431	b->written += set_blocks(i, block_bytes(b->c->cache));
432	}
433
434	void bch_btree_node_write(struct btree b, struct* closure *parent)
435	{
436	unsigned int nsets = b->keys.nsets;
437
438	lockdep_assert_held(&b->lock);
439
440	__bch_btree_node_write(b, parent);
441
442	/*
443	* do verify if there was more than one set initially (i.e. we did a
444	* sort) and we sorted down to a single set:
445	*/
446	if (nsets && !b->keys.nsets)
447	bch_btree_verify(b);
448
449	bch_btree_init_next(b);
450	}
451
452	static void bch_btree_node_write_sync(struct btree *b)
453	{
454	struct closure cl;
455
456	closure_init_stack(cl: &cl);
457
458	mutex_lock(&b->write_lock);
459	bch_btree_node_write(b, parent: &cl);
460	mutex_unlock(lock: &b->write_lock);
461
462	closure_sync(cl: &cl);
463	}
464
465	static void btree_node_write_work(struct work_struct *w)
466	{
467	struct btree b = container_of(to_delayed_work(w), struct* btree, work);
468
469	mutex_lock(&b->write_lock);
470	if (btree_node_dirty(b))
471	__bch_btree_node_write(b, NULL);
472	mutex_unlock(lock: &b->write_lock);
473	}
474
475	static void bch_btree_leaf_dirty(struct btree b, atomic_t journal_ref)
476	{
477	struct bset *i = btree_bset_last(b);
478	struct btree_write *w = btree_current_write(b);
479
480	lockdep_assert_held(&b->write_lock);
481
482	BUG_ON(!b->written);
483	BUG_ON(!i->keys);
484
485	if (!btree_node_dirty(b))
486	queue_delayed_work(wq: btree_io_wq, dwork: &b->work, delay: `30` * HZ);
487
488	set_btree_node_dirty(b);
489
490	/*
491	* w->journal is always the oldest journal pin of all bkeys
492	* in the leaf node, to make sure the oldest jset seq won't
493	* be increased before this btree node is flushed.
494	*/
495	if (journal_ref) {
496	if (w->journal &&
497	journal_pin_cmp(b->c, w->journal, journal_ref)) {
498	atomic_dec_bug(w->journal);
499	w->journal = NULL;
500	}
501
502	if (!w->journal) {
503	w->journal = journal_ref;
504	atomic_inc(v: w->journal);
505	}
506	}
507
508	/ Force write if set is too big /
509	if (set_bytes(i) > PAGE_SIZE - `48` &&
510	!current->bio_list)
511	bch_btree_node_write(b, NULL);
512	}
513
/*
 * Btree in memory cache - allocation/freeing
 * mca -> memory cache
 */

/*
 * Number of cached btree nodes that must never be freed: 8 per tree level
 * (the root's level, or 1 when there is no valid root or the root is at
 * level 0) plus 16.
 */
#define mca_reserve(c)	(((!IS_ERR_OR_NULL((c)->root) && (c)->root->level) \
			  ? (c)->root->level : 1) * 8 + 16)
/* Cached nodes above the reserve, clamped at zero. */
#define mca_can_free(c)						\
	max_t(int, 0, (c)->btree_cache_used - mca_reserve(c))
523
524	static void mca_data_free(struct btree *b)
525	{
526	BUG_ON(b->io_mutex.count != `1`);
527
528	bch_btree_keys_free(b: &b->keys);
529
530	b->c->btree_cache_used--;
531	list_move(list: &b->list, head: &b->c->btree_cache_freed);
532	}
533
534	static void mca_bucket_free(struct btree *b)
535	{
536	BUG_ON(btree_node_dirty(b));
537
538	b->key.ptr[`0`] = `0`;
539	hlist_del_init_rcu(n: &b->hash);
540	list_move(list: &b->list, head: &b->c->btree_cache_freeable);
541	}
542
543	static unsigned int btree_order(struct bkey *k)
544	{
545	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: `1`);
546	}
547
548	static void mca_data_alloc(struct btree b, struct* bkey *k, gfp_t gfp)
549	{
550	if (!bch_btree_keys_alloc(b: &b->keys,
551	max_t(unsigned int,
552	ilog2(b->c->btree_pages),
553	btree_order(k)),
554	gfp)) {
555	b->c->btree_cache_used++;
556	list_move(list: &b->list, head: &b->c->btree_cache);
557	} else {
558	list_move(list: &b->list, head: &b->c->btree_cache_freed);
559	}
560	}
561
/*
 * Three-way integer comparison: -1, 0 or 1 for l < r, l == r, l > r.
 * Arguments are parenthesised so operands containing lower-precedence
 * operators compare correctly; each argument is evaluated twice.
 */
#define cmp_int(l, r)		(((l) > (r)) - ((l) < (r)))
563
#ifdef CONFIG_PROVE_LOCKING
/*
 * Lockdep ordering callback for btree node locks: a node at a higher level
 * compares lower than one at a lower level; within a level, nodes are
 * ordered by key.
 */
static int btree_lock_cmp_fn(const struct lockdep_map *_a,
			     const struct lockdep_map *_b)
{
	const struct btree *a = container_of(_a, struct btree, lock.dep_map);
	const struct btree *b = container_of(_b, struct btree, lock.dep_map);

	return -cmp_int(a->level, b->level) ?: bkey_cmp(&a->key, &b->key);
}

/* Lockdep print callback: appends the node's level and key inode:offset. */
static void btree_lock_print_fn(const struct lockdep_map *map)
{
	const struct btree *b = container_of(map, struct btree, lock.dep_map);

	printk(KERN_CONT " l=%u %llu:%llu", b->level,
	       KEY_INODE(&b->key), KEY_OFFSET(&b->key));
}
#endif
582
583	static struct btree mca_bucket_alloc(struct* cache_set *c,
584	struct bkey *k, gfp_t gfp)
585	{
586	/*
587	* kzalloc() is necessary here for initialization,
588	* see code comments in bch_btree_keys_init().
589	*/
590	struct btree b = kzalloc(size: sizeof(struct* btree), flags: gfp);
591
592	if (!b)
593	return NULL;
594
595	init_rwsem(&b->lock);
596	lock_set_cmp_fn(&b->lock, btree_lock_cmp_fn, btree_lock_print_fn);
597	mutex_init(&b->write_lock);
598	lockdep_set_novalidate_class(&b->write_lock);
599	INIT_LIST_HEAD(list: &b->list);
600	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
601	b->c = c;
602	sema_init(sem: &b->io_mutex, val: `1`);
603
604	mca_data_alloc(b, k, gfp);
605	return b;
606	}
607
608	static int mca_reap(struct btree b, unsigned* int min_order, bool flush)
609	{
610	struct closure cl;
611
612	closure_init_stack(cl: &cl);
613	lockdep_assert_held(&b->c->bucket_lock);
614
615	if (!down_write_trylock(sem: &b->lock))
616	return -ENOMEM;
617
618	BUG_ON(btree_node_dirty(b) && !b->keys.set[`0`].data);
619
620	if (b->keys.page_order < min_order)
621	goto out_unlock;
622
623	if (!flush) {
624	if (btree_node_dirty(b))
625	goto out_unlock;
626
627	if (down_trylock(sem: &b->io_mutex))
628	goto out_unlock;
629	up(sem: &b->io_mutex);
630	}
631
632	retry:
633	/*
634	* BTREE_NODE_dirty might be cleared in btree_flush_btree() by
635	* __bch_btree_node_write(). To avoid an extra flush, acquire
636	* b->write_lock before checking BTREE_NODE_dirty bit.
637	*/
638	mutex_lock(&b->write_lock);
639	/*
640	* If this btree node is selected in btree_flush_write() by journal
641	* code, delay and retry until the node is flushed by journal code
642	* and BTREE_NODE_journal_flush bit cleared by btree_flush_write().
643	*/
644	if (btree_node_journal_flush(b)) {
645	pr_debug("bnode %p is flushing by journal, retry\n", b);
646	mutex_unlock(lock: &b->write_lock);
647	udelay(`1`);
648	goto retry;
649	}
650
651	if (btree_node_dirty(b))
652	__bch_btree_node_write(b, parent: &cl);
653	mutex_unlock(lock: &b->write_lock);
654
655	closure_sync(cl: &cl);
656
657	/ wait for any in flight btree write /
658	down(sem: &b->io_mutex);
659	up(sem: &b->io_mutex);
660
661	return `0`;
662	out_unlock:
663	rw_unlock(w: true, b);
664	return -ENOMEM;
665	}
666
667	static unsigned long bch_mca_scan(struct shrinker *shrink,
668	struct shrink_control *sc)
669	{
670	struct cache_set *c = shrink->private_data;
671	struct btree b, t;
672	unsigned long i, nr = sc->nr_to_scan;
673	unsigned long freed = `0`;
674	unsigned int btree_cache_used;
675
676	if (c->shrinker_disabled)
677	return SHRINK_STOP;
678
679	if (c->btree_cache_alloc_lock)
680	return SHRINK_STOP;
681
682	/ Return -1 if we can't do anything right now /
683	if (sc->gfp_mask & __GFP_IO)
684	mutex_lock(&c->bucket_lock);
685	else if (!mutex_trylock(lock: &c->bucket_lock))
686	return -`1`;
687
688	/*
689	* It's _really_ critical that we don't free too many btree nodes - we
690	* have to always leave ourselves a reserve. The reserve is how we
691	* guarantee that allocating memory for a new btree node can always
692	* succeed, so that inserting keys into the btree can always succeed and
693	* IO can always make forward progress:
694	*/
695	nr /= c->btree_pages;
696	if (nr == `0`)
697	nr = `1`;
698	nr = min_t(unsigned long, nr, mca_can_free(c));
699
700	i = `0`;
701	btree_cache_used = c->btree_cache_used;
702	list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) {
703	if (nr <= `0`)
704	goto out;
705
706	if (!mca_reap(b, min_order: `0`, flush: false)) {
707	mca_data_free(b);
708	rw_unlock(w: true, b);
709	freed++;
710	}
711	nr--;
712	i++;
713	}
714
715	list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
716	if (nr <= `0` \|\| i >= btree_cache_used)
717	goto out;
718
719	if (!mca_reap(b, min_order: `0`, flush: false)) {
720	mca_bucket_free(b);
721	mca_data_free(b);
722	rw_unlock(w: true, b);
723	freed++;
724	}
725
726	nr--;
727	i++;
728	}
729	out:
730	mutex_unlock(lock: &c->bucket_lock);
731	return freed * c->btree_pages;
732	}
733
734	static unsigned long bch_mca_count(struct shrinker *shrink,
735	struct shrink_control *sc)
736	{
737	struct cache_set *c = shrink->private_data;
738
739	if (c->shrinker_disabled)
740	return `0`;
741
742	if (c->btree_cache_alloc_lock)
743	return `0`;
744
745	return mca_can_free(c) * c->btree_pages;
746	}
747
748	void bch_btree_cache_free(struct cache_set *c)
749	{
750	struct btree *b;
751	struct closure cl;
752
753	closure_init_stack(cl: &cl);
754
755	if (c->shrink)
756	shrinker_free(shrinker: c->shrink);
757
758	mutex_lock(&c->bucket_lock);
759
760	#ifdef CONFIG_BCACHE_DEBUG
761	if (c->verify_data)
762	list_move(list: &c->verify_data->list, head: &c->btree_cache);
763
764	free_pages(addr: (unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb)));
765	#endif
766
767	list_splice(list: &c->btree_cache_freeable,
768	head: &c->btree_cache);
769
770	while (!list_empty(head: &c->btree_cache)) {
771	b = list_first_entry(&c->btree_cache, struct btree, list);
772
773	/*
774	* This function is called by cache_set_free(), no I/O
775	* request on cache now, it is unnecessary to acquire
776	* b->write_lock before clearing BTREE_NODE_dirty anymore.
777	*/
778	if (btree_node_dirty(b)) {
779	btree_complete_write(b, w: btree_current_write(b));
780	clear_bit(nr: BTREE_NODE_dirty, addr: &b->flags);
781	}
782	mca_data_free(b);
783	}
784
785	while (!list_empty(head: &c->btree_cache_freed)) {
786	b = list_first_entry(&c->btree_cache_freed,
787	struct btree, list);
788	list_del(entry: &b->list);
789	cancel_delayed_work_sync(dwork: &b->work);
790	kfree(objp: b);
791	}
792
793	mutex_unlock(lock: &c->bucket_lock);
794	}
795
796	int bch_btree_cache_alloc(struct cache_set *c)
797	{
798	unsigned int i;
799
800	for (i = `0`; i < mca_reserve(c); i++)
801	if (!mca_bucket_alloc(c, k: &ZERO_KEY, GFP_KERNEL))
802	return -ENOMEM;
803
804	list_splice_init(list: &c->btree_cache,
805	head: &c->btree_cache_freeable);
806
807	#ifdef CONFIG_BCACHE_DEBUG
808	mutex_init(&c->verify_lock);
809
810	c->verify_ondisk = (void *)
811	__get_free_pages(GFP_KERNEL\|__GFP_COMP,
812	ilog2(meta_bucket_pages(&c->cache->sb)));
813	if (!c->verify_ondisk) {
814	/*
815	* Don't worry about the mca_rereserve buckets
816	* allocated in previous for-loop, they will be
817	* handled properly in bch_cache_set_unregister().
818	*/
819	return -ENOMEM;
820	}
821
822	c->verify_data = mca_bucket_alloc(c, k: &ZERO_KEY, GFP_KERNEL);
823
824	if (c->verify_data &&
825	c->verify_data->keys.set->data)
826	list_del_init(entry: &c->verify_data->list);
827	else
828	c->verify_data = NULL;
829	#endif
830
831	c->shrink = shrinker_alloc(flags: `0`, fmt: "md-bcache:%pU", c->set_uuid);
832	if (!c->shrink) {
833	pr_warn("bcache: %s: could not allocate shrinker\n", __func__);
834	return `0`;
835	}
836
837	c->shrink->count_objects = bch_mca_count;
838	c->shrink->scan_objects = bch_mca_scan;
839	c->shrink->seeks = `4`;
840	c->shrink->batch = c->btree_pages * `2`;
841	c->shrink->private_data = c;
842
843	shrinker_register(shrinker: c->shrink);
844
845	return `0`;
846	}
847
848	/ Btree in memory cache - hash table /
849
850	static struct hlist_head mca_hash(struct* cache_set c, struct* bkey *k)
851	{
852	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
853	}
854
855	static struct btree mca_find(struct* cache_set c, struct* bkey *k)
856	{
857	struct btree *b;
858
859	rcu_read_lock();
860	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
861	if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
862	goto out;
863	b = NULL;
864	out:
865	rcu_read_unlock();
866	return b;
867	}
868
869	static int mca_cannibalize_lock(struct cache_set c, struct* btree_op *op)
870	{
871	spin_lock(lock: &c->btree_cannibalize_lock);
872	if (likely(c->btree_cache_alloc_lock == NULL)) {
873	c->btree_cache_alloc_lock = current;
874	} else if (c->btree_cache_alloc_lock != current) {
875	if (op)
876	prepare_to_wait(wq_head: &c->btree_cache_wait, wq_entry: &op->wait,
877	TASK_UNINTERRUPTIBLE);
878	spin_unlock(lock: &c->btree_cannibalize_lock);
879	return -EINTR;
880	}
881	spin_unlock(lock: &c->btree_cannibalize_lock);
882
883	return `0`;
884	}
885
886	static struct btree mca_cannibalize(struct* cache_set c, struct* btree_op *op,
887	struct bkey *k)
888	{
889	struct btree *b;
890
891	trace_bcache_btree_cache_cannibalize(c);
892
893	if (mca_cannibalize_lock(c, op))
894	return ERR_PTR(error: -EINTR);
895
896	list_for_each_entry_reverse(b, &c->btree_cache, list)
897	if (!mca_reap(b, min_order: btree_order(k), flush: false))
898	return b;
899
900	list_for_each_entry_reverse(b, &c->btree_cache, list)
901	if (!mca_reap(b, min_order: btree_order(k), flush: true))
902	return b;
903
904	WARN(`1`, "btree cache cannibalize failed\n");
905	return ERR_PTR(error: -ENOMEM);
906	}
907
908	/*
909	* We can only have one thread cannibalizing other cached btree nodes at a time,
910	* or we'll deadlock. We use an open coded mutex to ensure that, which a
911	* cannibalize_bucket() will take. This means every time we unlock the root of
912	* the btree, we need to release this lock if we have it held.
913	*/
914	void bch_cannibalize_unlock(struct cache_set *c)
915	{
916	spin_lock(lock: &c->btree_cannibalize_lock);
917	if (c->btree_cache_alloc_lock == current) {
918	c->btree_cache_alloc_lock = NULL;
919	wake_up(&c->btree_cache_wait);
920	}
921	spin_unlock(lock: &c->btree_cannibalize_lock);
922	}
923
924	static struct btree mca_alloc(struct* cache_set c, struct* btree_op *op,
925	struct bkey k, int* level)
926	{
927	struct btree *b;
928
929	BUG_ON(current->bio_list);
930
931	lockdep_assert_held(&c->bucket_lock);
932
933	if (mca_find(c, k))
934	return NULL;
935
936	/ btree_free() doesn't free memory; it sticks the node on the end of*
937	* the list. Check if there's any freed nodes there:
938	*/
939	list_for_each_entry(b, &c->btree_cache_freeable, list)
940	if (!mca_reap(b, min_order: btree_order(k), flush: false))
941	goto out;
942
943	/ We never free struct btree itself, just the memory that holds the on*
944	* disk node. Check the freed list before allocating a new one:
945	*/
946	list_for_each_entry(b, &c->btree_cache_freed, list)
947	if (!mca_reap(b, min_order: `0`, flush: false)) {
948	mca_data_alloc(b, k, __GFP_NOWARN\|GFP_NOIO);
949	if (!b->keys.set[`0`].data)
950	goto err;
951	else
952	goto out;
953	}
954
955	b = mca_bucket_alloc(c, k, __GFP_NOWARN\|GFP_NOIO);
956	if (!b)
957	goto err;
958
959	BUG_ON(!down_write_trylock(&b->lock));
960	if (!b->keys.set->data)
961	goto err;
962	out:
963	BUG_ON(b->io_mutex.count != `1`);
964
965	bkey_copy(&b->key, k);
966	list_move(list: &b->list, head: &c->btree_cache);
967	hlist_del_init_rcu(n: &b->hash);
968	hlist_add_head_rcu(n: &b->hash, h: mca_hash(c, k));
969
970	lock_set_subclass(lock: &b->lock.dep_map, subclass: level + `1`, _THIS_IP_);
971	b->parent = (void *) ~`0UL`;
972	b->flags = `0`;
973	b->written = `0`;
974	b->level = level;
975
976	if (!b->level)
977	bch_btree_keys_init(b: &b->keys, ops: &bch_extent_keys_ops,
978	expensive_debug_checks: &b->c->expensive_debug_checks);
979	else
980	bch_btree_keys_init(b: &b->keys, ops: &bch_btree_keys_ops,
981	expensive_debug_checks: &b->c->expensive_debug_checks);
982
983	return b;
984	err:
985	if (b)
986	rw_unlock(w: true, b);
987
988	b = mca_cannibalize(c, op, k);
989	if (!IS_ERR(ptr: b))
990	goto out;
991
992	return b;
993	}
994
995	/*
996	* bch_btree_node_get - find a btree node in the cache and lock it, reading it
997	* in from disk if necessary.
998	*
999	* If IO is necessary and running under submit_bio_noacct, returns -EAGAIN.
1000	*
1001	* The btree node will have either a read or a write lock held, depending on
1002	* level and op->lock.
1003	*
1004	* Note: Only error code or btree pointer will be returned, it is unncessary
1005	* for callers to check NULL pointer.
1006	*/
1007	struct btree bch_btree_node_get(struct* cache_set c, struct* btree_op *op,
1008	struct bkey k, int* level, bool write,
1009	struct btree *parent)
1010	{
1011	int i = `0`;
1012	struct btree *b;
1013
1014	BUG_ON(level < `0`);
1015	retry:
1016	b = mca_find(c, k);
1017
1018	if (!b) {
1019	if (current->bio_list)
1020	return ERR_PTR(error: -EAGAIN);
1021
1022	mutex_lock(&c->bucket_lock);
1023	b = mca_alloc(c, op, k, level);
1024	mutex_unlock(lock: &c->bucket_lock);
1025
1026	if (!b)
1027	goto retry;
1028	if (IS_ERR(ptr: b))
1029	return b;
1030
1031	bch_btree_node_read(b);
1032
1033	if (!write)
1034	downgrade_write(sem: &b->lock);
1035	} else {
1036	rw_lock(w: write, b, level);
1037	if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
1038	rw_unlock(w: write, b);
1039	goto retry;
1040	}
1041	BUG_ON(b->level != level);
1042	}
1043
1044	if (btree_node_io_error(b)) {
1045	rw_unlock(w: write, b);
1046	return ERR_PTR(error: -EIO);
1047	}
1048
1049	BUG_ON(!b->written);
1050
1051	b->parent = parent;
1052
1053	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
1054	prefetch(b->keys.set[i].tree);
1055	prefetch(b->keys.set[i].data);
1056	}
1057
1058	for (; i <= b->keys.nsets; i++)
1059	prefetch(b->keys.set[i].data);
1060
1061	return b;
1062	}
1063
1064	static void btree_node_prefetch(struct btree parent, struct* bkey *k)
1065	{
1066	struct btree *b;
1067
1068	mutex_lock(&parent->c->bucket_lock);
1069	b = mca_alloc(c: parent->c, NULL, k, level: parent->level - `1`);
1070	mutex_unlock(lock: &parent->c->bucket_lock);
1071
1072	if (!IS_ERR_OR_NULL(ptr: b)) {
1073	b->parent = parent;
1074	bch_btree_node_read(b);
1075	rw_unlock(w: true, b);
1076	}
1077	}
1078
1079	/ Btree alloc /
1080
1081	static void btree_node_free(struct btree *b)
1082	{
1083	trace_bcache_btree_node_free(b);
1084
1085	BUG_ON(b == b->c->root);
1086
1087	retry:
1088	mutex_lock(&b->write_lock);
1089	/*
1090	* If the btree node is selected and flushing in btree_flush_write(),
1091	* delay and retry until the BTREE_NODE_journal_flush bit cleared,
1092	* then it is safe to free the btree node here. Otherwise this btree
1093	* node will be in race condition.
1094	*/
1095	if (btree_node_journal_flush(b)) {
1096	mutex_unlock(lock: &b->write_lock);
1097	pr_debug("bnode %p journal_flush set, retry\n", b);
1098	udelay(`1`);
1099	goto retry;
1100	}
1101
1102	if (btree_node_dirty(b)) {
1103	btree_complete_write(b, w: btree_current_write(b));
1104	clear_bit(nr: BTREE_NODE_dirty, addr: &b->flags);
1105	}
1106
1107	mutex_unlock(lock: &b->write_lock);
1108
1109	cancel_delayed_work(dwork: &b->work);
1110
1111	mutex_lock(&b->c->bucket_lock);
1112	bch_bucket_free(c: b->c, k: &b->key);
1113	mca_bucket_free(b);
1114	mutex_unlock(lock: &b->c->bucket_lock);
1115	}
1116
1117	/*
1118	* Only error code or btree pointer will be returned, it is unncessary for
1119	* callers to check NULL pointer.
1120	*/
1121	struct btree __bch_btree_node_alloc(struct* cache_set c, struct* btree_op *op,
1122	int level, bool wait,
1123	struct btree *parent)
1124	{
1125	BKEY_PADDED(key) k;
1126	struct btree *b;
1127
1128	mutex_lock(&c->bucket_lock);
1129	retry:
1130	/ return ERR_PTR(-EAGAIN) when it fails /
1131	b = ERR_PTR(error: -EAGAIN);
1132	if (__bch_bucket_alloc_set(c, reserve: RESERVE_BTREE, k: &k.key, wait))
1133	goto err;
1134
1135	bkey_put(c, k: &k.key);
1136	SET_KEY_SIZE(k: &k.key, v: c->btree_pages * PAGE_SECTORS);
1137
1138	b = mca_alloc(c, op, k: &k.key, level);
1139	if (IS_ERR(ptr: b))
1140	goto err_free;
1141
1142	if (!b) {
1143	cache_bug(c,
1144	"Tried to allocate bucket that was in btree cache");
1145	goto retry;
1146	}
1147
1148	b->parent = parent;
1149	bch_bset_init_next(b: &b->keys, i: b->keys.set->data, magic: bset_magic(sb: &b->c->cache->sb));
1150
1151	mutex_unlock(lock: &c->bucket_lock);
1152
1153	trace_bcache_btree_node_alloc(b);
1154	return b;
1155	err_free:
1156	bch_bucket_free(c, k: &k.key);
1157	err:
1158	mutex_unlock(lock: &c->bucket_lock);
1159
1160	trace_bcache_btree_node_alloc_fail(c);
1161	return b;
1162	}
1163
1164	static struct btree bch_btree_node_alloc(struct* cache_set *c,
1165	struct btree_op op, int* level,
1166	struct btree *parent)
1167	{
1168	return __bch_btree_node_alloc(c, op, level, wait: op != NULL, parent);
1169	}
1170
1171	static struct btree btree_node_alloc_replacement(struct* btree *b,
1172	struct btree_op *op)
1173	{
1174	struct btree *n = bch_btree_node_alloc(c: b->c, op, level: b->level, parent: b->parent);
1175
1176	if (!IS_ERR(ptr: n)) {
1177	mutex_lock(&n->write_lock);
1178	bch_btree_sort_into(b: &b->keys, new: &n->keys, state: &b->c->sort);
1179	bkey_copy_key(dest: &n->key, src: &b->key);
1180	mutex_unlock(lock: &n->write_lock);
1181	}
1182
1183	return n;
1184	}
1185
1186	static void make_btree_freeing_key(struct btree b, struct* bkey *k)
1187	{
1188	unsigned int i;
1189
1190	mutex_lock(&b->c->bucket_lock);
1191
1192	atomic_inc(v: &b->c->prio_blocked);
1193
1194	bkey_copy(k, &b->key);
1195	bkey_copy_key(dest: k, src: &ZERO_KEY);
1196
1197	for (i = `0`; i < KEY_PTRS(k); i++)
1198	SET_PTR_GEN(k, i,
1199	v: bch_inc_gen(ca: b->c->cache,
1200	b: PTR_BUCKET(c: b->c, k: &b->key, ptr: i)));
1201
1202	mutex_unlock(lock: &b->c->bucket_lock);
1203	}
1204
1205	static int btree_check_reserve(struct btree b, struct* btree_op *op)
1206	{
1207	struct cache_set *c = b->c;
1208	struct cache *ca = c->cache;
1209	unsigned int reserve = (c->root->level - b->level) * `2` + `1`;
1210
1211	mutex_lock(&c->bucket_lock);
1212
1213	if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
1214	if (op)
1215	prepare_to_wait(wq_head: &c->btree_cache_wait, wq_entry: &op->wait,
1216	TASK_UNINTERRUPTIBLE);
1217	mutex_unlock(lock: &c->bucket_lock);
1218	return -EINTR;
1219	}
1220
1221	mutex_unlock(lock: &c->bucket_lock);
1222
1223	return mca_cannibalize_lock(c: b->c, op);
1224	}
1225
/* Garbage collection */
1227
1228	static uint8_t __bch_btree_mark_key(struct cache_set c, int* level,
1229	struct bkey *k)
1230	{
1231	uint8_t stale = `0`;
1232	unsigned int i;
1233	struct bucket *g;
1234
1235	/*
1236	* ptr_invalid() can't return true for the keys that mark btree nodes as
1237	* freed, but since ptr_bad() returns true we'll never actually use them
1238	* for anything and thus we don't want mark their pointers here
1239	*/
1240	if (!bkey_cmp(l: k, r: &ZERO_KEY))
1241	return stale;
1242
1243	for (i = `0`; i < KEY_PTRS(k); i++) {
1244	if (!ptr_available(c, k, i))
1245	continue;
1246
1247	g = PTR_BUCKET(c, k, ptr: i);
1248
1249	if (gen_after(a: g->last_gc, b: PTR_GEN(k, i)))
1250	g->last_gc = PTR_GEN(k, i);
1251
1252	if (ptr_stale(c, k, i)) {
1253	stale = max(stale, ptr_stale(c, k, i));
1254	continue;
1255	}
1256
1257	cache_bug_on(GC_MARK(g) &&
1258	(GC_MARK(g) == GC_MARK_METADATA) != (level != `0`),
1259	c, "inconsistent ptrs: mark = %llu, level = %i",
1260	GC_MARK(g), level);
1261
1262	if (level)
1263	SET_GC_MARK(k: g, GC_MARK_METADATA);
1264	else if (KEY_DIRTY(k))
1265	SET_GC_MARK(k: g, GC_MARK_DIRTY);
1266	else if (!GC_MARK(k: g))
1267	SET_GC_MARK(k: g, GC_MARK_RECLAIMABLE);
1268
1269	/ guard against overflow /
1270	SET_GC_SECTORS_USED(k: g, min_t(unsigned int,
1271	GC_SECTORS_USED(g) + KEY_SIZE(k),
1272	MAX_GC_SECTORS_USED));
1273
1274	BUG_ON(!GC_SECTORS_USED(g));
1275	}
1276
1277	return stale;
1278	}
1279
/*
 * Mark key @k using the cache set and level of btree node @b.
 * @b is parenthesised so any pointer expression may be passed.
 */
#define btree_mark_key(b, k)	__bch_btree_mark_key((b)->c, (b)->level, k)
1281
1282	void bch_initial_mark_key(struct cache_set c, int* level, struct bkey *k)
1283	{
1284	unsigned int i;
1285
1286	for (i = `0`; i < KEY_PTRS(k); i++)
1287	if (ptr_available(c, k, i) &&
1288	!ptr_stale(c, k, i)) {
1289	struct bucket *b = PTR_BUCKET(c, k, ptr: i);
1290
1291	b->gen = PTR_GEN(k, i);
1292
1293	if (level && bkey_cmp(l: k, r: &ZERO_KEY))
1294	b->prio = BTREE_PRIO;
1295	else if (!level && b->prio == BTREE_PRIO)
1296	b->prio = INITIAL_PRIO;
1297	}
1298
1299	__bch_btree_mark_key(c, level, k);
1300	}
1301
1302	void bch_update_bucket_in_use(struct cache_set c, struct* gc_stat *stats)
1303	{
1304	stats->in_use = (c->nbuckets - c->avail_nbuckets) * `100` / c->nbuckets;
1305	}
1306
1307	static bool btree_gc_mark_node(struct btree b, struct* gc_stat *gc)
1308	{
1309	uint8_t stale = `0`;
1310	unsigned int keys = `0`, good_keys = `0`;
1311	struct bkey *k;
1312	struct btree_iter iter;
1313	struct bset_tree *t;
1314
1315	gc->nodes++;
1316
1317	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1318	stale = max(stale, btree_mark_key(b, k));
1319	keys++;
1320
1321	if (bch_ptr_bad(b: &b->keys, k))
1322	continue;
1323
1324	gc->key_bytes += bkey_u64s(k);
1325	gc->nkeys++;
1326	good_keys++;
1327
1328	gc->data += KEY_SIZE(k);
1329	}
1330
1331	for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
1332	btree_bug_on(t->size &&
1333	bset_written(&b->keys, t) &&
1334	bkey_cmp(&b->key, &t->end) < `0`,
1335	b, "found short btree key in gc");
1336
1337	if (b->c->gc_always_rewrite)
1338	return true;
1339
1340	if (stale > `10`)
1341	return true;
1342
1343	if ((keys - good_keys) * `2` > keys)
1344	return true;
1345
1346	return false;
1347	}
1348
/* Number of adjacent sibling nodes GC looks at when trying to coalesce. */
#define GC_MERGE_NODES	4U

/* One slot in GC's window of sibling nodes. */
struct gc_merge_info {
	/* the node, an ERR_PTR placeholder, or NULL once shifted in empty */
	struct btree	*b;
	/* size of the node's keys in u64s (see btree_gc_count_keys()) */
	unsigned int	keys;
};
1355
1356	static int bch_btree_insert_node(struct btree b, struct* btree_op *op,
1357	struct keylist *insert_keys,
1358	atomic_t *journal_ref,
1359	struct bkey *replace_key);
1360
1361	static int btree_gc_coalesce(struct btree b, struct* btree_op *op,
1362	struct gc_stat gc, struct* gc_merge_info *r)
1363	{
1364	unsigned int i, nodes = `0`, keys = `0`, blocks;
1365	struct btree *new_nodes[GC_MERGE_NODES];
1366	struct keylist keylist;
1367	struct closure cl;
1368	struct bkey *k;
1369
1370	bch_keylist_init(l: &keylist);
1371
1372	if (btree_check_reserve(b, NULL))
1373	return `0`;
1374
1375	memset(new_nodes, `0`, sizeof(new_nodes));
1376	closure_init_stack(cl: &cl);
1377
1378	while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(ptr: r[nodes].b))
1379	keys += r[nodes++].keys;
1380
1381	blocks = btree_default_blocks(b->c) * `2` / `3`;
1382
1383	if (nodes < `2` \|\|
1384	__set_blocks(b->keys.set[`0`].data, keys,
1385	block_bytes(b->c->cache)) > blocks * (nodes - `1`))
1386	return `0`;
1387
1388	for (i = `0`; i < nodes; i++) {
1389	new_nodes[i] = btree_node_alloc_replacement(b: r[i].b, NULL);
1390	if (IS_ERR(ptr: new_nodes[i]))
1391	goto out_nocoalesce;
1392	}
1393
1394	/*
1395	* We have to check the reserve here, after we've allocated our new
1396	* nodes, to make sure the insert below will succeed - we also check
1397	* before as an optimization to potentially avoid a bunch of expensive
1398	* allocs/sorts
1399	*/
1400	if (btree_check_reserve(b, NULL))
1401	goto out_nocoalesce;
1402
1403	for (i = `0`; i < nodes; i++)
1404	mutex_lock(&new_nodes[i]->write_lock);
1405
1406	for (i = nodes - `1`; i > `0`; --i) {
1407	struct bset *n1 = btree_bset_first(b: new_nodes[i]);
1408	struct bset *n2 = btree_bset_first(b: new_nodes[i - `1`]);
1409	struct bkey k, last = NULL;
1410
1411	keys = `0`;
1412
1413	if (i > `1`) {
1414	for (k = n2->start;
1415	k < bset_bkey_last(n2);
1416	k = bkey_next(k)) {
1417	if (__set_blocks(n1, n1->keys + keys +
1418	bkey_u64s(k),
1419	block_bytes(b->c->cache)) > blocks)
1420	break;
1421
1422	last = k;
1423	keys += bkey_u64s(k);
1424	}
1425	} else {
1426	/*
1427	* Last node we're not getting rid of - we're getting
1428	* rid of the node at r[0]. Have to try and fit all of
1429	* the remaining keys into this node; we can't ensure
1430	* they will always fit due to rounding and variable
1431	* length keys (shouldn't be possible in practice,
1432	* though)
1433	*/
1434	if (__set_blocks(n1, n1->keys + n2->keys,
1435	block_bytes(b->c->cache)) >
1436	btree_blocks(new_nodes[i]))
1437	goto out_unlock_nocoalesce;
1438
1439	keys = n2->keys;
1440	/ Take the key of the node we're getting rid of /
1441	last = &r->b->key;
1442	}
1443
1444	BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) >
1445	btree_blocks(new_nodes[i]));
1446
1447	if (last)
1448	bkey_copy_key(dest: &new_nodes[i]->key, src: last);
1449
1450	memcpy(bset_bkey_last(n1),
1451	n2->start,
1452	(void ) bset_bkey_idx(n2, keys) - (void* *) n2->start);
1453
1454	n1->keys += keys;
1455	r[i].keys = n1->keys;
1456
1457	memmove(n2->start,
1458	bset_bkey_idx(n2, keys),
1459	(void *) bset_bkey_last(n2) -
1460	(void *) bset_bkey_idx(n2, keys));
1461
1462	n2->keys -= keys;
1463
1464	if (__bch_keylist_realloc(l: &keylist,
1465	u64s: bkey_u64s(k: &new_nodes[i]->key)))
1466	goto out_unlock_nocoalesce;
1467
1468	bch_btree_node_write(b: new_nodes[i], parent: &cl);
1469	bch_keylist_add(l: &keylist, k: &new_nodes[i]->key);
1470	}
1471
1472	for (i = `0`; i < nodes; i++)
1473	mutex_unlock(lock: &new_nodes[i]->write_lock);
1474
1475	closure_sync(cl: &cl);
1476
1477	/ We emptied out this node /
1478	BUG_ON(btree_bset_first(new_nodes[`0`])->keys);
1479	btree_node_free(b: new_nodes[`0`]);
1480	rw_unlock(w: true, b: new_nodes[`0`]);
1481	new_nodes[`0`] = NULL;
1482
1483	for (i = `0`; i < nodes; i++) {
1484	if (__bch_keylist_realloc(l: &keylist, u64s: bkey_u64s(k: &r[i].b->key)))
1485	goto out_nocoalesce;
1486
1487	make_btree_freeing_key(b: r[i].b, k: keylist.top);
1488	bch_keylist_push(l: &keylist);
1489	}
1490
1491	bch_btree_insert_node(b, op, insert_keys: &keylist, NULL, NULL);
1492	BUG_ON(!bch_keylist_empty(&keylist));
1493
1494	for (i = `0`; i < nodes; i++) {
1495	btree_node_free(b: r[i].b);
1496	rw_unlock(w: true, b: r[i].b);
1497
1498	r[i].b = new_nodes[i];
1499	}
1500
1501	memmove(r, r + `1`, sizeof(r[`0`]) * (nodes - `1`));
1502	r[nodes - `1`].b = ERR_PTR(error: -EINTR);
1503
1504	trace_bcache_btree_gc_coalesce(nodes);
1505	gc->nodes--;
1506
1507	bch_keylist_free(l: &keylist);
1508
1509	/ Invalidated our iterator /
1510	return -EINTR;
1511
1512	out_unlock_nocoalesce:
1513	for (i = `0`; i < nodes; i++)
1514	mutex_unlock(lock: &new_nodes[i]->write_lock);
1515
1516	out_nocoalesce:
1517	closure_sync(cl: &cl);
1518
1519	while ((k = bch_keylist_pop(l: &keylist)))
1520	if (!bkey_cmp(l: k, r: &ZERO_KEY))
1521	atomic_dec(v: &b->c->prio_blocked);
1522	bch_keylist_free(l: &keylist);
1523
1524	for (i = `0`; i < nodes; i++)
1525	if (!IS_ERR_OR_NULL(ptr: new_nodes[i])) {
1526	btree_node_free(b: new_nodes[i]);
1527	rw_unlock(w: true, b: new_nodes[i]);
1528	}
1529	return `0`;
1530	}
1531
1532	static int btree_gc_rewrite_node(struct btree b, struct* btree_op *op,
1533	struct btree *replace)
1534	{
1535	struct keylist keys;
1536	struct btree *n;
1537
1538	if (btree_check_reserve(b, NULL))
1539	return `0`;
1540
1541	n = btree_node_alloc_replacement(b: replace, NULL);
1542	if (IS_ERR(ptr: n))
1543	return `0`;
1544
1545	/ recheck reserve after allocating replacement node /
1546	if (btree_check_reserve(b, NULL)) {
1547	btree_node_free(b: n);
1548	rw_unlock(w: true, b: n);
1549	return `0`;
1550	}
1551
1552	bch_btree_node_write_sync(b: n);
1553
1554	bch_keylist_init(l: &keys);
1555	bch_keylist_add(l: &keys, k: &n->key);
1556
1557	make_btree_freeing_key(b: replace, k: keys.top);
1558	bch_keylist_push(l: &keys);
1559
1560	bch_btree_insert_node(b, op, insert_keys: &keys, NULL, NULL);
1561	BUG_ON(!bch_keylist_empty(&keys));
1562
1563	btree_node_free(b: replace);
1564	rw_unlock(w: true, b: n);
1565
1566	/ Invalidated our iterator /
1567	return -EINTR;
1568	}
1569
1570	static unsigned int btree_gc_count_keys(struct btree *b)
1571	{
1572	struct bkey *k;
1573	struct btree_iter iter;
1574	unsigned int ret = `0`;
1575
1576	for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
1577	ret += bkey_u64s(k);
1578
1579	return ret;
1580	}
1581
1582	static size_t btree_gc_min_nodes(struct cache_set *c)
1583	{
1584	size_t min_nodes;
1585
1586	/*
1587	* Since incremental GC would stop 100ms when front
1588	* side I/O comes, so when there are many btree nodes,
1589	* if GC only processes constant (100) nodes each time,
1590	* GC would last a long time, and the front side I/Os
1591	* would run out of the buckets (since no new bucket
1592	* can be allocated during GC), and be blocked again.
1593	* So GC should not process constant nodes, but varied
1594	* nodes according to the number of btree nodes, which
1595	* realized by dividing GC into constant(100) times,
1596	* so when there are many btree nodes, GC can process
1597	* more nodes each time, otherwise, GC will process less
1598	* nodes each time (but no less than MIN_GC_NODES)
1599	*/
1600	min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
1601	if (min_nodes < MIN_GC_NODES)
1602	min_nodes = MIN_GC_NODES;
1603
1604	return min_nodes;
1605	}
1606
1607
1608	static int btree_gc_recurse(struct btree b, struct* btree_op *op,
1609	struct closure writes, struct* gc_stat *gc)
1610	{
1611	int ret = `0`;
1612	bool should_rewrite;
1613	struct bkey *k;
1614	struct btree_iter iter;
1615	struct gc_merge_info r[GC_MERGE_NODES];
1616	struct gc_merge_info i, last = r + ARRAY_SIZE(r) - `1`;
1617
1618	bch_btree_iter_init(b: &b->keys, iter: &iter, search: &b->c->gc_done);
1619
1620	for (i = r; i < r + ARRAY_SIZE(r); i++)
1621	i->b = ERR_PTR(error: -EINTR);
1622
1623	while (`1`) {
1624	k = bch_btree_iter_next_filter(iter: &iter, b: &b->keys, fn: bch_ptr_bad);
1625	if (k) {
1626	r->b = bch_btree_node_get(c: b->c, op, k, level: b->level - `1`,
1627	write: true, parent: b);
1628	if (IS_ERR(ptr: r->b)) {
1629	ret = PTR_ERR(ptr: r->b);
1630	break;
1631	}
1632
1633	r->keys = btree_gc_count_keys(b: r->b);
1634
1635	ret = btree_gc_coalesce(b, op, gc, r);
1636	if (ret)
1637	break;
1638	}
1639
1640	if (!last->b)
1641	break;
1642
1643	if (!IS_ERR(ptr: last->b)) {
1644	should_rewrite = btree_gc_mark_node(b: last->b, gc);
1645	if (should_rewrite) {
1646	ret = btree_gc_rewrite_node(b, op, replace: last->b);
1647	if (ret)
1648	break;
1649	}
1650
1651	if (last->b->level) {
1652	ret = btree_gc_recurse(b: last->b, op, writes, gc);
1653	if (ret)
1654	break;
1655	}
1656
1657	bkey_copy_key(dest: &b->c->gc_done, src: &last->b->key);
1658
1659	/*
1660	* Must flush leaf nodes before gc ends, since replace
1661	* operations aren't journalled
1662	*/
1663	mutex_lock(&last->b->write_lock);
1664	if (btree_node_dirty(b: last->b))
1665	bch_btree_node_write(b: last->b, parent: writes);
1666	mutex_unlock(lock: &last->b->write_lock);
1667	rw_unlock(w: true, b: last->b);
1668	}
1669
1670	memmove(r + `1`, r, sizeof(r[`0`]) * (GC_MERGE_NODES - `1`));
1671	r->b = NULL;
1672
1673	if (atomic_read(v: &b->c->search_inflight) &&
1674	gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(c: b->c)) {
1675	gc->nodes_pre = gc->nodes;
1676	ret = -EAGAIN;
1677	break;
1678	}
1679
1680	if (need_resched()) {
1681	ret = -EAGAIN;
1682	break;
1683	}
1684	}
1685
1686	for (i = r; i < r + ARRAY_SIZE(r); i++)
1687	if (!IS_ERR_OR_NULL(ptr: i->b)) {
1688	mutex_lock(&i->b->write_lock);
1689	if (btree_node_dirty(b: i->b))
1690	bch_btree_node_write(b: i->b, parent: writes);
1691	mutex_unlock(lock: &i->b->write_lock);
1692	rw_unlock(w: true, b: i->b);
1693	}
1694
1695	return ret;
1696	}
1697
1698	static int bch_btree_gc_root(struct btree b, struct* btree_op *op,
1699	struct closure writes, struct* gc_stat *gc)
1700	{
1701	struct btree *n = NULL;
1702	int ret = `0`;
1703	bool should_rewrite;
1704
1705	should_rewrite = btree_gc_mark_node(b, gc);
1706	if (should_rewrite) {
1707	n = btree_node_alloc_replacement(b, NULL);
1708
1709	if (!IS_ERR(ptr: n)) {
1710	bch_btree_node_write_sync(b: n);
1711
1712	bch_btree_set_root(b: n);
1713	btree_node_free(b);
1714	rw_unlock(w: true, b: n);
1715
1716	return -EINTR;
1717	}
1718	}
1719
1720	__bch_btree_mark_key(c: b->c, level: b->level + `1`, k: &b->key);
1721
1722	if (b->level) {
1723	ret = btree_gc_recurse(b, op, writes, gc);
1724	if (ret)
1725	return ret;
1726	}
1727
1728	bkey_copy_key(dest: &b->c->gc_done, src: &b->key);
1729
1730	return ret;
1731	}
1732
1733	static void btree_gc_start(struct cache_set *c)
1734	{
1735	struct cache *ca;
1736	struct bucket *b;
1737
1738	if (!c->gc_mark_valid)
1739	return;
1740
1741	mutex_lock(&c->bucket_lock);
1742
1743	c->gc_mark_valid = `0`;
1744	c->gc_done = ZERO_KEY;
1745
1746	ca = c->cache;
1747	for_each_bucket(b, ca) {
1748	b->last_gc = b->gen;
1749	if (!atomic_read(v: &b->pin)) {
1750	SET_GC_MARK(k: b, v: `0`);
1751	SET_GC_SECTORS_USED(k: b, v: `0`);
1752	}
1753	}
1754
1755	mutex_unlock(lock: &c->bucket_lock);
1756	}
1757
1758	static void bch_btree_gc_finish(struct cache_set *c)
1759	{
1760	struct bucket *b;
1761	struct cache *ca;
1762	unsigned int i, j;
1763	uint64_t *k;
1764
1765	mutex_lock(&c->bucket_lock);
1766
1767	set_gc_sectors(c);
1768	c->gc_mark_valid = `1`;
1769	c->need_gc = `0`;
1770
1771	for (i = `0`; i < KEY_PTRS(k: &c->uuid_bucket); i++)
1772	SET_GC_MARK(k: PTR_BUCKET(c, k: &c->uuid_bucket, ptr: i),
1773	GC_MARK_METADATA);
1774
1775	/ don't reclaim buckets to which writeback keys point /
1776	rcu_read_lock();
1777	for (i = `0`; i < c->devices_max_used; i++) {
1778	struct bcache_device *d = c->devices[i];
1779	struct cached_dev *dc;
1780	struct keybuf_key w, n;
1781
1782	if (!d \|\| UUID_FLASH_ONLY(k: &c->uuids[i]))
1783	continue;
1784	dc = container_of(d, struct cached_dev, disk);
1785
1786	spin_lock(lock: &dc->writeback_keys.lock);
1787	rbtree_postorder_for_each_entry_safe(w, n,
1788	&dc->writeback_keys.keys, node)
1789	for (j = `0`; j < KEY_PTRS(k: &w->key); j++)
1790	SET_GC_MARK(k: PTR_BUCKET(c, k: &w->key, ptr: j),
1791	GC_MARK_DIRTY);
1792	spin_unlock(lock: &dc->writeback_keys.lock);
1793	}
1794	rcu_read_unlock();
1795
1796	c->avail_nbuckets = `0`;
1797
1798	ca = c->cache;
1799	ca->invalidate_needs_gc = `0`;
1800
1801	for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
1802	SET_GC_MARK(k: ca->buckets + *k, GC_MARK_METADATA);
1803
1804	for (k = ca->prio_buckets;
1805	k < ca->prio_buckets + prio_buckets(ca) * `2`; k++)
1806	SET_GC_MARK(k: ca->buckets + *k, GC_MARK_METADATA);
1807
1808	for_each_bucket(b, ca) {
1809	c->need_gc = max(c->need_gc, bucket_gc_gen(b));
1810
1811	if (atomic_read(v: &b->pin))
1812	continue;
1813
1814	BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
1815
1816	if (!GC_MARK(k: b) \|\| GC_MARK(k: b) == GC_MARK_RECLAIMABLE)
1817	c->avail_nbuckets++;
1818	}
1819
1820	mutex_unlock(lock: &c->bucket_lock);
1821	}
1822
1823	static void bch_btree_gc(struct cache_set *c)
1824	{
1825	int ret;
1826	struct gc_stat stats;
1827	struct closure writes;
1828	struct btree_op op;
1829	uint64_t start_time = local_clock();
1830
1831	trace_bcache_gc_start(c);
1832
1833	memset(&stats, `0`, sizeof(struct gc_stat));
1834	closure_init_stack(cl: &writes);
1835	bch_btree_op_init(op: &op, SHRT_MAX);
1836
1837	btree_gc_start(c);
1838
1839	/ if CACHE_SET_IO_DISABLE set, gc thread should stop too /
1840	do {
1841	ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
1842	closure_sync(cl: &writes);
1843	cond_resched();
1844
1845	if (ret == -EAGAIN)
1846	schedule_timeout_interruptible(timeout: msecs_to_jiffies
1847	(GC_SLEEP_MS));
1848	else if (ret)
1849	pr_warn("gc failed!\n");
1850	} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
1851
1852	bch_btree_gc_finish(c);
1853	wake_up_allocators(c);
1854
1855	bch_time_stats_update(stats: &c->btree_gc_time, time: start_time);
1856
1857	stats.key_bytes = sizeof*(uint64_t);
1858	stats.data <<= `9`;
1859	bch_update_bucket_in_use(c, stats: &stats);
1860	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1861
1862	trace_bcache_gc_end(c);
1863
1864	bch_moving_gc(c);
1865	}
1866
1867	static bool gc_should_run(struct cache_set *c)
1868	{
1869	struct cache *ca = c->cache;
1870
1871	if (ca->invalidate_needs_gc)
1872	return true;
1873
1874	if (atomic_read(v: &c->sectors_to_gc) < `0`)
1875	return true;
1876
1877	return false;
1878	}
1879
1880	static int bch_gc_thread(void *arg)
1881	{
1882	struct cache_set *c = arg;
1883
1884	while (`1`) {
1885	wait_event_interruptible(c->gc_wait,
1886	kthread_should_stop() \|\|
1887	test_bit(CACHE_SET_IO_DISABLE, &c->flags) \|\|
1888	gc_should_run(c));
1889
1890	if (kthread_should_stop() \|\|
1891	test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1892	break;
1893
1894	set_gc_sectors(c);
1895	bch_btree_gc(c);
1896	}
1897
1898	wait_for_kthread_stop();
1899	return `0`;
1900	}
1901
1902	int bch_gc_thread_start(struct cache_set *c)
1903	{
1904	c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
1905	return PTR_ERR_OR_ZERO(ptr: c->gc_thread);
1906	}
1907
/* Initial partial gc */
1909
1910	static int bch_btree_check_recurse(struct btree b, struct* btree_op *op)
1911	{
1912	int ret = `0`;
1913	struct bkey k, p = NULL;
1914	struct btree_iter iter;
1915
1916	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
1917	bch_initial_mark_key(c: b->c, level: b->level, k);
1918
1919	bch_initial_mark_key(c: b->c, level: b->level + `1`, k: &b->key);
1920
1921	if (b->level) {
1922	bch_btree_iter_init(b: &b->keys, iter: &iter, NULL);
1923
1924	do {
1925	k = bch_btree_iter_next_filter(iter: &iter, b: &b->keys,
1926	fn: bch_ptr_bad);
1927	if (k) {
1928	btree_node_prefetch(parent: b, k);
1929	/*
1930	* initiallize c->gc_stats.nodes
1931	* for incremental GC
1932	*/
1933	b->c->gc_stats.nodes++;
1934	}
1935
1936	if (p)
1937	ret = bcache_btree(check_recurse, p, b, op);
1938
1939	p = k;
1940	} while (p && !ret);
1941	}
1942
1943	return ret;
1944	}
1945
1946
1947	static int bch_btree_check_thread(void *arg)
1948	{
1949	int ret;
1950	struct btree_check_info *info = arg;
1951	struct btree_check_state *check_state = info->state;
1952	struct cache_set *c = check_state->c;
1953	struct btree_iter iter;
1954	struct bkey k, p;
1955	int cur_idx, prev_idx, skip_nr;
1956
1957	k = p = NULL;
1958	cur_idx = prev_idx = `0`;
1959	ret = `0`;
1960
1961	/ root node keys are checked before thread created /
1962	bch_btree_iter_init(b: &c->root->keys, iter: &iter, NULL);
1963	k = bch_btree_iter_next_filter(iter: &iter, b: &c->root->keys, fn: bch_ptr_bad);
1964	BUG_ON(!k);
1965
1966	p = k;
1967	while (k) {
1968	/*
1969	* Fetch a root node key index, skip the keys which
1970	* should be fetched by other threads, then check the
1971	* sub-tree indexed by the fetched key.
1972	*/
1973	spin_lock(lock: &check_state->idx_lock);
1974	cur_idx = check_state->key_idx;
1975	check_state->key_idx++;
1976	spin_unlock(lock: &check_state->idx_lock);
1977
1978	skip_nr = cur_idx - prev_idx;
1979
1980	while (skip_nr) {
1981	k = bch_btree_iter_next_filter(iter: &iter,
1982	b: &c->root->keys,
1983	fn: bch_ptr_bad);
1984	if (k)
1985	p = k;
1986	else {
1987	/*
1988	* No more keys to check in root node,
1989	* current checking threads are enough,
1990	* stop creating more.
1991	*/
1992	atomic_set(v: &check_state->enough, i: `1`);
1993	/ Update check_state->enough earlier /
1994	smp_mb__after_atomic();
1995	goto out;
1996	}
1997	skip_nr--;
1998	cond_resched();
1999	}
2000
2001	if (p) {
2002	struct btree_op op;
2003
2004	btree_node_prefetch(parent: c->root, k: p);
2005	c->gc_stats.nodes++;
2006	bch_btree_op_init(op: &op, write_lock_level: `0`);
2007	ret = bcache_btree(check_recurse, p, c->root, &op);
2008	/*
2009	* The op may be added to cache_set's btree_cache_wait
2010	* in mca_cannibalize(), must ensure it is removed from
2011	* the list and release btree_cache_alloc_lock before
2012	* free op memory.
2013	* Otherwise, the btree_cache_wait will be damaged.
2014	*/
2015	bch_cannibalize_unlock(c);
2016	finish_wait(wq_head: &c->btree_cache_wait, wq_entry: &(&op)->wait);
2017	if (ret)
2018	goto out;
2019	}
2020	p = NULL;
2021	prev_idx = cur_idx;
2022	cond_resched();
2023	}
2024
2025	out:
2026	info->result = ret;
2027	/ update check_state->started among all CPUs /
2028	smp_mb__before_atomic();
2029	if (atomic_dec_and_test(v: &check_state->started))
2030	wake_up(&check_state->wait);
2031
2032	return ret;
2033	}
2034
2035
2036
2037	static int bch_btree_chkthread_nr(void)
2038	{
2039	int n = num_online_cpus()/`2`;
2040
2041	if (n == `0`)
2042	n = `1`;
2043	else if (n > BCH_BTR_CHKTHREAD_MAX)
2044	n = BCH_BTR_CHKTHREAD_MAX;
2045
2046	return n;
2047	}
2048
2049	int bch_btree_check(struct cache_set *c)
2050	{
2051	int ret = `0`;
2052	int i;
2053	struct bkey *k = NULL;
2054	struct btree_iter iter;
2055	struct btree_check_state check_state;
2056
2057	/ check and mark root node keys /
2058	for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
2059	bch_initial_mark_key(c, level: c->root->level, k);
2060
2061	bch_initial_mark_key(c, level: c->root->level + `1`, k: &c->root->key);
2062
2063	if (c->root->level == `0`)
2064	return `0`;
2065
2066	memset(&check_state, `0`, sizeof(struct btree_check_state));
2067	check_state.c = c;
2068	check_state.total_threads = bch_btree_chkthread_nr();
2069	check_state.key_idx = `0`;
2070	spin_lock_init(&check_state.idx_lock);
2071	atomic_set(v: &check_state.started, i: `0`);
2072	atomic_set(v: &check_state.enough, i: `0`);
2073	init_waitqueue_head(&check_state.wait);
2074
2075	rw_lock(w: `0`, b: c->root, level: c->root->level);
2076	/*
2077	* Run multiple threads to check btree nodes in parallel,
2078	* if check_state.enough is non-zero, it means current
2079	* running check threads are enough, unncessary to create
2080	* more.
2081	*/
2082	for (i = `0`; i < check_state.total_threads; i++) {
2083	/ fetch latest check_state.enough earlier /
2084	smp_mb__before_atomic();
2085	if (atomic_read(v: &check_state.enough))
2086	break;
2087
2088	check_state.infos[i].result = `0`;
2089	check_state.infos[i].state = &check_state;
2090
2091	check_state.infos[i].thread =
2092	kthread_run(bch_btree_check_thread,
2093	&check_state.infos[i],
2094	"bch_btrchk[%d]", i);
2095	if (IS_ERR(ptr: check_state.infos[i].thread)) {
2096	pr_err("fails to run thread bch_btrchk[%d]\n", i);
2097	for (--i; i >= `0`; i--)
2098	kthread_stop(k: check_state.infos[i].thread);
2099	ret = -ENOMEM;
2100	goto out;
2101	}
2102	atomic_inc(v: &check_state.started);
2103	}
2104
2105	/*
2106	* Must wait for all threads to stop.
2107	*/
2108	wait_event(check_state.wait, atomic_read(&check_state.started) == `0`);
2109
2110	for (i = `0`; i < check_state.total_threads; i++) {
2111	if (check_state.infos[i].result) {
2112	ret = check_state.infos[i].result;
2113	goto out;
2114	}
2115	}
2116
2117	out:
2118	rw_unlock(w: `0`, b: c->root);
2119	return ret;
2120	}
2121
2122	void bch_initial_gc_finish(struct cache_set *c)
2123	{
2124	struct cache *ca = c->cache;
2125	struct bucket *b;
2126
2127	bch_btree_gc_finish(c);
2128
2129	mutex_lock(&c->bucket_lock);
2130
2131	/*
2132	* We need to put some unused buckets directly on the prio freelist in
2133	* order to get the allocator thread started - it needs freed buckets in
2134	* order to rewrite the prios and gens, and it needs to rewrite prios
2135	* and gens in order to free buckets.
2136	*
2137	* This is only safe for buckets that have no live data in them, which
2138	* there should always be some of.
2139	*/
2140	for_each_bucket(b, ca) {
2141	if (fifo_full(&ca->free[RESERVE_PRIO]) &&
2142	fifo_full(&ca->free[RESERVE_BTREE]))
2143	break;
2144
2145	if (bch_can_invalidate_bucket(ca, b) &&
2146	!GC_MARK(k: b)) {
2147	__bch_invalidate_one_bucket(ca, b);
2148	if (!fifo_push(&ca->free[RESERVE_PRIO],
2149	b - ca->buckets))
2150	fifo_push(&ca->free[RESERVE_BTREE],
2151	b - ca->buckets);
2152	}
2153	}
2154
2155	mutex_unlock(lock: &c->bucket_lock);
2156	}
2157
/* Btree insertion */
2159
2160	static bool btree_insert_key(struct btree b, struct* bkey *k,
2161	struct bkey *replace_key)
2162	{
2163	unsigned int status;
2164
2165	BUG_ON(bkey_cmp(k, &b->key) > `0`);
2166
2167	status = bch_btree_insert_key(b: &b->keys, k, replace_key);
2168	if (status != BTREE_INSERT_STATUS_NO_INSERT) {
2169	bch_check_keys(&b->keys, "%u for %s", status,
2170	replace_key ? "replace" : "insert");
2171
2172	trace_bcache_btree_insert_key(b, k, op: replace_key != NULL,
2173	status);
2174	return true;
2175	} else
2176	return false;
2177	}
2178
2179	static size_t insert_u64s_remaining(struct btree *b)
2180	{
2181	long ret = bch_btree_keys_u64s_remaining(b: &b->keys);
2182
2183	/*
2184	* Might land in the middle of an existing extent and have to split it
2185	*/
2186	if (b->keys.ops->is_extents)
2187	ret -= KEY_MAX_U64S;
2188
2189	return max(ret, `0L`);
2190	}
2191
2192	static bool bch_btree_insert_keys(struct btree b, struct* btree_op *op,
2193	struct keylist *insert_keys,
2194	struct bkey *replace_key)
2195	{
2196	bool ret = false;
2197	int oldsize = bch_count_data(b: &b->keys);
2198
2199	while (!bch_keylist_empty(l: insert_keys)) {
2200	struct bkey *k = insert_keys->keys;
2201
2202	if (bkey_u64s(k) > insert_u64s_remaining(b))
2203	break;
2204
2205	if (bkey_cmp(l: k, r: &b->key) <= `0`) {
2206	if (!b->level)
2207	bkey_put(c: b->c, k);
2208
2209	ret \|= btree_insert_key(b, k, replace_key);
2210	bch_keylist_pop_front(l: insert_keys);
2211	} else if (bkey_cmp(l: &START_KEY(k), r: &b->key) < `0`) {
2212	BKEY_PADDED(key) temp;
2213	bkey_copy(&temp.key, insert_keys->keys);
2214
2215	bch_cut_back(where: &b->key, k: &temp.key);
2216	bch_cut_front(where: &b->key, k: insert_keys->keys);
2217
2218	ret \|= btree_insert_key(b, k: &temp.key, replace_key);
2219	break;
2220	} else {
2221	break;
2222	}
2223	}
2224
2225	if (!ret)
2226	op->insert_collision = true;
2227
2228	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
2229
2230	BUG_ON(bch_count_data(&b->keys) < oldsize);
2231	return ret;
2232	}
2233
2234	static int btree_split(struct btree b, struct* btree_op *op,
2235	struct keylist *insert_keys,
2236	struct bkey *replace_key)
2237	{
2238	bool split;
2239	struct btree n1, n2 = NULL, *n3 = NULL;
2240	uint64_t start_time = local_clock();
2241	struct closure cl;
2242	struct keylist parent_keys;
2243
2244	closure_init_stack(cl: &cl);
2245	bch_keylist_init(l: &parent_keys);
2246
2247	if (btree_check_reserve(b, op)) {
2248	if (!b->level)
2249	return -EINTR;
2250	else
2251	WARN(`1`, "insufficient reserve for split\n");
2252	}
2253
2254	n1 = btree_node_alloc_replacement(b, op);
2255	if (IS_ERR(ptr: n1))
2256	goto err;
2257
2258	split = set_blocks(btree_bset_first(n1),
2259	block_bytes(n1->c->cache)) > (btree_blocks(b) * `4`) / `5`;
2260
2261	if (split) {
2262	unsigned int keys = `0`;
2263
2264	trace_bcache_btree_node_split(b, keys: btree_bset_first(b: n1)->keys);
2265
2266	n2 = bch_btree_node_alloc(c: b->c, op, level: b->level, parent: b->parent);
2267	if (IS_ERR(ptr: n2))
2268	goto err_free1;
2269
2270	if (!b->parent) {
2271	n3 = bch_btree_node_alloc(c: b->c, op, level: b->level + `1`, NULL);
2272	if (IS_ERR(ptr: n3))
2273	goto err_free2;
2274	}
2275
2276	mutex_lock(&n1->write_lock);
2277	mutex_lock(&n2->write_lock);
2278
2279	bch_btree_insert_keys(b: n1, op, insert_keys, replace_key);
2280
2281	/*
2282	* Has to be a linear search because we don't have an auxiliary
2283	* search tree yet
2284	*/
2285
2286	while (keys < (btree_bset_first(b: n1)->keys * `3`) / `5`)
2287	keys += bkey_u64s(k: bset_bkey_idx(i: btree_bset_first(b: n1),
2288	idx: keys));
2289
2290	bkey_copy_key(dest: &n1->key,
2291	src: bset_bkey_idx(i: btree_bset_first(b: n1), idx: keys));
2292	keys += bkey_u64s(k: bset_bkey_idx(i: btree_bset_first(b: n1), idx: keys));
2293
2294	btree_bset_first(b: n2)->keys = btree_bset_first(b: n1)->keys - keys;
2295	btree_bset_first(b: n1)->keys = keys;
2296
2297	memcpy(btree_bset_first(n2)->start,
2298	bset_bkey_last(btree_bset_first(n1)),
2299	btree_bset_first(n2)->keys * sizeof(uint64_t));
2300
2301	bkey_copy_key(dest: &n2->key, src: &b->key);
2302
2303	bch_keylist_add(l: &parent_keys, k: &n2->key);
2304	bch_btree_node_write(b: n2, parent: &cl);
2305	mutex_unlock(lock: &n2->write_lock);
2306	rw_unlock(w: true, b: n2);
2307	} else {
2308	trace_bcache_btree_node_compact(b, keys: btree_bset_first(b: n1)->keys);
2309
2310	mutex_lock(&n1->write_lock);
2311	bch_btree_insert_keys(b: n1, op, insert_keys, replace_key);
2312	}
2313
2314	bch_keylist_add(l: &parent_keys, k: &n1->key);
2315	bch_btree_node_write(b: n1, parent: &cl);
2316	mutex_unlock(lock: &n1->write_lock);
2317
2318	if (n3) {
2319	/ Depth increases, make a new root /
2320	mutex_lock(&n3->write_lock);
2321	bkey_copy_key(dest: &n3->key, src: &MAX_KEY);
2322	bch_btree_insert_keys(b: n3, op, insert_keys: &parent_keys, NULL);
2323	bch_btree_node_write(b: n3, parent: &cl);
2324	mutex_unlock(lock: &n3->write_lock);
2325
2326	closure_sync(cl: &cl);
2327	bch_btree_set_root(b: n3);
2328	rw_unlock(w: true, b: n3);
2329	} else if (!b->parent) {
2330	/ Root filled up but didn't need to be split /
2331	closure_sync(cl: &cl);
2332	bch_btree_set_root(b: n1);
2333	} else {
2334	/ Split a non root node /
2335	closure_sync(cl: &cl);
2336	make_btree_freeing_key(b, k: parent_keys.top);
2337	bch_keylist_push(l: &parent_keys);
2338
2339	bch_btree_insert_node(b: b->parent, op, insert_keys: &parent_keys, NULL, NULL);
2340	BUG_ON(!bch_keylist_empty(&parent_keys));
2341	}
2342
2343	btree_node_free(b);
2344	rw_unlock(w: true, b: n1);
2345
2346	bch_time_stats_update(stats: &b->c->btree_split_time, time: start_time);
2347
2348	return `0`;
2349	err_free2:
2350	bkey_put(c: b->c, k: &n2->key);
2351	btree_node_free(b: n2);
2352	rw_unlock(w: true, b: n2);
2353	err_free1:
2354	bkey_put(c: b->c, k: &n1->key);
2355	btree_node_free(b: n1);
2356	rw_unlock(w: true, b: n1);
2357	err:
2358	WARN(`1`, "bcache: btree split failed (level %u)", b->level);
2359
2360	if (n3 == ERR_PTR(error: -EAGAIN) \|\|
2361	n2 == ERR_PTR(error: -EAGAIN) \|\|
2362	n1 == ERR_PTR(error: -EAGAIN))
2363	return -EAGAIN;
2364
2365	return -ENOMEM;
2366	}
2367
2368	static int bch_btree_insert_node(struct btree b, struct* btree_op *op,
2369	struct keylist *insert_keys,
2370	atomic_t *journal_ref,
2371	struct bkey *replace_key)
2372	{
2373	struct closure cl;
2374
2375	BUG_ON(b->level && replace_key);
2376
2377	closure_init_stack(cl: &cl);
2378
2379	mutex_lock(&b->write_lock);
2380
2381	if (write_block(b) != btree_bset_last(b) &&
2382	b->keys.last_set_unwritten)
2383	bch_btree_init_next(b); / just wrote a set /
2384
2385	if (bch_keylist_nkeys(l: insert_keys) > insert_u64s_remaining(b)) {
2386	mutex_unlock(lock: &b->write_lock);
2387	goto split;
2388	}
2389
2390	BUG_ON(write_block(b) != btree_bset_last(b));
2391
2392	if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2393	if (!b->level)
2394	bch_btree_leaf_dirty(b, journal_ref);
2395	else
2396	bch_btree_node_write(b, parent: &cl);
2397	}
2398
2399	mutex_unlock(lock: &b->write_lock);
2400
2401	/ wait for btree node write if necessary, after unlock /
2402	closure_sync(cl: &cl);
2403
2404	return `0`;
2405	split:
2406	if (current->bio_list) {
2407	op->lock = b->c->root->level + `1`;
2408	return -EAGAIN;
2409	} else if (op->lock <= b->c->root->level) {
2410	op->lock = b->c->root->level + `1`;
2411	return -EINTR;
2412	} else {
2413	/ Invalidated all iterators /
2414	int ret = btree_split(b, op, insert_keys, replace_key);
2415
2416	if (bch_keylist_empty(l: insert_keys))
2417	return `0`;
2418	else if (!ret)
2419	return -EINTR;
2420	return ret;
2421	}
2422	}
2423
2424	int bch_btree_insert_check_key(struct btree b, struct* btree_op *op,
2425	struct bkey *check_key)
2426	{
2427	int ret = -EINTR;
2428	uint64_t btree_ptr = b->key.ptr[`0`];
2429	unsigned long seq = b->seq;
2430	struct keylist insert;
2431	bool upgrade = op->lock == -`1`;
2432
2433	bch_keylist_init(l: &insert);
2434
2435	if (upgrade) {
2436	rw_unlock(w: false, b);
2437	rw_lock(w: true, b, level: b->level);
2438
2439	if (b->key.ptr[`0`] != btree_ptr \|\|
2440	b->seq != seq + `1`) {
2441	op->lock = b->level;
2442	goto out;
2443	}
2444	}
2445
2446	SET_KEY_PTRS(k: check_key, v: `1`);
2447	get_random_bytes(buf: &check_key->ptr[`0`], len: sizeof(uint64_t));
2448
2449	SET_PTR_DEV(k: check_key, i: `0`, PTR_CHECK_DEV);
2450
2451	bch_keylist_add(l: &insert, k: check_key);
2452
2453	ret = bch_btree_insert_node(b, op, insert_keys: &insert, NULL, NULL);
2454
2455	BUG_ON(!ret && !bch_keylist_empty(&insert));
2456	out:
2457	if (upgrade)
2458	downgrade_write(sem: &b->lock);
2459	return ret;
2460	}
2461
/*
 * Per-call state for bch_btree_insert().  btree_insert_fn() receives only the
 * embedded btree_op and recovers this structure with container_of().
 */
struct btree_insert_op {
	/* Embedded traversal op; must stay a member named "op" for container_of() */
	struct btree_op op;
	/* Keys still to be inserted */
	struct keylist *keys;
	/* Forwarded unchanged to bch_btree_insert_node() */
	atomic_t *journal_ref;
	/* Forwarded unchanged to bch_btree_insert_node() */
	struct bkey *replace_key;
};
2468
2469	static int btree_insert_fn(struct btree_op b_op, struct* btree *b)
2470	{
2471	struct btree_insert_op *op = container_of(b_op,
2472	struct btree_insert_op, op);
2473
2474	int ret = bch_btree_insert_node(b, op: &op->op, insert_keys: op->keys,
2475	journal_ref: op->journal_ref, replace_key: op->replace_key);
2476	if (ret && !bch_keylist_empty(l: op->keys))
2477	return ret;
2478	else
2479	return MAP_DONE;
2480	}
2481
2482	int bch_btree_insert(struct cache_set c, struct* keylist *keys,
2483	atomic_t journal_ref, struct* bkey *replace_key)
2484	{
2485	struct btree_insert_op op;
2486	int ret = `0`;
2487
2488	BUG_ON(current->bio_list);
2489	BUG_ON(bch_keylist_empty(keys));
2490
2491	bch_btree_op_init(op: &op.op, write_lock_level: `0`);
2492	op.keys = keys;
2493	op.journal_ref = journal_ref;
2494	op.replace_key = replace_key;
2495
2496	while (!ret && !bch_keylist_empty(l: keys)) {
2497	op.op.lock = `0`;
2498	ret = bch_btree_map_leaf_nodes(op: &op.op, c,
2499	from: &START_KEY(keys->keys),
2500	fn: btree_insert_fn);
2501	}
2502
2503	if (ret) {
2504	struct bkey *k;
2505
2506	pr_err("error %i\n", ret);
2507
2508	while ((k = bch_keylist_pop(l: keys)))
2509	bkey_put(c, k);
2510	} else if (op.op.insert_collision)
2511	ret = -ESRCH;
2512
2513	return ret;
2514	}
2515
2516	void bch_btree_set_root(struct btree *b)
2517	{
2518	unsigned int i;
2519	struct closure cl;
2520
2521	closure_init_stack(cl: &cl);
2522
2523	trace_bcache_btree_set_root(b);
2524
2525	BUG_ON(!b->written);
2526
2527	for (i = `0`; i < KEY_PTRS(k: &b->key); i++)
2528	BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
2529
2530	mutex_lock(&b->c->bucket_lock);
2531	list_del_init(entry: &b->list);
2532	mutex_unlock(lock: &b->c->bucket_lock);
2533
2534	b->c->root = b;
2535
2536	bch_journal_meta(c: b->c, cl: &cl);
2537	closure_sync(cl: &cl);
2538	}
2539
/* Map across nodes or keys */
2541
2542	static int bch_btree_map_nodes_recurse(struct btree b, struct* btree_op *op,
2543	struct bkey *from,
2544	btree_map_nodes_fn fn, int* flags)
2545	{
2546	int ret = MAP_CONTINUE;
2547
2548	if (b->level) {
2549	struct bkey *k;
2550	struct btree_iter iter;
2551
2552	bch_btree_iter_init(b: &b->keys, iter: &iter, search: from);
2553
2554	while ((k = bch_btree_iter_next_filter(iter: &iter, b: &b->keys,
2555	fn: bch_ptr_bad))) {
2556	ret = bcache_btree(map_nodes_recurse, k, b,
2557	op, from, fn, flags);
2558	from = NULL;
2559
2560	if (ret != MAP_CONTINUE)
2561	return ret;
2562	}
2563	}
2564
2565	if (!b->level \|\| flags == MAP_ALL_NODES)
2566	ret = fn(op, b);
2567
2568	return ret;
2569	}
2570
2571	int __bch_btree_map_nodes(struct btree_op op, struct* cache_set *c,
2572	struct bkey from, btree_map_nodes_fn fn, int flags)
2573	{
2574	return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags);
2575	}
2576
2577	int bch_btree_map_keys_recurse(struct btree b, struct* btree_op *op,
2578	struct bkey from, btree_map_keys_fn fn,
2579	int flags)
2580	{
2581	int ret = MAP_CONTINUE;
2582	struct bkey *k;
2583	struct btree_iter iter;
2584
2585	bch_btree_iter_init(b: &b->keys, iter: &iter, search: from);
2586
2587	while ((k = bch_btree_iter_next_filter(iter: &iter, b: &b->keys, fn: bch_ptr_bad))) {
2588	ret = !b->level
2589	? fn(op, b, k)
2590	: bcache_btree(map_keys_recurse, k,
2591	b, op, from, fn, flags);
2592	from = NULL;
2593
2594	if (ret != MAP_CONTINUE)
2595	return ret;
2596	}
2597
2598	if (!b->level && (flags & MAP_END_KEY))
2599	ret = fn(op, b, &KEY(KEY_INODE(&b->key),
2600	KEY_OFFSET(&b->key), `0`));
2601
2602	return ret;
2603	}
2604
2605	int bch_btree_map_keys(struct btree_op op, struct* cache_set *c,
2606	struct bkey from, btree_map_keys_fn fn, int flags)
2607	{
2608	return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags);
2609	}
2610
/* Keybuf code */
2612
2613	static inline int keybuf_cmp(struct keybuf_key l, struct* keybuf_key *r)
2614	{
2615	/ Overlapping keys compare equal /
2616	if (bkey_cmp(l: &l->key, r: &START_KEY(&r->key)) <= `0`)
2617	return -`1`;
2618	if (bkey_cmp(l: &START_KEY(&l->key), r: &r->key) >= `0`)
2619	return `1`;
2620	return `0`;
2621	}
2622
2623	static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2624	struct keybuf_key *r)
2625	{
2626	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -`1`, `1`);
2627	}
2628
/*
 * State for one bch_refill_keybuf() scan.  refill_keybuf_fn() receives only
 * the embedded btree_op and recovers this structure with container_of().
 */
struct refill {
	/* Embedded traversal op; must stay a member named "op" for container_of() */
	struct btree_op op;
	/* Number of keys successfully added to buf during this scan */
	unsigned int nr_found;
	/* Key buffer being filled */
	struct keybuf *buf;
	/* Scan stops at the first key that compares greater than this */
	struct bkey *end;
	/* Predicate selecting which keys are added to buf */
	keybuf_pred_fn *pred;
};
2636
2637	static int refill_keybuf_fn(struct btree_op op, struct* btree *b,
2638	struct bkey *k)
2639	{
2640	struct refill refill = container_of(op, struct* refill, op);
2641	struct keybuf *buf = refill->buf;
2642	int ret = MAP_CONTINUE;
2643
2644	if (bkey_cmp(l: k, r: refill->end) > `0`) {
2645	ret = MAP_DONE;
2646	goto out;
2647	}
2648
2649	if (!KEY_SIZE(k)) / end key /
2650	goto out;
2651
2652	if (refill->pred(buf, k)) {
2653	struct keybuf_key *w;
2654
2655	spin_lock(lock: &buf->lock);
2656
2657	w = array_alloc(&buf->freelist);
2658	if (!w) {
2659	spin_unlock(lock: &buf->lock);
2660	return MAP_DONE;
2661	}
2662
2663	w->private = NULL;
2664	bkey_copy(&w->key, k);
2665
2666	if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2667	array_free(&buf->freelist, w);
2668	else
2669	refill->nr_found++;
2670
2671	if (array_freelist_empty(&buf->freelist))
2672	ret = MAP_DONE;
2673
2674	spin_unlock(lock: &buf->lock);
2675	}
2676	out:
2677	buf->last_scanned = *k;
2678	return ret;
2679	}
2680
2681	void bch_refill_keybuf(struct cache_set c, struct* keybuf *buf,
2682	struct bkey end, keybuf_pred_fn pred)
2683	{
2684	struct bkey start = buf->last_scanned;
2685	struct refill refill;
2686
2687	cond_resched();
2688
2689	bch_btree_op_init(op: &refill.op, write_lock_level: -`1`);
2690	refill.nr_found = `0`;
2691	refill.buf = buf;
2692	refill.end = end;
2693	refill.pred = pred;
2694
2695	bch_btree_map_keys(op: &refill.op, c, from: &buf->last_scanned,
2696	fn: refill_keybuf_fn, MAP_END_KEY);
2697
2698	trace_bcache_keyscan(nr_found: refill.nr_found,
2699	start_inode: KEY_INODE(k: &start), start_offset: KEY_OFFSET(k: &start),
2700	end_inode: KEY_INODE(k: &buf->last_scanned),
2701	end_offset: KEY_OFFSET(k: &buf->last_scanned));
2702
2703	spin_lock(lock: &buf->lock);
2704
2705	if (!RB_EMPTY_ROOT(&buf->keys)) {
2706	struct keybuf_key *w;
2707
2708	w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2709	buf->start = START_KEY(&w->key);
2710
2711	w = RB_LAST(&buf->keys, struct keybuf_key, node);
2712	buf->end = w->key;
2713	} else {
2714	buf->start = MAX_KEY;
2715	buf->end = MAX_KEY;
2716	}
2717
2718	spin_unlock(lock: &buf->lock);
2719	}
2720
2721	static void __bch_keybuf_del(struct keybuf buf, struct* keybuf_key *w)
2722	{
2723	rb_erase(&w->node, &buf->keys);
2724	array_free(&buf->freelist, w);
2725	}
2726
2727	void bch_keybuf_del(struct keybuf buf, struct* keybuf_key *w)
2728	{
2729	spin_lock(lock: &buf->lock);
2730	__bch_keybuf_del(buf, w);
2731	spin_unlock(lock: &buf->lock);
2732	}
2733
2734	bool bch_keybuf_check_overlapping(struct keybuf buf, struct* bkey *start,
2735	struct bkey *end)
2736	{
2737	bool ret = false;
2738	struct keybuf_key p, w, s;
2739
2740	s.key = *start;
2741
2742	if (bkey_cmp(l: end, r: &buf->start) <= `0` \|\|
2743	bkey_cmp(l: start, r: &buf->end) >= `0`)
2744	return false;
2745
2746	spin_lock(lock: &buf->lock);
2747	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
2748
2749	while (w && bkey_cmp(l: &START_KEY(&w->key), r: end) < `0`) {
2750	p = w;
2751	w = RB_NEXT(w, node);
2752
2753	if (p->private)
2754	ret = true;
2755	else
2756	__bch_keybuf_del(buf, w: p);
2757	}
2758
2759	spin_unlock(lock: &buf->lock);
2760	return ret;
2761	}
2762
2763	struct keybuf_key bch_keybuf_next(struct* keybuf *buf)
2764	{
2765	struct keybuf_key *w;
2766
2767	spin_lock(lock: &buf->lock);
2768
2769	w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2770
2771	while (w && w->private)
2772	w = RB_NEXT(w, node);
2773
2774	if (w)
2775	w->private = ERR_PTR(error: -EINTR);
2776
2777	spin_unlock(lock: &buf->lock);
2778	return w;
2779	}
2780
2781	struct keybuf_key bch_keybuf_next_rescan(struct* cache_set *c,
2782	struct keybuf *buf,
2783	struct bkey *end,
2784	keybuf_pred_fn *pred)
2785	{
2786	struct keybuf_key *ret;
2787
2788	while (`1`) {
2789	ret = bch_keybuf_next(buf);
2790	if (ret)
2791	break;
2792
2793	if (bkey_cmp(l: &buf->last_scanned, r: end) >= `0`) {
2794	pr_debug("scan finished\n");
2795	break;
2796	}
2797
2798	bch_refill_keybuf(c, buf, end, pred);
2799	}
2800
2801	return ret;
2802	}
2803
2804	void bch_keybuf_init(struct keybuf *buf)
2805	{
2806	buf->last_scanned = MAX_KEY;
2807	buf->keys = RB_ROOT;
2808
2809	spin_lock_init(&buf->lock);
2810	array_allocator_init(&buf->freelist);
2811	}
2812
2813	void bch_btree_exit(void)
2814	{
2815	if (btree_io_wq)
2816	destroy_workqueue(wq: btree_io_wq);
2817	}
2818
2819	int __init bch_btree_init(void)
2820	{
2821	btree_io_wq = alloc_workqueue(fmt: "bch_btree_io", flags: WQ_MEM_RECLAIM, max_active: `0`);
2822	if (!btree_io_wq)
2823	return -ENOMEM;
2824
2825	return `0`;
2826	}
2827

source code of linux/drivers/md/bcache/btree.c