// SPDX-License-Identifier: GPL-2.0
/*
 * bcache journalling code, for btree insertions
 *
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"

#include <trace/events/bcache.h>

/*
 * Journal replay/recovery:
 *
 * This code is all driven from run_cache_set(); we first read the journal
 * entries, do some other stuff, then we mark all the keys in the journal
 * entries (same as garbage collection would), then we replay them - reinserting
 * them into the cache in precisely the same order as they appear in the
 * journal.
 *
 * We only journal keys that go in leaf nodes, which simplifies things quite a
 * bit.
 */

static void journal_read_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;

	closure_put(cl);
}

static int journal_read_bucket(struct cache *ca, struct list_head *list,
			       unsigned int bucket_index)
{
	struct journal_device *ja = &ca->journal;
	struct bio *bio = &ja->bio;

	struct journal_replay *i;
	struct jset *j, *data = ca->set->journal.w[0].data;
	struct closure cl;
	unsigned int len, left, offset = 0;
	int ret = 0;
	sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);

	closure_init_stack(&cl);

	pr_debug("reading %u\n", bucket_index);

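	/*
	 * Scan the bucket in chunks: each pass reads at most one journal
	 * write buffer's worth (PAGE_SECTORS << JSET_BITS sectors) into
	 * 'data' and then walks the jsets it contains.
	 */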
	while (offset < ca->sb.bucket_size) {
reread:		left = ca->sb.bucket_size - offset;
		len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS);

		bio_reset(bio, ca->bdev, REQ_OP_READ);
		bio->bi_iter.bi_sector = bucket + offset;
		bio->bi_iter.bi_size = len << 9;

		bio->bi_end_io = journal_read_endio;
		bio->bi_private = &cl;
		bch_bio_map(bio, data);

		closure_bio_submit(ca->set, bio, &cl);
		closure_sync(&cl);

		/* This function could be simpler now since we no longer write
		 * journal entries that overlap bucket boundaries; this means
		 * the start of a bucket will always have a valid journal entry
		 * if it has any journal entries at all.
		 */

		j = data;
		while (len) {
			struct list_head *where;
			size_t blocks, bytes = set_bytes(j);

			if (j->magic != jset_magic(&ca->sb)) {
				pr_debug("%u: bad magic\n", bucket_index);
				return ret;
			}

			if (bytes > left << 9 ||
			    bytes > PAGE_SIZE << JSET_BITS) {
				pr_info("%u: too big, %zu bytes, offset %u\n",
					bucket_index, bytes, offset);
				return ret;
			}

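			/*
			 * The jset extends past what has been read so far:
			 * go back and reread it from the current offset.
			 */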
			if (bytes > len << 9)
				goto reread;

			if (j->csum != csum_set(j)) {
				pr_info("%u: bad csum, %zu bytes, offset %u\n",
					bucket_index, bytes, offset);
				return ret;
			}

			blocks = set_blocks(j, block_bytes(ca));

			/*
			 * Nodes in 'list' are kept in increasing order of
			 * i->j.seq: the node at the head has the smallest
			 * (oldest) journal seq, and the node at the tail has
			 * the largest (newest) journal seq.
			 */

			/*
			 * Check the oldest jsets against j->last_seq. If
			 * i->j.seq < j->last_seq, the oldest jset in the
			 * list has expired and is useless, so remove it
			 * from the list. Otherwise, j is a candidate jset
			 * for the following checks.
			 */
			while (!list_empty(list)) {
				i = list_first_entry(list,
					struct journal_replay, list);
				if (i->j.seq >= j->last_seq)
					break;
				list_del(&i->list);
				kfree(i);
			}

			/* iterate over the list in reverse order (from the newest jset) */
			list_for_each_entry_reverse(i, list, list) {
				if (j->seq == i->j.seq)
					goto next_set;

				/*
				 * If j->seq is less than any i->j.last_seq
				 * in the list, j is an expired and useless jset.
				 */
				if (j->seq < i->j.last_seq)
					goto next_set;

				/*
				 * 'where' points to the first jset in the list
				 * which is older than j.
				 */
				if (j->seq > i->j.seq) {
					where = &i->list;
					goto add;
				}
			}

			where = list;
add:
			i = kmalloc(offsetof(struct journal_replay, j) +
				    bytes, GFP_KERNEL);
			if (!i)
				return -ENOMEM;
			unsafe_memcpy(&i->j, j, bytes,
				/* "bytes" was calculated by set_bytes() above */);
			/* Add to the location after 'where' points to */
			list_add(&i->list, where);
			ret = 1;

			if (j->seq > ja->seq[bucket_index])
				ja->seq[bucket_index] = j->seq;
next_set:
			offset += blocks * ca->sb.block_size;
			len -= blocks * ca->sb.block_size;
			j = ((void *) j) + blocks * block_bytes(ca);
		}
	}

	return ret;
}

int bch_journal_read(struct cache_set *c, struct list_head *list)
{
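/*
 * Read one journal bucket, mark it as seen in the bitmap, and return from
 * bch_journal_read() on a fatal error; evaluates to nonzero if the bucket
 * contained journal entries.
 */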
#define read_bucket(b)						\
	({							\
		ret = journal_read_bucket(ca, list, b);		\
		__set_bit(b, bitmap);				\
		if (ret < 0)					\
			return ret;				\
		ret;						\
	})

	struct cache *ca = c->cache;
	int ret = 0;
	struct journal_device *ja = &ca->journal;
	DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS);
	unsigned int i, l, r, m;
	uint64_t seq;

	bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
	pr_debug("%u journal buckets\n", ca->sb.njournal_buckets);

	/*
	 * Read journal buckets ordered by golden ratio hash to quickly
	 * find a sequence of buckets with valid journal entries
	 */
	for (i = 0; i < ca->sb.njournal_buckets; i++) {
		/*
		 * We must try index l == 0 first, for correctness: the
		 * journal buckets form a circular buffer which might have
		 * wrapped around.
		 */
		l = (i * 2654435769U) % ca->sb.njournal_buckets;
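		/* 2654435769 == floor(2^32 / golden ratio): Fibonacci hashing */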

		if (test_bit(l, bitmap))
			break;

		if (read_bucket(l))
			goto bsearch;
	}

	/*
	 * If that fails, check all the buckets we haven't checked
	 * already
	 */
	pr_debug("falling back to linear search\n");

	for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets)
		if (read_bucket(l))
			goto bsearch;

	/* no journal entries on this device? */
	if (l == ca->sb.njournal_buckets)
		goto out;
bsearch:
	BUG_ON(list_empty(list));

	/* Binary search */
	m = l;
	r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
	pr_debug("starting binary search, l %u r %u\n", l, r);

	while (l + 1 < r) {
		seq = list_entry(list->prev, struct journal_replay,
				 list)->j.seq;

		m = (l + r) >> 1;
		read_bucket(m);

		if (seq != list_entry(list->prev, struct journal_replay,
				      list)->j.seq)
			l = m;
		else
			r = m;
	}

	/*
	 * Read buckets in reverse order until we stop finding more
	 * journal entries
	 */
	pr_debug("finishing up: m %u njournal_buckets %u\n",
		 m, ca->sb.njournal_buckets);
	l = m;

	while (1) {
		if (!l--)
			l = ca->sb.njournal_buckets - 1;

		if (l == m)
			break;

		if (test_bit(l, bitmap))
			continue;

		if (!read_bucket(l))
			break;
	}

	seq = 0;

	for (i = 0; i < ca->sb.njournal_buckets; i++)
		if (ja->seq[i] > seq) {
			seq = ja->seq[i];
			/*
			 * When journal_reclaim() goes to allocate for
			 * the first time, it'll use the bucket after
			 * ja->cur_idx
			 */
			ja->cur_idx = i;
			ja->last_idx = ja->discard_idx = (i + 1) %
					ca->sb.njournal_buckets;

		}

out:
	if (!list_empty(list))
		c->journal.seq = list_entry(list->prev,
					    struct journal_replay,
					    list)->j.seq;

	return 0;
#undef read_bucket
}

void bch_journal_mark(struct cache_set *c, struct list_head *list)
{
	atomic_t p = { 0 };
	struct bkey *k;
	struct journal_replay *i;
	struct journal *j = &c->journal;
	uint64_t last = j->seq;

	/*
	 * journal.pin should never fill up - we never write a journal
	 * entry when it would fill up. But if for some reason it does, we
	 * iterate over the list in reverse order so that we can just skip that
	 * refcount instead of bugging.
	 */

	list_for_each_entry_reverse(i, list, list) {
		BUG_ON(last < i->j.seq);
		i->pin = NULL;

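		/*
		 * Push placeholder pin entries (refcount 0) for any journal
		 * sequence numbers between this entry and the last one
		 * processed, so the pin fifo stays in step with the journal
		 * sequence numbers.
		 */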
		while (last-- != i->j.seq)
			if (fifo_free(&j->pin) > 1) {
				fifo_push_front(&j->pin, p);
				atomic_set(&fifo_front(&j->pin), 0);
			}

		if (fifo_free(&j->pin) > 1) {
			fifo_push_front(&j->pin, p);
			i->pin = &fifo_front(&j->pin);
			atomic_set(i->pin, 1);
		}

		for (k = i->j.start;
		     k < bset_bkey_last(&i->j);
		     k = bkey_next(k))
			if (!__bch_extent_invalid(c, k)) {
				unsigned int j;

				for (j = 0; j < KEY_PTRS(k); j++)
					if (ptr_available(c, k, j))
						atomic_inc(&PTR_BUCKET(c, k, j)->pin);

				bch_initial_mark_key(c, 0, k);
			}
	}
}

static bool is_discard_enabled(struct cache_set *s)
{
	struct cache *ca = s->cache;

	if (ca->discard)
		return true;

	return false;
}

int bch_journal_replay(struct cache_set *s, struct list_head *list)
{
	int ret = 0, keys = 0, entries = 0;
	struct bkey *k;
	struct journal_replay *i =
		list_entry(list->prev, struct journal_replay, list);

	uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
	struct keylist keylist;

	list_for_each_entry(i, list, list) {
		BUG_ON(i->pin && atomic_read(i->pin) != 1);

		if (n != i->j.seq) {
			if (n == start && is_discard_enabled(s))
				pr_info("journal entries %llu-%llu may be discarded! (replaying %llu-%llu)\n",
					n, i->j.seq - 1, start, end);
			else {
				pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
					n, i->j.seq - 1, start, end);
				ret = -EIO;
				goto err;
			}
		}

		for (k = i->j.start;
		     k < bset_bkey_last(&i->j);
		     k = bkey_next(k)) {
			trace_bcache_journal_replay_key(k);

			bch_keylist_init_single(&keylist, k);

			ret = bch_btree_insert(s, &keylist, i->pin, NULL);
			if (ret)
				goto err;

			BUG_ON(!bch_keylist_empty(&keylist));
			keys++;

			cond_resched();
		}

		if (i->pin)
			atomic_dec(i->pin);
		n = i->j.seq + 1;
		entries++;
	}

	pr_info("journal replay done, %i keys in %i entries, seq %llu\n",
		keys, entries, end);
err:
	while (!list_empty(list)) {
		i = list_first_entry(list, struct journal_replay, list);
		list_del(&i->list);
		kfree(i);
	}

	return ret;
}

void bch_journal_space_reserve(struct journal *j)
{
	j->do_reserve = true;
}

/* Journalling */

static void btree_flush_write(struct cache_set *c)
{
	struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR];
	unsigned int i, nr;
	int ref_nr;
	atomic_t *fifo_front_p, *now_fifo_front_p;
	size_t mask;

	if (c->journal.btree_flushing)
		return;

	spin_lock(&c->journal.flush_write_lock);
	if (c->journal.btree_flushing) {
		spin_unlock(&c->journal.flush_write_lock);
		return;
	}
	c->journal.btree_flushing = true;
	spin_unlock(&c->journal.flush_write_lock);

	/* get the oldest journal entry and check its refcount */
	spin_lock(&c->journal.lock);
	fifo_front_p = &fifo_front(&c->journal.pin);
	ref_nr = atomic_read(fifo_front_p);
	if (ref_nr <= 0) {
		/*
		 * do nothing if no btree node references
		 * the oldest journal entry
		 */
		spin_unlock(&c->journal.lock);
		goto out;
	}
	spin_unlock(&c->journal.lock);

	mask = c->journal.pin.mask;
	nr = 0;
	atomic_long_inc(&c->flush_write);
	memset(btree_nodes, 0, sizeof(btree_nodes));

	mutex_lock(&c->bucket_lock);
	list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
		/*
		 * It is safe to get now_fifo_front_p without holding
		 * c->journal.lock here, because we don't need an exact
		 * value, just to check whether the front pointer of
		 * c->journal.pin has changed.
		 */
		now_fifo_front_p = &fifo_front(&c->journal.pin);
		/*
		 * If the oldest journal entry is reclaimed and the front
		 * pointer of c->journal.pin changes, it is unnecessary
		 * to scan c->btree_cache any further; just quit the loop and
		 * flush out what we have already.
		 */
		if (now_fifo_front_p != fifo_front_p)
			break;
		/*
		 * Quit this loop once all matching btree nodes have been
		 * scanned and recorded in btree_nodes[].
		 */
		ref_nr = atomic_read(fifo_front_p);
		if (nr >= ref_nr)
			break;

		if (btree_node_journal_flush(b))
			pr_err("BUG: flush_write bit should not be set here!\n");

		mutex_lock(&b->write_lock);

		if (!btree_node_dirty(b)) {
			mutex_unlock(&b->write_lock);
			continue;
		}

		if (!btree_current_write(b)->journal) {
			mutex_unlock(&b->write_lock);
			continue;
		}

		/*
		 * Only select the btree node which exactly references
		 * the oldest journal entry.
		 *
		 * If the journal entry pointed to by fifo_front_p is
		 * reclaimed in parallel, don't worry:
		 * - the list_for_each_xxx loop will quit when it checks
		 *   the next now_fifo_front_p.
		 * - If there are matched nodes recorded in btree_nodes[],
		 *   they are clean now (this is why and how the oldest
		 *   journal entry can be reclaimed). These selected nodes
		 *   will be ignored and skipped in the following for-loop.
		 */
		if (((btree_current_write(b)->journal - fifo_front_p) &
		     mask) != 0) {
			mutex_unlock(&b->write_lock);
			continue;
		}

		set_btree_node_journal_flush(b);

		mutex_unlock(&b->write_lock);

		btree_nodes[nr++] = b;
		/*
		 * To avoid holding c->bucket_lock for too long, scan for at
		 * most BTREE_FLUSH_NR matching btree nodes. If more btree
		 * nodes reference the oldest journal entry, try to flush
		 * them the next time btree_flush_write() is called.
		 */
		if (nr == BTREE_FLUSH_NR)
			break;
	}
	mutex_unlock(&c->bucket_lock);

	for (i = 0; i < nr; i++) {
		b = btree_nodes[i];
		if (!b) {
			pr_err("BUG: btree_nodes[%d] is NULL\n", i);
			continue;
		}

		/* safe to check without holding b->write_lock */
		if (!btree_node_journal_flush(b)) {
			pr_err("BUG: bnode %p: journal_flush bit cleaned\n", b);
			continue;
		}

		mutex_lock(&b->write_lock);
		if (!btree_current_write(b)->journal) {
			clear_bit(BTREE_NODE_journal_flush, &b->flags);
			mutex_unlock(&b->write_lock);
			pr_debug("bnode %p: written by others\n", b);
			continue;
		}

		if (!btree_node_dirty(b)) {
			clear_bit(BTREE_NODE_journal_flush, &b->flags);
			mutex_unlock(&b->write_lock);
			pr_debug("bnode %p: dirty bit cleaned by others\n", b);
			continue;
		}

		__bch_btree_node_write(b, NULL);
		clear_bit(BTREE_NODE_journal_flush, &b->flags);
		mutex_unlock(&b->write_lock);
	}

out:
	spin_lock(&c->journal.flush_write_lock);
	c->journal.btree_flushing = false;
	spin_unlock(&c->journal.flush_write_lock);
}

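/* Sequence number of the oldest journal entry still held in the pin fifo */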
#define last_seq(j)	((j)->seq - fifo_used(&(j)->pin) + 1)

static void journal_discard_endio(struct bio *bio)
{
	struct journal_device *ja =
		container_of(bio, struct journal_device, discard_bio);
	struct cache *ca = container_of(ja, struct cache, journal);

	atomic_set(&ja->discard_in_flight, DISCARD_DONE);

	closure_wake_up(&ca->set->journal.wait);
	closure_put(&ca->set->cl);
}

static void journal_discard_work(struct work_struct *work)
{
	struct journal_device *ja =
		container_of(work, struct journal_device, discard_work);

	submit_bio(&ja->discard_bio);
}

static void do_journal_discard(struct cache *ca)
{
	struct journal_device *ja = &ca->journal;
	struct bio *bio = &ja->discard_bio;

	if (!ca->discard) {
		ja->discard_idx = ja->last_idx;
		return;
	}

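	/*
	 * Discard state machine: DISCARD_READY -> DISCARD_IN_FLIGHT ->
	 * DISCARD_DONE -> (advance discard_idx) -> DISCARD_READY.
	 */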
	switch (atomic_read(&ja->discard_in_flight)) {
	case DISCARD_IN_FLIGHT:
		return;

	case DISCARD_DONE:
		ja->discard_idx = (ja->discard_idx + 1) %
			ca->sb.njournal_buckets;

		atomic_set(&ja->discard_in_flight, DISCARD_READY);
		fallthrough;

	case DISCARD_READY:
		if (ja->discard_idx == ja->last_idx)
			return;

		atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);

		bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD);
		bio->bi_iter.bi_sector = bucket_to_sector(ca->set,
						ca->sb.d[ja->discard_idx]);
		bio->bi_iter.bi_size = bucket_bytes(ca);
		bio->bi_end_io = journal_discard_endio;

		closure_get(&ca->set->cl);
		INIT_WORK(&ja->discard_work, journal_discard_work);
		queue_work(bch_journal_wq, &ja->discard_work);
	}
}

static unsigned int free_journal_buckets(struct cache_set *c)
{
	struct journal *j = &c->journal;
	struct cache *ca = c->cache;
	struct journal_device *ja = &c->cache->journal;
	unsigned int n;

	/* In case njournal_buckets is not a power of 2 */
	if (ja->cur_idx >= ja->discard_idx)
		n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx;
	else
		n = ja->discard_idx - ja->cur_idx;
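	/*
	 * e.g. njournal_buckets == 8, cur_idx == 6, discard_idx == 2:
	 * n = 8 + 2 - 6 == 4; one bucket (two with do_reserve) is always
	 * held back as a reserve below.
	 */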

	if (n > (1 + j->do_reserve))
		return n - (1 + j->do_reserve);

	return 0;
}

static void journal_reclaim(struct cache_set *c)
{
	struct bkey *k = &c->journal.key;
	struct cache *ca = c->cache;
	uint64_t last_seq;
	struct journal_device *ja = &ca->journal;
	atomic_t p __maybe_unused;

	atomic_long_inc(&c->reclaim);

	while (!atomic_read(&fifo_front(&c->journal.pin)))
		fifo_pop(&c->journal.pin, p);

	last_seq = last_seq(&c->journal);

	/* Update last_idx */

	while (ja->last_idx != ja->cur_idx &&
	       ja->seq[ja->last_idx] < last_seq)
		ja->last_idx = (ja->last_idx + 1) %
			ca->sb.njournal_buckets;

	do_journal_discard(ca);

	if (c->journal.blocks_free)
		goto out;

	if (!free_journal_buckets(c))
		goto out;

	ja->cur_idx = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
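	/* Point the journal key at the start of the newly chosen bucket */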
	k->ptr[0] = MAKE_PTR(0,
			     bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
			     ca->sb.nr_this_dev);
	atomic_long_inc(&c->reclaimed_journal_buckets);

	bkey_init(k);
	SET_KEY_PTRS(k, 1);
	c->journal.blocks_free = ca->sb.bucket_size >> c->block_bits;

out:
	if (!journal_full(&c->journal))
		__closure_wake_up(&c->journal.wait);
}

void bch_journal_next(struct journal *j)
{
	atomic_t p = { 1 };

	j->cur = (j->cur == j->w)
		? &j->w[1]
		: &j->w[0];

	/*
	 * The fifo_push() needs to happen at the same time as j->seq is
	 * incremented for last_seq() to be calculated correctly
	 */
	BUG_ON(!fifo_push(&j->pin, p));
	atomic_set(&fifo_back(&j->pin), 1);

	j->cur->data->seq = ++j->seq;
	j->cur->dirty = false;
	j->cur->need_write = false;
	j->cur->data->keys = 0;

	if (fifo_full(&j->pin))
		pr_debug("journal_pin full (%zu)\n", fifo_used(&j->pin));
}

static void journal_write_endio(struct bio *bio)
{
	struct journal_write *w = bio->bi_private;

	cache_set_err_on(bio->bi_status, w->c, "journal io error");
	closure_put(&w->c->journal.io);
}

static CLOSURE_CALLBACK(journal_write);

static CLOSURE_CALLBACK(journal_write_done)
{
	closure_type(j, struct journal, io);
	struct journal_write *w = (j->cur == j->w)
		? &j->w[1]
		: &j->w[0];

	__closure_wake_up(&w->wait);
	continue_at_nobarrier(cl, journal_write, bch_journal_wq);
}

static CLOSURE_CALLBACK(journal_write_unlock)
	__releases(&c->journal.lock)
{
	closure_type(c, struct cache_set, journal.io);

	c->journal.io_in_flight = 0;
	spin_unlock(&c->journal.lock);
}

static CLOSURE_CALLBACK(journal_write_unlocked)
	__releases(c->journal.lock)
{
	closure_type(c, struct cache_set, journal.io);
	struct cache *ca = c->cache;
	struct journal_write *w = c->journal.cur;
	struct bkey *k = &c->journal.key;
	unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) *
		ca->sb.block_size;

	struct bio *bio;
	struct bio_list list;

	bio_list_init(&list);

	if (!w->need_write) {
		closure_return_with_destructor(cl, journal_write_unlock);
		return;
	} else if (journal_full(&c->journal)) {
		journal_reclaim(c);
		spin_unlock(&c->journal.lock);

		btree_flush_write(c);
		continue_at(cl, journal_write, bch_journal_wq);
		return;
	}

	c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca));

	w->data->btree_level = c->root->level;

	bkey_copy(&w->data->btree_root, &c->root->key);
	bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);

	w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
	w->data->magic = jset_magic(&ca->sb);
	w->data->version = BCACHE_JSET_VERSION;
	w->data->last_seq = last_seq(&c->journal);
	w->data->csum = csum_set(w->data);

	for (i = 0; i < KEY_PTRS(k); i++) {
		ca = c->cache;
		bio = &ca->journal.bio;

		atomic_long_add(sectors, &ca->meta_sectors_written);

		bio_reset(bio, ca->bdev, REQ_OP_WRITE |
			  REQ_SYNC | REQ_META | REQ_PREFLUSH | REQ_FUA);
		bio->bi_iter.bi_sector = PTR_OFFSET(k, i);
		bio->bi_iter.bi_size = sectors << 9;

		bio->bi_end_io = journal_write_endio;
		bio->bi_private = w;
		bch_bio_map(bio, w->data);

		trace_bcache_journal_write(bio, w->data->keys);
		bio_list_add(&list, bio);

		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);

		ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
	}

	/* If KEY_PTRS(k) == 0, this jset would simply be lost */
	BUG_ON(i == 0);

	atomic_dec_bug(&fifo_back(&c->journal.pin));
	bch_journal_next(&c->journal);
	journal_reclaim(c);

	spin_unlock(&c->journal.lock);

	while ((bio = bio_list_pop(&list)))
		closure_bio_submit(c, bio, cl);

	continue_at(cl, journal_write_done, NULL);
}

static CLOSURE_CALLBACK(journal_write)
{
	closure_type(c, struct cache_set, journal.io);

	spin_lock(&c->journal.lock);
	journal_write_unlocked(&cl->work);
}

static void journal_try_write(struct cache_set *c)
	__releases(c->journal.lock)
{
	struct closure *cl = &c->journal.io;
	struct journal_write *w = c->journal.cur;

	w->need_write = true;

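	/* Only one journal write may be in flight at a time */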
	if (!c->journal.io_in_flight) {
		c->journal.io_in_flight = 1;
		closure_call(cl, journal_write_unlocked, NULL, &c->cl);
	} else {
		spin_unlock(&c->journal.lock);
	}
}

static struct journal_write *journal_wait_for_write(struct cache_set *c,
						    unsigned int nkeys)
	__acquires(&c->journal.lock)
{
	size_t sectors;
	struct closure cl;
	bool wait = false;
	struct cache *ca = c->cache;

	closure_init_stack(&cl);

	spin_lock(&c->journal.lock);

	while (1) {
		struct journal_write *w = c->journal.cur;

		sectors = __set_blocks(w->data, w->data->keys + nkeys,
				       block_bytes(ca)) * ca->sb.block_size;

		if (sectors <= min_t(size_t,
				     c->journal.blocks_free * ca->sb.block_size,
				     PAGE_SECTORS << JSET_BITS))
			return w;

		if (wait)
			closure_wait(&c->journal.wait, &cl);

		if (!journal_full(&c->journal)) {
			if (wait)
				trace_bcache_journal_entry_full(c);

			/*
			 * XXX: If we were inserting so many keys that they
			 * won't fit in an _empty_ journal write, we'll
			 * deadlock. For now, handle this in
			 * bch_keylist_realloc() - but something to think about.
			 */
			BUG_ON(!w->data->keys);

			journal_try_write(c); /* unlocks */
		} else {
			if (wait)
				trace_bcache_journal_full(c);

			journal_reclaim(c);
			spin_unlock(&c->journal.lock);

			btree_flush_write(c);
		}

		closure_sync(&cl);
		spin_lock(&c->journal.lock);
		wait = true;
	}
}

static void journal_write_work(struct work_struct *work)
{
	struct cache_set *c = container_of(to_delayed_work(work),
					   struct cache_set,
					   journal.work);
	spin_lock(&c->journal.lock);
	if (c->journal.cur->dirty)
		journal_try_write(c);
	else
		spin_unlock(&c->journal.lock);
}

/*
 * Entry point to the journalling code - bio_insert() and btree_invalidate()
 * pass bch_journal() a list of keys to be journalled, and then
 * bch_journal() hands those same keys off to btree_insert_async()
 */

atomic_t *bch_journal(struct cache_set *c,
		      struct keylist *keys,
		      struct closure *parent)
{
	struct journal_write *w;
	atomic_t *ret;

	/* No journalling if CACHE_SET_IO_DISABLE is already set */
	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
		return NULL;

	if (!CACHE_SYNC(&c->cache->sb))
		return NULL;

	w = journal_wait_for_write(c, bch_keylist_nkeys(keys));

	memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys));
	w->data->keys += bch_keylist_nkeys(keys);

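	/*
	 * Take a reference on the journal entry these keys were added to;
	 * the caller drops it once the keys have been inserted into the
	 * btree.
	 */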
	ret = &fifo_back(&c->journal.pin);
	atomic_inc(ret);

	if (parent) {
		closure_wait(&w->wait, parent);
		journal_try_write(c);
	} else if (!w->dirty) {
		w->dirty = true;
		queue_delayed_work(bch_flush_wq, &c->journal.work,
				   msecs_to_jiffies(c->journal_delay_ms));
		spin_unlock(&c->journal.lock);
	} else {
		spin_unlock(&c->journal.lock);
	}

	return ret;
}

void bch_journal_meta(struct cache_set *c, struct closure *cl)
{
	struct keylist keys;
	atomic_t *ref;

	bch_keylist_init(&keys);

	ref = bch_journal(c, &keys, cl);
	if (ref)
		atomic_dec_bug(ref);
}

void bch_journal_free(struct cache_set *c)
{
	free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
	free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
	free_fifo(&c->journal.pin);
}

int bch_journal_alloc(struct cache_set *c)
{
	struct journal *j = &c->journal;

	spin_lock_init(&j->lock);
	spin_lock_init(&j->flush_write_lock);
	INIT_DELAYED_WORK(&j->work, journal_write_work);

	c->journal_delay_ms = 100;

	j->w[0].c = c;
	j->w[1].c = c;

	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
	    !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)) ||
	    !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)))
		return -ENOMEM;

	return 0;
}