// SPDX-License-Identifier: GPL-2.0
/*
 * Main bcache entry point - handle a read or a write request and decide what to
 * do with it; the make_request functions are called by the block layer.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "request.h"
#include "writeback.h"

#include <linux/module.h>
#include <linux/hash.h>
#include <linux/random.h>
#include <linux/backing-dev.h>

#include <trace/events/bcache.h>

#define CUTOFF_CACHE_ADD	95
#define CUTOFF_CACHE_READA	90
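/*
 * Both cutoffs are percentages of cache occupancy: when more than
 * CUTOFF_CACHE_ADD percent of the cache is in use, new data bypasses the
 * cache entirely (see check_should_bypass()); CUTOFF_CACHE_READA is
 * presumably the analogous threshold for readahead, though it is not
 * referenced in this file.
 */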

struct kmem_cache *bch_search_cache;

static CLOSURE_CALLBACK(bch_data_insert_start);

static unsigned int cache_mode(struct cached_dev *dc)
{
	return BDEV_CACHE_MODE(&dc->sb);
}

static bool verify(struct cached_dev *dc)
{
	return dc->verify;
}

static void bio_csum(struct bio *bio, struct bkey *k)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	uint64_t csum = 0;

	bio_for_each_segment(bv, bio, iter) {
		void *d = bvec_kmap_local(&bv);

		csum = crc64_be(csum, d, bv.bv_len);
		kunmap_local(d);
	}

	k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
}

/* Insert data into cache */

static CLOSURE_CALLBACK(bch_data_insert_keys)
{
	closure_type(op, struct data_insert_op, cl);
	atomic_t *journal_ref = NULL;
	struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
	int ret;

	if (!op->replace)
		journal_ref = bch_journal(op->c, &op->insert_keys,
					  op->flush_journal ? cl : NULL);

	ret = bch_btree_insert(op->c, &op->insert_keys,
			       journal_ref, replace_key);
	if (ret == -ESRCH) {
		op->replace_collision = true;
	} else if (ret) {
		op->status = BLK_STS_RESOURCE;
		op->insert_data_done = true;
	}

	if (journal_ref)
		atomic_dec_bug(journal_ref);

	if (!op->insert_data_done) {
		continue_at(cl, bch_data_insert_start, op->wq);
		return;
	}

	bch_keylist_free(&op->insert_keys);
	closure_return(cl);
}

static int bch_keylist_realloc(struct keylist *l, unsigned int u64s,
			       struct cache_set *c)
{
	size_t oldsize = bch_keylist_nkeys(l);
	size_t newsize = oldsize + u64s;

	/*
	 * The journalling code doesn't handle the case where the keys to
	 * insert are bigger than an empty journal write: if we just return
	 * -ENOMEM here, bch_data_insert_keys() will insert the keys created
	 * so far and finish the rest once the keylist is empty.
	 */
	if (newsize * sizeof(uint64_t) > block_bytes(c->cache) - sizeof(struct jset))
		return -ENOMEM;

	return __bch_keylist_realloc(l, u64s);
}

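/*
 * For a bypass write we don't put the data in the cache, but we still have
 * to get rid of any stale data the cache may hold for that range: emit
 * zero-pointer KEY()s covering the bio so the btree insert invalidates it.
 */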
static void bch_data_invalidate(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
	struct bio *bio = op->bio;

	pr_debug("invalidating %i sectors from %llu\n",
		 bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);

	while (bio_sectors(bio)) {
		unsigned int sectors = min(bio_sectors(bio),
					   1U << (KEY_SIZE_BITS - 1));

		if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
			goto out;

		bio->bi_iter.bi_sector += sectors;
		bio->bi_iter.bi_size -= sectors << 9;

		bch_keylist_add(&op->insert_keys,
				&KEY(op->inode,
				     bio->bi_iter.bi_sector,
				     sectors));
	}

	op->insert_data_done = true;
	/* balanced by the bio_get() in bch_data_insert() */
	bio_put(bio);
out:
	continue_at(cl, bch_data_insert_keys, op->wq);
}

static CLOSURE_CALLBACK(bch_data_insert_error)
{
	closure_type(op, struct data_insert_op, cl);

	/*
	 * Our data write just errored, which means we've got a bunch of keys to
	 * insert that point to data that wasn't successfully written.
	 *
	 * We don't have to insert those keys but we still have to invalidate
	 * that region of the cache - so, if we just strip off all the pointers
	 * from the keys we'll accomplish just that.
	 */

	struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;

	while (src != op->insert_keys.top) {
		struct bkey *n = bkey_next(src);

		SET_KEY_PTRS(src, 0);
		memmove(dst, src, bkey_bytes(src));

		dst = bkey_next(dst);
		src = n;
	}

	op->insert_keys.top = dst;

	bch_data_insert_keys(&cl->work);
}

static void bch_data_insert_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);

	if (bio->bi_status) {
		/* TODO: We could try to recover from this. */
		if (op->writeback)
			op->status = bio->bi_status;
		else if (!op->replace)
			set_closure_fn(cl, bch_data_insert_error, op->wq);
		else
			set_closure_fn(cl, NULL, NULL);
	}

	bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache");
}

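/*
 * bch_data_insert_start() is where the data actually gets written to the
 * cache: loop allocating sectors in a bucket, splitting the bio to fit what
 * was allocated, building a key that points at the new location and
 * submitting the split write. Keys are pushed out via bch_data_insert_keys()
 * whenever the keylist fills up, and again once all the data is submitted.
 */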
static CLOSURE_CALLBACK(bch_data_insert_start)
{
	closure_type(op, struct data_insert_op, cl);
	struct bio *bio = op->bio, *n;

	if (op->bypass)
		return bch_data_invalidate(cl);

	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
		wake_up_gc(op->c);

	/*
	 * Journal writes are marked REQ_PREFLUSH; if the original write was a
	 * flush, it'll wait on the journal write.
	 */
	bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);

	do {
		unsigned int i;
		struct bkey *k;
		struct bio_set *split = &op->c->bio_split;

		/* 1 for the device pointer and 1 for the checksum */
		if (bch_keylist_realloc(&op->insert_keys,
					3 + (op->csum ? 1 : 0),
					op->c)) {
			continue_at(cl, bch_data_insert_keys, op->wq);
			return;
		}

		k = op->insert_keys.top;
		bkey_init(k);
		SET_KEY_INODE(k, op->inode);
		SET_KEY_OFFSET(k, bio->bi_iter.bi_sector);

		if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
				       op->write_point, op->write_prio,
				       op->writeback))
			goto err;

		n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);

		n->bi_end_io = bch_data_insert_endio;
		n->bi_private = cl;

		if (op->writeback) {
			SET_KEY_DIRTY(k, true);

			for (i = 0; i < KEY_PTRS(k); i++)
				SET_GC_MARK(PTR_BUCKET(op->c, k, i),
					    GC_MARK_DIRTY);
		}

		SET_KEY_CSUM(k, op->csum);
		if (KEY_CSUM(k))
			bio_csum(n, k);

		trace_bcache_cache_insert(k);
		bch_keylist_push(&op->insert_keys);

		n->bi_opf = REQ_OP_WRITE;
		bch_submit_bbio(n, op->c, k, 0);
	} while (n != bio);

	op->insert_data_done = true;
	continue_at(cl, bch_data_insert_keys, op->wq);
	return;
err:
	/* bch_alloc_sectors() blocks if s->writeback = true */
	BUG_ON(op->writeback);

	/*
	 * But if it's not a writeback write we'd rather just bail out if
	 * there aren't any buckets ready to write to - it might take a while
	 * and we might be starving btree writes for gc or something.
	 */

	if (!op->replace) {
		/*
		 * Writethrough write: We can't complete the write until we've
		 * updated the index. But we don't want to delay the write while
		 * we wait for buckets to be freed up, so just invalidate the
		 * rest of the write.
		 */
		op->bypass = true;
		return bch_data_invalidate(cl);
	} else {
		/*
		 * From a cache miss, we can just insert the keys for the data
		 * we have written or bail out if we didn't do anything.
		 */
		op->insert_data_done = true;
		bio_put(bio);

		if (!bch_keylist_empty(&op->insert_keys))
			continue_at(cl, bch_data_insert_keys, op->wq);
		else
			closure_return(cl);
	}
}

/**
 * bch_data_insert - stick some data in the cache
 * @cl: closure pointer.
 *
 * This is the starting point for any data to end up in a cache device; it could
 * be from a normal write, or a writeback write, or a write to a flash only
 * volume - it's also used by the moving garbage collector to compact data in
 * mostly empty buckets.
 *
 * It first writes the data to the cache, creating a list of keys to be inserted
 * (if the data had to be fragmented there will be multiple keys); after the
 * data is written it calls bch_journal, and after the keys have been added to
 * the next journal write they're inserted into the btree.
 *
 * It inserts the data in op->bio; bi_sector is used for the key offset,
 * and op->inode is used for the key inode.
 *
 * If op->bypass is true, instead of inserting the data it invalidates the
 * region of the cache represented by op->bio and op->inode.
 */
CLOSURE_CALLBACK(bch_data_insert)
{
	closure_type(op, struct data_insert_op, cl);

	trace_bcache_write(op->c, op->inode, op->bio,
			   op->writeback, op->bypass);

	bch_keylist_init(&op->insert_keys);
	bio_get(op->bio);
	bch_data_insert_start(&cl->work);
}

/*
 * Congested? Return 0 (not congested) or the limit (in sectors)
 * beyond which we should bypass the cache due to congestion.
 */
unsigned int bch_get_congested(const struct cache_set *c)
{
	int i;

	if (!c->congested_read_threshold_us &&
	    !c->congested_write_threshold_us)
		return 0;

	i = (local_clock_us() - c->congested_last_us) / 1024;
	if (i < 0)
		return 0;

	i += atomic_read(&c->congested);
	if (i >= 0)
		return 0;

	i += CONGESTED_MAX;

	if (i > 0)
		i = fract_exp_two(i, 6);

	i -= hweight32(get_random_u32());

	return i > 0 ? i : 1;
}

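/*
 * Track how much sequential I/O the current task has been doing:
 * add_sequential() folds the just-finished run into a per-task exponentially
 * weighted moving average (ewma_add() with weight 8) and resets the running
 * counter; check_should_bypass() uses the larger of the two to decide whether
 * the stream is sequential enough to skip the cache.
 */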
static void add_sequential(struct task_struct *t)
{
	ewma_add(t->sequential_io_avg,
		 t->sequential_io, 8, 0);

	t->sequential_io = 0;
}

static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
{
	return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
}

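/*
 * Decide whether this bio should skip the cache and go straight to the
 * backing device. Reasons to bypass include: the device is detaching, the
 * cache is nearly full, the cache mode forbids caching this I/O, unaligned
 * I/O, cache congestion, or the I/O looks sequential enough that caching it
 * would not help.
 */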
static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
{
	struct cache_set *c = dc->disk.c;
	unsigned int mode = cache_mode(dc);
	unsigned int sectors, congested;
	struct task_struct *task = current;
	struct io *i;

	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
	    (bio_op(bio) == REQ_OP_DISCARD))
		goto skip;

	if (mode == CACHE_MODE_NONE ||
	    (mode == CACHE_MODE_WRITEAROUND &&
	     op_is_write(bio_op(bio))))
		goto skip;

	/*
	 * If the bio is for read-ahead or background IO, whether to bypass it
	 * depends on the following:
	 * - If the IO is for metadata, always cache it and don't bypass.
	 * - If the IO is not metadata, check dc->cache_readahead_policy:
	 *	BCH_CACHE_READA_ALL: cache it and don't bypass
	 *	BCH_CACHE_READA_META_ONLY: don't cache it and bypass
	 * That is, read-ahead requests for metadata always get cached
	 * (eg, for gfs2 or xfs).
	 */
	if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) {
		if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
		    (dc->cache_readahead_policy != BCH_CACHE_READA_ALL))
			goto skip;
	}

	if (bio->bi_iter.bi_sector & (c->cache->sb.block_size - 1) ||
	    bio_sectors(bio) & (c->cache->sb.block_size - 1)) {
		pr_debug("skipping unaligned io\n");
		goto skip;
	}

	if (bypass_torture_test(dc)) {
		if (get_random_u32_below(4) == 3)
			goto skip;
		else
			goto rescale;
	}

	congested = bch_get_congested(c);
	if (!congested && !dc->sequential_cutoff)
		goto rescale;

	spin_lock(&dc->io_lock);

	hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
		if (i->last == bio->bi_iter.bi_sector &&
		    time_before(jiffies, i->jiffies))
			goto found;

	i = list_first_entry(&dc->io_lru, struct io, lru);

	add_sequential(task);
	i->sequential = 0;
found:
	if (i->sequential + bio->bi_iter.bi_size > i->sequential)
		i->sequential += bio->bi_iter.bi_size;

	i->last = bio_end_sector(bio);
	i->jiffies = jiffies + msecs_to_jiffies(5000);
	task->sequential_io = i->sequential;

	hlist_del(&i->hash);
	hlist_add_head(&i->hash, iohash(dc, i->last));
	list_move_tail(&i->lru, &dc->io_lru);

	spin_unlock(&dc->io_lock);

	sectors = max(task->sequential_io,
		      task->sequential_io_avg) >> 9;

	if (dc->sequential_cutoff &&
	    sectors >= dc->sequential_cutoff >> 9) {
		trace_bcache_bypass_sequential(bio);
		goto skip;
	}

	if (congested && sectors >= congested) {
		trace_bcache_bypass_congested(bio);
		goto skip;
	}

rescale:
	bch_rescale_priorities(c, bio_sectors(bio));
	return false;
skip:
	bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
	return true;
}

/* Cache lookup */

struct search {
	/* Stack frame for bio_complete */
	struct closure cl;

	struct bbio bio;
	struct bio *orig_bio;
	struct bio *cache_miss;
	struct bcache_device *d;

	unsigned int insert_bio_sectors;
	unsigned int recoverable:1;
	unsigned int write:1;
	unsigned int read_dirty_data:1;
	unsigned int cache_missed:1;

	struct block_device *orig_bdev;
	unsigned long start_time;

	struct btree_op op;
	struct data_insert_op iop;
};

static void bch_cache_read_endio(struct bio *bio)
{
	struct bbio *b = container_of(bio, struct bbio, bio);
	struct closure *cl = bio->bi_private;
	struct search *s = container_of(cl, struct search, cl);

	/*
	 * If the bucket was reused while our bio was in flight, we might have
	 * read the wrong data. Set s->iop.status but not bio->bi_status, so
	 * the error doesn't get counted against the cache device, but we'll
	 * still reread the data from the backing device.
	 */

	if (bio->bi_status)
		s->iop.status = bio->bi_status;
	else if (!KEY_DIRTY(&b->key) &&
		 ptr_stale(s->iop.c, &b->key, 0)) {
		atomic_long_inc(&s->iop.c->cache_read_races);
		s->iop.status = BLK_STS_IOERR;
	}

	bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from cache");
}

/*
 * Read from a single key, handling the initial cache miss if the key starts in
 * the middle of the bio
 */
static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
{
	struct search *s = container_of(op, struct search, op);
	struct bio *n, *bio = &s->bio.bio;
	struct bkey *bio_key;
	unsigned int ptr;

	if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
		return MAP_CONTINUE;

	if (KEY_INODE(k) != s->iop.inode ||
	    KEY_START(k) > bio->bi_iter.bi_sector) {
		unsigned int bio_sectors = bio_sectors(bio);
		unsigned int sectors = KEY_INODE(k) == s->iop.inode
			? min_t(uint64_t, INT_MAX,
				KEY_START(k) - bio->bi_iter.bi_sector)
			: INT_MAX;
		int ret = s->d->cache_miss(b, s, bio, sectors);

		if (ret != MAP_CONTINUE)
			return ret;

		/* if this was a complete miss we shouldn't get here */
		BUG_ON(bio_sectors <= sectors);
	}

	if (!KEY_SIZE(k))
		return MAP_CONTINUE;

	/* XXX: figure out best pointer - for multiple cache devices */
	ptr = 0;

	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;

	if (KEY_DIRTY(k))
		s->read_dirty_data = true;

	n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
				      KEY_OFFSET(k) - bio->bi_iter.bi_sector),
			   GFP_NOIO, &s->d->bio_split);

	bio_key = &container_of(n, struct bbio, bio)->key;
	bch_bkey_copy_single_ptr(bio_key, k, ptr);

	bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
	bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);

	n->bi_end_io = bch_cache_read_endio;
	n->bi_private = &s->cl;

	/*
	 * The bucket we're reading from might be reused while our bio
	 * is in flight, and we could then end up reading the wrong
	 * data.
	 *
	 * We guard against this by checking (in bch_cache_read_endio()) if
	 * the pointer is stale again; if so, we treat it as an error
	 * and reread from the backing device (but we don't pass that
	 * error up anywhere).
	 */

	__bch_submit_bbio(n, b->c);
	return n == bio ? MAP_DONE : MAP_CONTINUE;
}

static CLOSURE_CALLBACK(cache_lookup)
{
	closure_type(s, struct search, iop.cl);
	struct bio *bio = &s->bio.bio;
	struct cached_dev *dc;
	int ret;

	bch_btree_op_init(&s->op, -1);

	ret = bch_btree_map_keys(&s->op, s->iop.c,
				 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
				 cache_lookup_fn, MAP_END_KEY);
	if (ret == -EAGAIN) {
		continue_at(cl, cache_lookup, bcache_wq);
		return;
	}

	/*
	 * We might hit an error while searching the btree; if that happens we
	 * get a negative ret. In that case we should not recover data from
	 * the backing device (when the cache device is dirty), because we
	 * don't know whether all the bkeys the read request covers are clean.
	 *
	 * If that happened, s->iop.status still has the initial value it had
	 * before we submitted s->bio.bio.
	 */
	if (ret < 0) {
		BUG_ON(ret == -EINTR);
		if (s->d && s->d->c &&
		    !UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) {
			dc = container_of(s->d, struct cached_dev, disk);
			if (dc && atomic_read(&dc->has_dirty))
				s->recoverable = false;
		}
		if (!s->iop.status)
			s->iop.status = BLK_STS_IOERR;
	}

	closure_return(cl);
}

/* Common code for the make_request functions */

static void request_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;

	if (bio->bi_status) {
		struct search *s = container_of(cl, struct search, cl);

		s->iop.status = bio->bi_status;
		/* Only cache read errors are recoverable */
		s->recoverable = false;
	}

	bio_put(bio);
	closure_put(cl);
}

static void backing_request_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;

	if (bio->bi_status) {
		struct search *s = container_of(cl, struct search, cl);
		struct cached_dev *dc = container_of(s->d,
						     struct cached_dev, disk);
		/*
		 * If a bio has REQ_PREFLUSH for writeback mode, it was
		 * specially assembled in cached_dev_write() for a non-zero
		 * write request which has REQ_PREFLUSH. We don't set
		 * s->iop.status for this failure; the status will be decided
		 * by the result of the bch_data_insert() operation.
		 */
		if (unlikely(s->iop.writeback &&
			     bio->bi_opf & REQ_PREFLUSH)) {
			pr_err("Can't flush %pg: returned bi_status %i\n",
			       dc->bdev, bio->bi_status);
		} else {
			/* set to orig_bio->bi_status in bio_complete() */
			s->iop.status = bio->bi_status;
		}
		s->recoverable = false;
		/* should count I/O error for backing device here */
		bch_count_backing_io_errors(dc, bio);
	}

	bio_put(bio);
	closure_put(cl);
}

static void bio_complete(struct search *s)
{
	if (s->orig_bio) {
		/* Count on bcache device */
		bio_end_io_acct_remapped(s->orig_bio, s->start_time,
					 s->orig_bdev);
		trace_bcache_request_end(s->d, s->orig_bio);
		s->orig_bio->bi_status = s->iop.status;
		bio_endio(s->orig_bio);
		s->orig_bio = NULL;
	}
}

static void do_bio_hook(struct search *s,
			struct bio *orig_bio,
			bio_end_io_t *end_io_fn)
{
	struct bio *bio = &s->bio.bio;

	bio_init_clone(orig_bio->bi_bdev, bio, orig_bio, GFP_NOIO);
	/*
	 * bi_end_io can be set separately somewhere else, e.g. the
	 * variants in,
	 * - cache_bio->bi_end_io from cached_dev_cache_miss()
	 * - n->bi_end_io from cache_lookup_fn()
	 */
	bio->bi_end_io = end_io_fn;
	bio->bi_private = &s->cl;

	bio_cnt_set(bio, 3);
}

static CLOSURE_CALLBACK(search_free)
{
	closure_type(s, struct search, cl);

	atomic_dec(&s->iop.c->search_inflight);

	if (s->iop.bio)
		bio_put(s->iop.bio);

	bio_complete(s);
	closure_debug_destroy(cl);
	mempool_free(s, &s->iop.c->search);
}

static inline struct search *search_alloc(struct bio *bio,
		struct bcache_device *d, struct block_device *orig_bdev,
		unsigned long start_time)
{
	struct search *s;

	s = mempool_alloc(&d->c->search, GFP_NOIO);

	closure_init(&s->cl, NULL);
	do_bio_hook(s, bio, request_endio);
	atomic_inc(&d->c->search_inflight);

	s->orig_bio = bio;
	s->cache_miss = NULL;
	s->cache_missed = 0;
	s->d = d;
	s->recoverable = 1;
	s->write = op_is_write(bio_op(bio));
	s->read_dirty_data = 0;
	/* Count on the bcache device */
	s->orig_bdev = orig_bdev;
	s->start_time = start_time;
	s->iop.c = d->c;
	s->iop.bio = NULL;
	s->iop.inode = d->id;
	s->iop.write_point = hash_long((unsigned long) current, 16);
	s->iop.write_prio = 0;
	s->iop.status = 0;
	s->iop.flags = 0;
	s->iop.flush_journal = op_is_flush(bio->bi_opf);
	s->iop.wq = bcache_wq;

	return s;
}

/* Cached devices */

static CLOSURE_CALLBACK(cached_dev_bio_complete)
{
	closure_type(s, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	cached_dev_put(dc);
	search_free(&cl->work);
}

/* Process reads */

static CLOSURE_CALLBACK(cached_dev_read_error_done)
{
	closure_type(s, struct search, cl);

	if (s->iop.replace_collision)
		bch_mark_cache_miss_collision(s->iop.c, s->d);

	if (s->iop.bio)
		bio_free_pages(s->iop.bio);

	cached_dev_bio_complete(&cl->work);
}

static CLOSURE_CALLBACK(cached_dev_read_error)
{
	closure_type(s, struct search, cl);
	struct bio *bio = &s->bio.bio;

	/*
	 * If the read request hit dirty data (s->read_dirty_data is true),
	 * then recovering the failed read from the backing device may return
	 * stale data. So read failure recovery is only permitted when the
	 * read request hit clean data in the cache device, or when a cache
	 * read race happened.
	 */
	if (s->recoverable && !s->read_dirty_data) {
		/* Retry from the backing device: */
		trace_bcache_read_retry(s->orig_bio);

		s->iop.status = 0;
		do_bio_hook(s, s->orig_bio, backing_request_endio);

		/* XXX: invalidate cache */

		/* I/O request sent to backing device */
		closure_bio_submit(s->iop.c, bio, cl);
	}

	continue_at(cl, cached_dev_read_error_done, NULL);
}

static CLOSURE_CALLBACK(cached_dev_cache_miss_done)
{
	closure_type(s, struct search, cl);
	struct bcache_device *d = s->d;

	if (s->iop.replace_collision)
		bch_mark_cache_miss_collision(s->iop.c, s->d);

	if (s->iop.bio)
		bio_free_pages(s->iop.bio);

	cached_dev_bio_complete(&cl->work);
	closure_put(&d->cl);
}

static CLOSURE_CALLBACK(cached_dev_read_done)
{
	closure_type(s, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	/*
	 * We had a cache miss; cache_bio now contains data ready to be inserted
	 * into the cache.
	 *
	 * First, we copy the data we just read from cache_bio's bounce buffers
	 * to the buffers the original bio pointed to:
	 */

	if (s->iop.bio) {
		bio_reset(s->iop.bio, s->cache_miss->bi_bdev, REQ_OP_READ);
		s->iop.bio->bi_iter.bi_sector =
			s->cache_miss->bi_iter.bi_sector;
		s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
		bio_clone_blkg_association(s->iop.bio, s->cache_miss);
		bch_bio_map(s->iop.bio, NULL);

		bio_copy_data(s->cache_miss, s->iop.bio);

		bio_put(s->cache_miss);
		s->cache_miss = NULL;
	}

	if (verify(dc) && s->recoverable && !s->read_dirty_data)
		bch_data_verify(dc, s->orig_bio);

	closure_get(&dc->disk.cl);
	bio_complete(s);

	if (s->iop.bio &&
	    !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
		BUG_ON(!s->iop.replace);
		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
	}

	continue_at(cl, cached_dev_cache_miss_done, NULL);
}

static CLOSURE_CALLBACK(cached_dev_read_done_bh)
{
	closure_type(s, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	bch_mark_cache_accounting(s->iop.c, s->d,
				  !s->cache_missed, s->iop.bypass);
	trace_bcache_read(s->orig_bio, !s->cache_missed, s->iop.bypass);

	if (s->iop.status)
		continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
	else if (s->iop.bio || verify(dc))
		continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
	else
		continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
}

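/*
 * Handle a (partial) cache miss for a cached device: split off the missing
 * part of the bio and read it from the backing device. Unless the read is
 * being bypassed, we also reserve a replace key via
 * bch_btree_insert_check_key() and allocate cache_bio so the data we read
 * can afterwards be inserted into the cache without racing with other writes.
 */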
static int cached_dev_cache_miss(struct btree *b, struct search *s,
				 struct bio *bio, unsigned int sectors)
{
	int ret = MAP_CONTINUE;
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
	struct bio *miss, *cache_bio;
	unsigned int size_limit;

	s->cache_missed = 1;

	if (s->cache_miss || s->iop.bypass) {
		miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
		ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
		goto out_submit;
	}

	/* Limit to the maximum valid replace key size and cache_bio bvec count */
	size_limit = min_t(unsigned int, BIO_MAX_VECS * PAGE_SECTORS,
			   (1 << KEY_SIZE_BITS) - 1);
	s->insert_bio_sectors = min3(size_limit, sectors, bio_sectors(bio));

	s->iop.replace_key = KEY(s->iop.inode,
				 bio->bi_iter.bi_sector + s->insert_bio_sectors,
				 s->insert_bio_sectors);

	ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
	if (ret)
		return ret;

	s->iop.replace = true;

	miss = bio_next_split(bio, s->insert_bio_sectors, GFP_NOIO,
			      &s->d->bio_split);

	/* btree_search_recurse()'s btree iterator is no good anymore */
	ret = miss == bio ? MAP_DONE : -EINTR;

	cache_bio = bio_alloc_bioset(miss->bi_bdev,
			DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
			0, GFP_NOWAIT, &dc->disk.bio_split);
	if (!cache_bio)
		goto out_submit;

	cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector;
	cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9;

	cache_bio->bi_end_io = backing_request_endio;
	cache_bio->bi_private = &s->cl;

	bch_bio_map(cache_bio, NULL);
	if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
		goto out_put;

	s->cache_miss = miss;
	s->iop.bio = cache_bio;
	bio_get(cache_bio);
	/* I/O request sent to backing device */
	closure_bio_submit(s->iop.c, cache_bio, &s->cl);

	return ret;
out_put:
	bio_put(cache_bio);
out_submit:
	miss->bi_end_io = backing_request_endio;
	miss->bi_private = &s->cl;
	/* I/O request sent to backing device */
	closure_bio_submit(s->iop.c, miss, &s->cl);
	return ret;
}

static void cached_dev_read(struct cached_dev *dc, struct search *s)
{
	struct closure *cl = &s->cl;

	closure_call(&s->iop.cl, cache_lookup, NULL, cl);
	continue_at(cl, cached_dev_read_done_bh, NULL);
}

/* Process writes */

static CLOSURE_CALLBACK(cached_dev_write_complete)
{
	closure_type(s, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	up_read_non_owner(&dc->writeback_lock);
	cached_dev_bio_complete(&cl->work);
}

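/*
 * A write to a cached device takes one of three paths: bypass (the data goes
 * only to the backing device and any overlapping cache contents are
 * invalidated), writeback (the data goes only to the cache and is marked
 * dirty, to be flushed later), or writethrough (the data is written to the
 * backing device and a clone is inserted into the cache as clean data).
 */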
static void cached_dev_write(struct cached_dev *dc, struct search *s)
{
	struct closure *cl = &s->cl;
	struct bio *bio = &s->bio.bio;
	struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0);
	struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);

	bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);

	down_read_non_owner(&dc->writeback_lock);
	if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
		/*
		 * We overlap with some dirty data undergoing background
		 * writeback, force this write to writeback
		 */
		s->iop.bypass = false;
		s->iop.writeback = true;
	}

	/*
	 * Discards aren't _required_ to do anything, so skipping if
	 * check_overlapping returned true is ok
	 *
	 * But check_overlapping drops dirty keys for which io hasn't started,
	 * so we still want to call it.
	 */
	if (bio_op(bio) == REQ_OP_DISCARD)
		s->iop.bypass = true;

	if (should_writeback(dc, s->orig_bio,
			     cache_mode(dc),
			     s->iop.bypass)) {
		s->iop.bypass = false;
		s->iop.writeback = true;
	}

	if (s->iop.bypass) {
		s->iop.bio = s->orig_bio;
		bio_get(s->iop.bio);

		if (bio_op(bio) == REQ_OP_DISCARD &&
		    !bdev_max_discard_sectors(dc->bdev))
			goto insert_data;

		/* I/O request sent to backing device */
		bio->bi_end_io = backing_request_endio;
		closure_bio_submit(s->iop.c, bio, cl);

	} else if (s->iop.writeback) {
		bch_writeback_add(dc);
		s->iop.bio = bio;

		if (bio->bi_opf & REQ_PREFLUSH) {
			/*
			 * Also need to send a flush to the backing
			 * device.
			 */
			struct bio *flush;

			flush = bio_alloc_bioset(bio->bi_bdev, 0,
						 REQ_OP_WRITE | REQ_PREFLUSH,
						 GFP_NOIO, &dc->disk.bio_split);
			if (!flush) {
				s->iop.status = BLK_STS_RESOURCE;
				goto insert_data;
			}
			flush->bi_end_io = backing_request_endio;
			flush->bi_private = cl;
			/* I/O request sent to backing device */
			closure_bio_submit(s->iop.c, flush, cl);
		}
	} else {
		s->iop.bio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO,
					     &dc->disk.bio_split);
		/* I/O request sent to backing device */
		bio->bi_end_io = backing_request_endio;
		closure_bio_submit(s->iop.c, bio, cl);
	}

insert_data:
	closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
	continue_at(cl, cached_dev_write_complete, NULL);
}

static CLOSURE_CALLBACK(cached_dev_nodata)
{
	closure_type(s, struct search, cl);
	struct bio *bio = &s->bio.bio;

	if (s->iop.flush_journal)
		bch_journal_meta(s->iop.c, cl);

	/* If it's a flush, we send the flush to the backing device too */
	bio->bi_end_io = backing_request_endio;
	closure_bio_submit(s->iop.c, bio, cl);

	continue_at(cl, cached_dev_bio_complete, NULL);
}

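/*
 * I/O for a bcache device that isn't attached to a cache set is passed
 * straight through to the backing device; detached_dev_io_private stashes the
 * original bi_end_io/bi_private so completion can still be accounted against
 * the bcache device and backing-device errors can be counted.
 */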
struct detached_dev_io_private {
	struct bcache_device *d;
	unsigned long start_time;
	bio_end_io_t *bi_end_io;
	void *bi_private;
	struct block_device *orig_bdev;
};

static void detached_dev_end_io(struct bio *bio)
{
	struct detached_dev_io_private *ddip;

	ddip = bio->bi_private;
	bio->bi_end_io = ddip->bi_end_io;
	bio->bi_private = ddip->bi_private;

	/* Count on the bcache device */
	bio_end_io_acct_remapped(bio, ddip->start_time, ddip->orig_bdev);

	if (bio->bi_status) {
		struct cached_dev *dc = container_of(ddip->d,
						     struct cached_dev, disk);
		/* should count I/O error for backing device here */
		bch_count_backing_io_errors(dc, bio);
	}

	kfree(ddip);
	bio->bi_end_io(bio);
}

static void detached_dev_do_request(struct bcache_device *d, struct bio *bio,
		struct block_device *orig_bdev, unsigned long start_time)
{
	struct detached_dev_io_private *ddip;
	struct cached_dev *dc = container_of(d, struct cached_dev, disk);

	/*
	 * No need to call closure_get(&dc->disk.cl) here, because the upper
	 * layer has already opened the bcache device, which already called
	 * closure_get(&dc->disk.cl).
	 */
	ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
	if (!ddip) {
		bio->bi_status = BLK_STS_RESOURCE;
		bio->bi_end_io(bio);
		return;
	}

	ddip->d = d;
	/* Count on the bcache device */
	ddip->orig_bdev = orig_bdev;
	ddip->start_time = start_time;
	ddip->bi_end_io = bio->bi_end_io;
	ddip->bi_private = bio->bi_private;
	bio->bi_end_io = detached_dev_end_io;
	bio->bi_private = ddip;

	if ((bio_op(bio) == REQ_OP_DISCARD) &&
	    !bdev_max_discard_sectors(dc->bdev))
		bio->bi_end_io(bio);
	else
		submit_bio_noacct(bio);
}

static void quit_max_writeback_rate(struct cache_set *c,
				    struct cached_dev *this_dc)
{
	int i;
	struct bcache_device *d;
	struct cached_dev *dc;

	/*
	 * The mutex bch_register_lock may be contended by other parallel
	 * requesters, or by attach/detach operations on another backing
	 * device. Waiting for the mutex may increase I/O request latency by
	 * seconds or more. To avoid that, if mutex_trylock() fails, only the
	 * writeback rate of the current cached device is set to 1, and
	 * __update_writeback_rate() will decide the writeback rate of the
	 * other cached devices (remember c->idle_counter is 0 already).
	 */
	if (mutex_trylock(&bch_register_lock)) {
		for (i = 0; i < c->devices_max_used; i++) {
			if (!c->devices[i])
				continue;

			if (UUID_FLASH_ONLY(&c->uuids[i]))
				continue;

			d = c->devices[i];
			dc = container_of(d, struct cached_dev, disk);
			/*
			 * Set the writeback rate to the default minimum value,
			 * then let update_writeback_rate() decide the
			 * upcoming rate.
			 */
			atomic_long_set(&dc->writeback_rate.rate, 1);
		}
		mutex_unlock(&bch_register_lock);
	} else
		atomic_long_set(&this_dc->writeback_rate.rate, 1);
}

/* Cached devices - read & write stuff */

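/*
 * Entry point for all I/O submitted to a bcache block device that has a
 * backing device: remap the bio to the backing device (applying
 * sb.data_offset), then either allocate a struct search and run the read or
 * write path, or, when cached_dev_get() fails (e.g. the device is detached),
 * pass the bio straight through via detached_dev_do_request().
 */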
void cached_dev_submit_bio(struct bio *bio)
{
	struct search *s;
	struct block_device *orig_bdev = bio->bi_bdev;
	struct bcache_device *d = orig_bdev->bd_disk->private_data;
	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
	unsigned long start_time;
	int rw = bio_data_dir(bio);

	if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) ||
		     dc->io_disable)) {
		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return;
	}

	if (likely(d->c)) {
		if (atomic_read(&d->c->idle_counter))
			atomic_set(&d->c->idle_counter, 0);
		/*
		 * If at_max_writeback_rate of the cache set is true and new
		 * I/O comes in, quit the max writeback rate of all cached
		 * devices attached to this cache set, and set
		 * at_max_writeback_rate to false.
		 */
		if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
			atomic_set(&d->c->at_max_writeback_rate, 0);
			quit_max_writeback_rate(d->c, dc);
		}
	}

	start_time = bio_start_io_acct(bio);

	bio_set_dev(bio, dc->bdev);
	bio->bi_iter.bi_sector += dc->sb.data_offset;

	if (cached_dev_get(dc)) {
		s = search_alloc(bio, d, orig_bdev, start_time);
		trace_bcache_request_start(s->d, bio);

		if (!bio->bi_iter.bi_size) {
			/*
			 * can't call bch_journal_meta from under
			 * submit_bio_noacct
			 */
			continue_at_nobarrier(&s->cl,
					      cached_dev_nodata,
					      bcache_wq);
		} else {
			s->iop.bypass = check_should_bypass(dc, bio);

			if (rw)
				cached_dev_write(dc, s);
			else
				cached_dev_read(dc, s);
		}
	} else
		/* I/O request sent to backing device */
		detached_dev_do_request(d, bio, orig_bdev, start_time);
}

static int cached_dev_ioctl(struct bcache_device *d, blk_mode_t mode,
			    unsigned int cmd, unsigned long arg)
{
	struct cached_dev *dc = container_of(d, struct cached_dev, disk);

	if (dc->io_disable)
		return -EIO;
	if (!dc->bdev->bd_disk->fops->ioctl)
		return -ENOTTY;
	return dc->bdev->bd_disk->fops->ioctl(dc->bdev, mode, cmd, arg);
}

void bch_cached_dev_request_init(struct cached_dev *dc)
{
	dc->disk.cache_miss = cached_dev_cache_miss;
	dc->disk.ioctl = cached_dev_ioctl;
}

/* Flash backed devices */

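/*
 * Flash-only volumes have no backing device, so a "cache miss" simply means
 * the range was never written: zero-fill the missing part of the bio instead
 * of reading anything.
 */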
static int flash_dev_cache_miss(struct btree *b, struct search *s,
				struct bio *bio, unsigned int sectors)
{
	unsigned int bytes = min(sectors, bio_sectors(bio)) << 9;

	swap(bio->bi_iter.bi_size, bytes);
	zero_fill_bio(bio);
	swap(bio->bi_iter.bi_size, bytes);

	bio_advance(bio, bytes);

	if (!bio->bi_iter.bi_size)
		return MAP_DONE;

	return MAP_CONTINUE;
}

static CLOSURE_CALLBACK(flash_dev_nodata)
{
	closure_type(s, struct search, cl);

	if (s->iop.flush_journal)
		bch_journal_meta(s->iop.c, cl);

	continue_at(cl, search_free, NULL);
}

void flash_dev_submit_bio(struct bio *bio)
{
	struct search *s;
	struct closure *cl;
	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;

	if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return;
	}

	s = search_alloc(bio, d, bio->bi_bdev, bio_start_io_acct(bio));
	cl = &s->cl;
	bio = &s->bio.bio;

	trace_bcache_request_start(s->d, bio);

	if (!bio->bi_iter.bi_size) {
		/*
		 * can't call bch_journal_meta from under submit_bio_noacct
		 */
		continue_at_nobarrier(&s->cl,
				      flash_dev_nodata,
				      bcache_wq);
		return;
	} else if (bio_data_dir(bio)) {
		bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
					     &KEY(d->id, bio->bi_iter.bi_sector, 0),
					     &KEY(d->id, bio_end_sector(bio), 0));

		s->iop.bypass = (bio_op(bio) == REQ_OP_DISCARD) != 0;
		s->iop.writeback = true;
		s->iop.bio = bio;

		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
	} else {
		closure_call(&s->iop.cl, cache_lookup, NULL, cl);
	}

	continue_at(cl, search_free, NULL);
}

static int flash_dev_ioctl(struct bcache_device *d, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	return -ENOTTY;
}

void bch_flash_dev_request_init(struct bcache_device *d)
{
	d->cache_miss = flash_dev_cache_miss;
	d->ioctl = flash_dev_ioctl;
}

void bch_request_exit(void)
{
	kmem_cache_destroy(bch_search_cache);
}

int __init bch_request_init(void)
{
	bch_search_cache = KMEM_CACHE(search, 0);
	if (!bch_search_cache)
		return -ENOMEM;

	return 0;
}