// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "dev-replace.h"
#include "zoned.h"
#include "file-item.h"
#include "raid-stripe-tree.h"

static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
static struct bio_set btrfs_repair_bioset;
static mempool_t btrfs_failed_bio_pool;

struct btrfs_failed_bio {
	struct btrfs_bio *bbio;
	int num_copies;
	atomic_t repair_count;
};

/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(struct btrfs_bio *bbio)
{
	return bbio->inode && is_data_inode(&bbio->inode->vfs_inode);
}

static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
{
	return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
}

/*
 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
		    btrfs_bio_end_io_t end_io, void *private)
{
	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
	bbio->fs_info = fs_info;
	bbio->end_io = end_io;
	bbio->private = private;
	atomic_set(&bbio->pending_ios, 1);
}

/*
 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
 * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
 *
 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
 * a mempool.
 */
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
				  struct btrfs_fs_info *fs_info,
				  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, fs_info, end_io, private);
	return bbio;
}

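/*
 * Split off the first @map_length bytes of @orig_bbio into a new btrfs_bio.
 *
 * Zone append writes are split with bio_split_rw() so that the result also
 * respects the queue limits, all other bios use a plain bio_split(). The new
 * bio shares the inode and ordered extent (for data writes) with the original
 * and is accounted for in its pending_ios counter.
 */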
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *orig_bbio,
					 u64 map_length, bool use_append)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	if (use_append) {
		unsigned int nr_segs;

		bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
				   &btrfs_clone_bioset, map_length);
	} else {
		bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
				GFP_NOFS, &btrfs_clone_bioset);
	}
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
	bbio->inode = orig_bbio->inode;
	bbio->file_offset = orig_bbio->file_offset;
	orig_bbio->file_offset += map_length;
	if (bbio_has_ordered_extent(bbio)) {
		refcount_inc(&orig_bbio->ordered->refs);
		bbio->ordered = orig_bbio->ordered;
	}
	atomic_inc(&orig_bbio->pending_ios);
	return bbio;
}

/* Free a bio that was never submitted to the underlying device. */
static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
{
	if (bbio_has_ordered_extent(bbio))
		btrfs_put_ordered_extent(bbio->ordered);
	bio_put(&bbio->bio);
}

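/*
 * Call the end_io handler. For data writes the reference on the ordered
 * extent is only dropped after the handler has run, as the handler may still
 * need it.
 */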
static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
{
	if (bbio_has_ordered_extent(bbio)) {
		struct btrfs_ordered_extent *ordered = bbio->ordered;

		bbio->end_io(bbio);
		btrfs_put_ordered_extent(ordered);
	} else {
		bbio->end_io(bbio);
	}
}

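/* Record the I/O status and call the end_io handler. */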
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
	bbio->bio.bi_status = status;
	__btrfs_bio_end_io(bbio);
}

static void btrfs_orig_write_end_io(struct bio *bio);

static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
				       struct btrfs_bio *orig_bbio)
{
	/*
	 * For writes we tolerate nr_mirrors - 1 write failures, so we can't
	 * just blindly propagate a write failure here. Instead increment the
	 * error count in the original I/O context so that it is guaranteed to
	 * be larger than the error tolerance.
	 */
	if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
		struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
		struct btrfs_io_context *orig_bioc = orig_stripe->bioc;

		atomic_add(orig_bioc->max_errors, &orig_bioc->error);
	} else {
		orig_bbio->bio.bi_status = bbio->bio.bi_status;
	}
}

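/*
 * Complete a btrfs_bio. If this is a split bio, propagate any error to the
 * original bio and free the split, then complete the original once all of
 * its pending split bios have finished.
 */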
static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
{
	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
		struct btrfs_bio *orig_bbio = bbio->private;

		if (bbio->bio.bi_status)
			btrfs_bbio_propagate_error(bbio, orig_bbio);
		btrfs_cleanup_bio(bbio);
		bbio = orig_bbio;
	}

	if (atomic_dec_and_test(&bbio->pending_ios))
		__btrfs_bio_end_io(bbio);
}

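/*
 * Helpers to step forward/backward through the available mirrors for read
 * repair, wrapping around once the last (or first) mirror is reached.
 */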
static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == fbio->num_copies)
		return cur_mirror + 1 - fbio->num_copies;
	return cur_mirror + 1;
}

static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == 1)
		return fbio->num_copies;
	return cur_mirror - 1;
}

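/*
 * Drop one reference on the failure tracking structure and complete the
 * original bio once the last outstanding repair read has finished.
 */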
static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
	if (atomic_dec_and_test(&fbio->repair_count)) {
		btrfs_orig_bbio_end_io(fbio->bbio);
		mempool_free(fbio, &btrfs_failed_bio_pool);
	}
}

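/*
 * End I/O handler for a repair read. If the read from this mirror also failed
 * (either an I/O error or a checksum mismatch), resubmit to the next mirror.
 * On success, write the good data back to the mirror(s) that returned bad
 * data to restore the redundancy.
 */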
static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
				 struct btrfs_device *dev)
{
	struct btrfs_failed_bio *fbio = repair_bbio->private;
	struct btrfs_inode *inode = repair_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
	int mirror = repair_bbio->mirror_num;

	/*
	 * We can only trigger this for data bios, which don't support larger
	 * folios yet.
	 */
	ASSERT(folio_order(page_folio(bv->bv_page)) == 0);

	if (repair_bbio->bio.bi_status ||
	    !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
		repair_bbio->bio.bi_iter = repair_bbio->saved_iter;

		mirror = next_repair_mirror(fbio, mirror);
		if (mirror == fbio->bbio->mirror_num) {
			btrfs_debug(fs_info, "no mirror left");
			fbio->bbio->bio.bi_status = BLK_STS_IOERR;
			goto done;
		}

		btrfs_submit_bio(repair_bbio, mirror);
		return;
	}

	do {
		mirror = prev_repair_mirror(fbio, mirror);
		btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
				repair_bbio->file_offset, fs_info->sectorsize,
				repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
				page_folio(bv->bv_page), bv->bv_offset, mirror);
	} while (mirror != fbio->bbio->mirror_num);

done:
	btrfs_repair_done(fbio);
	bio_put(&repair_bbio->bio);
}

/*
 * Try to kick off a repair read to the next available mirror for a bad sector.
 *
 * This primarily tries to recover good data to serve the actual read request,
 * but also tries to write the good data back to the bad mirror(s) when a
 * read succeeded to restore the redundancy.
 */
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
						  u32 bio_offset,
						  struct bio_vec *bv,
						  struct btrfs_failed_bio *fbio)
{
	struct btrfs_inode *inode = failed_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
	struct btrfs_bio *repair_bbio;
	struct bio *repair_bio;
	int num_copies;
	int mirror;

	btrfs_debug(fs_info, "repair read error: read error at %llu",
		    failed_bbio->file_offset + bio_offset);

	num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
	if (num_copies == 1) {
		btrfs_debug(fs_info, "no copy to repair from");
		failed_bbio->bio.bi_status = BLK_STS_IOERR;
		return fbio;
	}

	if (!fbio) {
		fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
		fbio->bbio = failed_bbio;
		fbio->num_copies = num_copies;
		atomic_set(&fbio->repair_count, 1);
	}

	atomic_inc(&fbio->repair_count);

	repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
				      &btrfs_repair_bioset);
	repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
	__bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);

	repair_bbio = btrfs_bio(repair_bio);
	btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
	repair_bbio->inode = failed_bbio->inode;
	repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;

	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
	btrfs_submit_bio(repair_bbio, mirror);
	return fbio;
}

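/*
 * Verify the checksum of every sector in a completed data read and kick off
 * repair reads for any sector that failed, either due to an I/O error or a
 * checksum mismatch.
 */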
static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u32 sectorsize = fs_info->sectorsize;
	struct bvec_iter *iter = &bbio->saved_iter;
	blk_status_t status = bbio->bio.bi_status;
	struct btrfs_failed_bio *fbio = NULL;
	u32 offset = 0;

	/* Read-repair requires the inode field to be set by the submitter. */
	ASSERT(inode);

	/*
	 * Hand off repair bios to the repair code as there is no upper level
	 * submitter for them.
	 */
	if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
		btrfs_end_repair_bio(bbio, dev);
		return;
	}

	/* Clear the I/O error. A failed repair will reset it. */
	bbio->bio.bi_status = BLK_STS_OK;

	while (iter->bi_size) {
		struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);

		bv.bv_len = min(bv.bv_len, sectorsize);
		if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
			fbio = repair_one_sector(bbio, offset, &bv, fbio);

		bio_advance_iter_single(&bbio->bio, iter, sectorsize);
		offset += sectorsize;
	}

	if (bbio->csum != bbio->csum_inline)
		kfree(bbio->csum);

	if (fbio)
		btrfs_repair_done(fbio);
	else
		btrfs_orig_bbio_end_io(bbio);
}

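/* Bump the per-device error statistics for a failed I/O. */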
static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
{
	if (!dev || !dev->bdev)
		return;
	if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
		return;

	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
	else if (!(bio->bi_opf & REQ_RAHEAD))
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	if (bio->bi_opf & REQ_PREFLUSH)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
						struct bio *bio)
{
	if (bio->bi_opf & REQ_META)
		return fs_info->endio_meta_workers;
	return fs_info->endio_workers;
}

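/* Work item to finish a read bio in process context. */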
static void btrfs_end_bio_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);

	/* Metadata reads are checked and repaired by the submitter. */
	if (is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
	else
		btrfs_orig_bbio_end_io(bbio);
}

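/*
 * End I/O handler for bios mapped to a single device (the fast path set up in
 * __btrfs_submit_bio). Reads are punted to a workqueue for checksum
 * verification, writes are completed directly.
 */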
static void btrfs_simple_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_device *dev = bio->bi_private;
	struct btrfs_fs_info *fs_info = bbio->fs_info;

	btrfs_bio_counter_dec(fs_info);

	if (bio->bi_status)
		btrfs_log_dev_io_error(bio, dev);

	if (bio_op(bio) == REQ_OP_READ) {
		INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
		queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
	} else {
		if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
			btrfs_record_physical_zoned(bbio);
		btrfs_orig_bbio_end_io(bbio);
	}
}

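/* End I/O handler for parity RAID writes and read recovery. */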
static void btrfs_raid56_end_io(struct bio *bio)
{
	struct btrfs_io_context *bioc = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);
	bbio->mirror_num = bioc->mirror_num;
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, NULL);
	else
		btrfs_orig_bbio_end_io(bbio);

	btrfs_put_bioc(bioc);
}

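/*
 * End I/O handler for the bio embedded in the btrfs_bio when writing to
 * multiple mirrors. The error is only propagated to the higher layers if more
 * mirrors failed than the profile can tolerate.
 */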
static void btrfs_orig_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;
	struct btrfs_io_context *bioc = stripe->bioc;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);

	if (bio->bi_status) {
		atomic_inc(&bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/*
	 * Only send an error to the higher layers if it is beyond the tolerance
	 * threshold.
	 */
	if (atomic_read(&bioc->error) > bioc->max_errors)
		bio->bi_status = BLK_STS_IOERR;
	else
		bio->bi_status = BLK_STS_OK;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	btrfs_orig_bbio_end_io(bbio);
	btrfs_put_bioc(bioc);
}

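/* End I/O handler for the cloned bios used for the additional mirrors. */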
static void btrfs_clone_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;

	if (bio->bi_status) {
		atomic_inc(&stripe->bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	} else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	}

	/* Pass on control to the original bio this one was cloned from */
	bio_endio(stripe->bioc->orig_bio);
	bio_put(bio);
}

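/*
 * Submit a bio to a specific device, or fail it immediately if the device is
 * missing or not writeable for a write.
 */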
static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
	if (!dev || !dev->bdev ||
	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
	    (btrfs_op(bio) == BTRFS_MAP_WRITE &&
	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
		bio_io_error(bio);
		return;
	}

	bio_set_dev(bio, dev->bdev);

	/*
	 * For zone append writing, bi_sector must point to the beginning of
	 * the zone.
	 */
	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
		u64 zone_start = round_down(physical, dev->fs_info->zone_size);

		ASSERT(btrfs_dev_is_sequential(dev, physical));
		bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
	}
	btrfs_debug_in_rcu(dev->fs_info,
		"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
		__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
		dev->devid, bio->bi_iter.bi_size);

	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
		blkcg_punt_bio_submit(bio);
	else
		submit_bio(bio);
}

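/*
 * Submit the write for one mirror. The last mirror reuses the bio embedded
 * in the btrfs_bio, all others get a clone.
 */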
static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
	struct bio *orig_bio = bioc->orig_bio, *bio;

	ASSERT(bio_op(orig_bio) != REQ_OP_READ);

	/* Reuse the bio embedded into the btrfs_bio for the last mirror */
	if (dev_nr == bioc->num_stripes - 1) {
		bio = orig_bio;
		bio->bi_end_io = btrfs_orig_write_end_io;
	} else {
		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
		bio_inc_remaining(orig_bio);
		bio->bi_end_io = btrfs_clone_write_end_io;
	}

	bio->bi_private = &bioc->stripes[dev_nr];
	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
	bioc->stripes[dev_nr].bioc = bioc;
	bioc->size = bio->bi_iter.bi_size;
	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}

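/*
 * Dispatch a mapped bio: single device fast path, parity RAID, or mirrored
 * writes, depending on the mapping returned by btrfs_map_block().
 */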
static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
			       struct btrfs_io_stripe *smap, int mirror_num)
{
	if (!bioc) {
		/* Single mirror read/write fast path. */
		btrfs_bio(bio)->mirror_num = mirror_num;
		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
		if (bio_op(bio) != REQ_OP_READ)
			btrfs_bio(bio)->orig_physical = smap->physical;
		bio->bi_private = smap->dev;
		bio->bi_end_io = btrfs_simple_end_io;
		btrfs_submit_dev_bio(smap->dev, bio);
	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Parity RAID write or read recovery. */
		bio->bi_private = bioc;
		bio->bi_end_io = btrfs_raid56_end_io;
		if (bio_op(bio) == REQ_OP_READ)
			raid56_parity_recover(bio, bioc, mirror_num);
		else
			raid56_parity_write(bio, bioc);
	} else {
		/* Write to multiple mirrors. */
		int total_devs = bioc->num_stripes;

		bioc->orig_bio = bio;
		for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
			btrfs_submit_mirrored_bio(bioc, dev_nr);
	}
}

static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio)
{
	if (bbio->bio.bi_opf & REQ_META)
		return btree_csum_one_bio(bbio);
	return btrfs_csum_one_bio(bbio);
}

/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 */
struct async_submit_bio {
	struct btrfs_bio *bbio;
	struct btrfs_io_context *bioc;
	struct btrfs_io_stripe smap;
	int mirror_num;
	struct btrfs_work work;
};

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the btree.
 */
static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	blk_status_t ret;

	ret = btrfs_bio_csum(async->bbio);
	if (ret)
		async->bbio->bio.bi_status = ret;
}

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the tree.
 *
 * If called with @do_free == true, then it will free the work struct.
 */
static void run_one_async_done(struct btrfs_work *work, bool do_free)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	struct bio *bio = &async->bbio->bio;

	if (do_free) {
		kfree(container_of(work, struct async_submit_bio, work));
		return;
	}

	/* If an error occurred we just want to clean up the bio and move on. */
	if (bio->bi_status) {
		btrfs_orig_bbio_end_io(async->bbio);
		return;
	}

	/*
	 * All of the bios that pass through here are from async helpers.
	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
	 * context. This changes nothing when cgroups aren't in use.
	 */
	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
	__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}

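/*
 * Decide whether checksumming for a write should be offloaded to a workqueue
 * or done synchronously in the submission context.
 */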
static bool should_async_write(struct btrfs_bio *bbio)
{
	bool auto_csum_mode = true;

#ifdef CONFIG_BTRFS_DEBUG
	struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
	enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);

	if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF)
		return false;

	auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO);
#endif

	/* Submit synchronously if the checksum implementation is fast. */
	if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
		return false;

	/*
	 * Try to defer the submission to a workqueue to parallelize the
	 * checksum calculation unless the I/O is issued synchronously.
	 */
	if (op_is_sync(bbio->bio.bi_opf))
		return false;

	/* Zoned devices require I/O to be submitted in order. */
	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
		return false;

	return true;
}

/*
 * Submit bio to an async queue.
 *
 * Return true if the work has been successfully submitted, else false.
 */
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
				struct btrfs_io_context *bioc,
				struct btrfs_io_stripe *smap, int mirror_num)
{
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return false;

	async->bbio = bbio;
	async->bioc = bioc;
	async->smap = *smap;
	async->mirror_num = mirror_num;

	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
	btrfs_queue_work(fs_info->workers, &async->work);
	return true;
}

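/*
 * Map and submit at most one chunk worth of a btrfs_bio, splitting it if it
 * crosses a chunk/stripe boundary. Returns true if the whole bio was
 * consumed, false if the caller needs to submit the remainder.
 */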
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct btrfs_bio *orig_bbio = bbio;
	struct bio *bio = &bbio->bio;
	u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bio->bi_iter.bi_size;
	u64 map_length = length;
	bool use_append = btrfs_use_zone_append(bbio);
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_io_stripe smap;
	blk_status_t ret;
	int error;

	smap.is_scrub = !bbio->inode;

	btrfs_bio_counter_inc_blocked(fs_info);
	error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
				&bioc, &smap, &mirror_num);
	if (error) {
		ret = errno_to_blk_status(error);
		goto fail;
	}

	map_length = min(map_length, length);
	if (use_append)
		map_length = min(map_length, fs_info->max_zone_append_size);

	if (map_length < length) {
		bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
		bio = &bbio->bio;
	}

	/*
	 * Save the iter for the end_io handler and preload the checksums for
	 * data reads.
	 */
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
		bbio->saved_iter = bio->bi_iter;
		ret = btrfs_lookup_bio_sums(bbio);
		if (ret)
			goto fail_put_bio;
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		if (use_append) {
			bio->bi_opf &= ~REQ_OP_WRITE;
			bio->bi_opf |= REQ_OP_ZONE_APPEND;
		}

		if (is_data_bbio(bbio) && bioc &&
		    btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
			/*
			 * No locking for the list update, as we only add to
			 * the list in the I/O submission path, and list
			 * iteration only happens in the completion path, which
			 * can't happen until after the last submission.
			 */
			btrfs_get_bioc(bioc);
			list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
		}

		/*
		 * Csum items for reloc roots have already been cloned at this
		 * point, so they are handled as part of the no-checksum case.
		 */
		if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
		    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
		    !btrfs_is_data_reloc_root(inode->root)) {
			if (should_async_write(bbio) &&
			    btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
				goto done;

			ret = btrfs_bio_csum(bbio);
			if (ret)
				goto fail_put_bio;
		} else if (use_append) {
			ret = btrfs_alloc_dummy_sum(bbio);
			if (ret)
				goto fail_put_bio;
		}
	}

	__btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
	return map_length == length;

fail_put_bio:
	if (map_length < length)
		btrfs_cleanup_bio(bbio);
fail:
	btrfs_bio_counter_dec(fs_info);
	btrfs_bio_end_io(orig_bbio, ret);
	/* Do not submit another chunk */
	return true;
}

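/*
 * Submit a btrfs_bio, splitting and submitting it chunk by chunk until the
 * whole bio has been mapped.
 */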
void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
{
	/* If bbio->inode is not populated, its file_offset must be 0. */
	ASSERT(bbio->inode || bbio->file_offset == 0);

	while (!btrfs_submit_chunk(bbio, mirror_num))
		;
}

/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
 * RAID setup. Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
			    u64 length, u64 logical, struct folio *folio,
			    unsigned int folio_offset, int mirror_num)
{
	struct btrfs_io_stripe smap = { 0 };
	struct bio_vec bvec;
	struct bio bio;
	int ret = 0;

	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
	BUG_ON(!mirror_num);

	if (btrfs_repair_one_zone(fs_info, logical))
		return 0;

	/*
	 * Avoid races with device replace and make sure our bioc has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto out_counter_dec;

	if (!smap.dev->bdev ||
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
		ret = -EIO;
		goto out_counter_dec;
	}

	bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
	bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
	ret = bio_add_folio(&bio, folio, length, folio_offset);
	ASSERT(ret);
	ret = submit_bio_wait(&bio);
	if (ret) {
		/* try to remap that extent elsewhere? */
		btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
		goto out_bio_uninit;
	}

	btrfs_info_rl_in_rcu(fs_info,
		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
			     ino, start, btrfs_dev_name(smap.dev),
			     smap.physical >> SECTOR_SHIFT);
	ret = 0;

out_bio_uninit:
	bio_uninit(&bio);
out_counter_dec:
	btrfs_bio_counter_dec(fs_info);
	return ret;
}

/*
 * Submit a btrfs_bio based repair write.
 *
 * If @dev_replace is true, the write will be submitted to the dev-replace
 * target device.
 */
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bbio->bio.bi_iter.bi_size;
	struct btrfs_io_stripe smap = { 0 };
	int ret;

	ASSERT(fs_info);
	ASSERT(mirror_num > 0);
	ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
	ASSERT(!bbio->inode);

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto fail;

	if (dev_replace) {
		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
		smap.dev = fs_info->dev_replace.tgtdev;
	}
	__btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
	return;

fail:
	btrfs_bio_counter_dec(fs_info);
	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
}

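/* Allocate the bio sets and mempool used by the btrfs I/O stack. */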
int __init btrfs_bioset_init(void)
{
	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;
	if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio), 0))
		goto out_free_bioset;
	if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_clone_bioset;
	if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
				      sizeof(struct btrfs_failed_bio)))
		goto out_free_repair_bioset;
	return 0;

out_free_repair_bioset:
	bioset_exit(&btrfs_repair_bioset);
out_free_clone_bioset:
	bioset_exit(&btrfs_clone_bioset);
out_free_bioset:
	bioset_exit(&btrfs_bioset);
	return -ENOMEM;
}

void __cold btrfs_bioset_exit(void)
{
	mempool_exit(&btrfs_failed_bio_pool);
	bioset_exit(&btrfs_repair_bioset);
	bioset_exit(&btrfs_clone_bioset);
	bioset_exit(&btrfs_bioset);
}