1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> |
4 | * Copyright 2012 Google, Inc. |
5 | */ |
6 | |
7 | #include "bcachefs.h" |
8 | #include "alloc_foreground.h" |
9 | #include "bkey_buf.h" |
10 | #include "bset.h" |
11 | #include "btree_update.h" |
12 | #include "buckets.h" |
13 | #include "checksum.h" |
14 | #include "clock.h" |
15 | #include "compress.h" |
16 | #include "debug.h" |
17 | #include "ec.h" |
18 | #include "error.h" |
19 | #include "extent_update.h" |
20 | #include "inode.h" |
21 | #include "io_write.h" |
22 | #include "journal.h" |
23 | #include "keylist.h" |
24 | #include "move.h" |
25 | #include "nocow_locking.h" |
26 | #include "rebalance.h" |
27 | #include "subvolume.h" |
28 | #include "super.h" |
29 | #include "super-io.h" |
30 | #include "trace.h" |
31 | |
32 | #include <linux/blkdev.h> |
33 | #include <linux/prefetch.h> |
34 | #include <linux/random.h> |
35 | #include <linux/sched/mm.h> |
36 | |
37 | #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT |
38 | |
39 | static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, |
40 | u64 now, int rw) |
41 | { |
42 | u64 latency_capable = |
43 | ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; |
44 | /* ideally we'd be taking into account the device's variance here: */ |
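/* reads are considered congested past roughly 4x the device's typical latency, writes past roughly 8x: */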
45 | u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); |
46 | s64 latency_over = io_latency - latency_threshold; |
47 | |
48 | if (latency_threshold && latency_over > 0) { |
49 | /* |
50 | * bump up congested by approximately latency_over * 4 / |
51 | * latency_threshold - we don't need much accuracy here so don't |
52 | * bother with the divide: |
53 | */ |
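/* e.g. latency_over == 2 * latency_threshold bumps the congested counter by roughly 8 */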
54 | if (atomic_read(&ca->congested) < CONGESTED_MAX) |
55 | atomic_add(latency_over >> |
56 | max_t(int, ilog2(latency_threshold) - 2, 0), |
57 | &ca->congested); |
58 | |
59 | ca->congested_last = now; |
60 | } else if (atomic_read(&ca->congested) > 0) { |
61 | atomic_dec(&ca->congested); |
62 | } |
63 | } |
64 | |
65 | void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) |
66 | { |
67 | atomic64_t *latency = &ca->cur_latency[rw]; |
68 | u64 now = local_clock(); |
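/* local_clock() isn't guaranteed monotonic across CPUs, so guard against a negative delta: */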
69 | u64 io_latency = time_after64(now, submit_time) |
70 | ? now - submit_time |
71 | : 0; |
72 | u64 old, new, v = atomic64_read(latency); |
73 | |
74 | do { |
75 | old = v; |
76 | |
77 | /* |
78 | * If the io latency was reasonably close to the current |
79 | * latency, skip doing the update and atomic operation - most of |
80 | * the time: |
81 | */ |
82 | if (abs((int) (old - io_latency)) < (old >> 1) && |
83 | now & ~(~0U << 5)) |
84 | break; |
85 | |
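/* otherwise fold the sample into the moving average (new samples weighted roughly 1/32): */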
86 | new = ewma_add(old, io_latency, 5); |
87 | } while ((v = atomic64_cmpxchg(latency, old, new)) != old); |
88 | |
89 | bch2_congested_acct(ca, io_latency, now, rw); |
90 | |
91 | __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); |
92 | } |
93 | |
94 | #endif |
95 | |
96 | /* Allocate, free from mempool: */ |
97 | |
98 | void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) |
99 | { |
100 | struct bvec_iter_all iter; |
101 | struct bio_vec *bv; |
102 | |
103 | bio_for_each_segment_all(bv, bio, iter) |
104 | if (bv->bv_page != ZERO_PAGE(0)) |
105 | mempool_free(bv->bv_page, &c->bio_bounce_pages); |
106 | bio->bi_vcnt = 0; |
107 | } |
108 | |
109 | static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) |
110 | { |
111 | struct page *page; |
112 | |
113 | if (likely(!*using_mempool)) { |
114 | page = alloc_page(GFP_NOFS); |
115 | if (unlikely(!page)) { |
116 | mutex_lock(&c->bio_bounce_pages_lock); |
117 | *using_mempool = true; |
118 | goto pool_alloc; |
119 | |
120 | } |
121 | } else { |
122 | pool_alloc: |
123 | page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); |
124 | } |
125 | |
126 | return page; |
127 | } |
128 | |
129 | void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, |
130 | size_t size) |
131 | { |
132 | bool using_mempool = false; |
133 | |
134 | while (size) { |
135 | struct page *page = __bio_alloc_page_pool(c, &using_mempool); |
136 | unsigned len = min_t(size_t, PAGE_SIZE, size); |
137 | |
138 | BUG_ON(!bio_add_page(bio, page, len, 0)); |
139 | size -= len; |
140 | } |
141 | |
142 | if (using_mempool) |
143 | mutex_unlock(&c->bio_bounce_pages_lock); |
144 | } |
145 | |
146 | /* Extent update path: */ |
147 | |
148 | int bch2_sum_sector_overwrites(struct btree_trans *trans, |
149 | struct btree_iter *extent_iter, |
150 | struct bkey_i *new, |
151 | bool *usage_increasing, |
152 | s64 *i_sectors_delta, |
153 | s64 *disk_sectors_delta) |
154 | { |
155 | struct bch_fs *c = trans->c; |
156 | struct btree_iter iter; |
157 | struct bkey_s_c old; |
158 | unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); |
159 | bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); |
160 | int ret = 0; |
161 | |
162 | *usage_increasing = false; |
163 | *i_sectors_delta = 0; |
164 | *disk_sectors_delta = 0; |
165 | |
166 | bch2_trans_copy_iter(&iter, extent_iter); |
167 | |
168 | for_each_btree_key_upto_continue_norestart(iter, |
169 | new->k.p, BTREE_ITER_SLOTS, old, ret) { |
170 | s64 sectors = min(new->k.p.offset, old.k->p.offset) - |
171 | max(bkey_start_offset(&new->k), |
172 | bkey_start_offset(old.k)); |
173 | |
174 | *i_sectors_delta += sectors * |
175 | (bkey_extent_is_allocation(&new->k) - |
176 | bkey_extent_is_allocation(old.k)); |
177 | |
178 | *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); |
179 | *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot |
180 | ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old) |
181 | : 0; |
182 | |
183 | if (!*usage_increasing && |
184 | (new->k.p.snapshot != old.k->p.snapshot || |
185 | new_replicas > bch2_bkey_replicas(c, old) || |
186 | (!new_compressed && bch2_bkey_sectors_compressed(old)))) |
187 | *usage_increasing = true; |
188 | |
189 | if (bkey_ge(old.k->p, new->k.p)) |
190 | break; |
191 | } |
192 | |
193 | bch2_trans_iter_exit(trans, &iter); |
194 | return ret; |
195 | } |
196 | |
197 | static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, |
198 | struct btree_iter *extent_iter, |
199 | u64 new_i_size, |
200 | s64 i_sectors_delta) |
201 | { |
202 | struct btree_iter iter; |
203 | struct bkey_i *k; |
204 | struct bkey_i_inode_v3 *inode; |
205 | /* |
206 | * Crazy performance optimization: |
207 | * Every extent update needs to also update the inode: the inode trigger |
208 | * will set bi->journal_seq to the journal sequence number of this |
209 | * transaction - for fsync. |
210 | * |
211 | * But if that's the only reason we're updating the inode (we're not |
212 | * updating bi_size or bi_sectors), then we don't need the inode update |
213 | * to be journalled - if we crash, the bi_journal_seq update will be |
214 | * lost, but that's fine. |
215 | */ |
216 | unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; |
217 | int ret; |
218 | |
219 | k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes, |
220 | SPOS(0, |
221 | extent_iter->pos.inode, |
222 | extent_iter->snapshot), |
223 | BTREE_ITER_CACHED); |
224 | ret = PTR_ERR_OR_ZERO(k); |
225 | if (unlikely(ret)) |
226 | return ret; |
227 | |
228 | if (unlikely(k->k.type != KEY_TYPE_inode_v3)) { |
229 | k = bch2_inode_to_v3(trans, k); |
230 | ret = PTR_ERR_OR_ZERO(k); |
231 | if (unlikely(ret)) |
232 | goto err; |
233 | } |
234 | |
235 | inode = bkey_i_to_inode_v3(k); |
236 | |
237 | if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) && |
238 | new_i_size > le64_to_cpu(inode->v.bi_size)) { |
239 | inode->v.bi_size = cpu_to_le64(new_i_size); |
240 | inode_update_flags = 0; |
241 | } |
242 | |
243 | if (i_sectors_delta) { |
244 | le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); |
245 | inode_update_flags = 0; |
246 | } |
247 | |
248 | if (inode->k.p.snapshot != iter.snapshot) { |
249 | inode->k.p.snapshot = iter.snapshot; |
250 | inode_update_flags = 0; |
251 | } |
252 | |
253 | ret = bch2_trans_update(trans, &iter, &inode->k_i, |
254 | BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| |
255 | inode_update_flags); |
256 | err: |
257 | bch2_trans_iter_exit(trans, &iter); |
258 | return ret; |
259 | } |
260 | |
261 | int bch2_extent_update(struct btree_trans *trans, |
262 | subvol_inum inum, |
263 | struct btree_iter *iter, |
264 | struct bkey_i *k, |
265 | struct disk_reservation *disk_res, |
266 | u64 new_i_size, |
267 | s64 *i_sectors_delta_total, |
268 | bool check_enospc) |
269 | { |
270 | struct bpos next_pos; |
271 | bool usage_increasing; |
272 | s64 i_sectors_delta = 0, disk_sectors_delta = 0; |
273 | int ret; |
274 | |
275 | /* |
276 | * This traverses us the iterator without changing iter->path->pos to |
277 | * search_key() (which is pos + 1 for extents): we want there to be a |
278 | * path already traversed at iter->pos because |
279 | * bch2_trans_extent_update() will use it to attempt extent merging |
280 | */ |
281 | ret = __bch2_btree_iter_traverse(iter); |
282 | if (ret) |
283 | return ret; |
284 | |
285 | ret = bch2_extent_trim_atomic(trans, iter, k); |
286 | if (ret) |
287 | return ret; |
288 | |
289 | next_pos = k->k.p; |
290 | |
291 | ret = bch2_sum_sector_overwrites(trans, iter, k, |
292 | &usage_increasing, |
293 | &i_sectors_delta, |
294 | &disk_sectors_delta); |
295 | if (ret) |
296 | return ret; |
297 | |
298 | if (disk_res && |
299 | disk_sectors_delta > (s64) disk_res->sectors) { |
300 | ret = bch2_disk_reservation_add(trans->c, disk_res, |
301 | disk_sectors_delta - disk_res->sectors, |
302 | !check_enospc || !usage_increasing |
303 | ? BCH_DISK_RESERVATION_NOFAIL : 0); |
304 | if (ret) |
305 | return ret; |
306 | } |
307 | |
308 | /* |
309 | * Note: |
310 | * We always have to do an inode update - even when i_size/i_sectors |
311 | * aren't changing - for fsync to work properly; fsync relies on |
312 | * inode->bi_journal_seq which is updated by the trigger code: |
313 | */ |
314 | ret = bch2_extent_update_i_size_sectors(trans, iter, |
315 | min(k->k.p.offset << 9, new_i_size), |
316 | i_sectors_delta) ?: |
317 | bch2_trans_update(trans, iter, k, 0) ?: |
318 | bch2_trans_commit(trans, disk_res, NULL, |
319 | BCH_TRANS_COMMIT_no_check_rw| |
320 | BCH_TRANS_COMMIT_no_enospc); |
321 | if (unlikely(ret)) |
322 | return ret; |
323 | |
324 | if (i_sectors_delta_total) |
325 | *i_sectors_delta_total += i_sectors_delta; |
326 | bch2_btree_iter_set_pos(iter, next_pos); |
327 | return 0; |
328 | } |
329 | |
330 | static int bch2_write_index_default(struct bch_write_op *op) |
331 | { |
332 | struct bch_fs *c = op->c; |
333 | struct bkey_buf sk; |
334 | struct keylist *keys = &op->insert_keys; |
335 | struct bkey_i *k = bch2_keylist_front(keys); |
336 | struct btree_trans *trans = bch2_trans_get(c); |
337 | struct btree_iter iter; |
338 | subvol_inum inum = { |
339 | .subvol = op->subvol, |
340 | .inum = k->k.p.inode, |
341 | }; |
342 | int ret; |
343 | |
344 | BUG_ON(!inum.subvol); |
345 | |
346 | bch2_bkey_buf_init(&sk); |
347 | |
348 | do { |
349 | bch2_trans_begin(trans); |
350 | |
351 | k = bch2_keylist_front(keys); |
352 | bch2_bkey_buf_copy(&sk, c, k); |
353 | |
354 | ret = bch2_subvolume_get_snapshot(trans, inum.subvol, |
355 | &sk.k->k.p.snapshot); |
356 | if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) |
357 | continue; |
358 | if (ret) |
359 | break; |
360 | |
361 | bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, |
362 | bkey_start_pos(&sk.k->k), |
363 | BTREE_ITER_SLOTS|BTREE_ITER_INTENT); |
364 | |
365 | ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: |
366 | bch2_extent_update(trans, inum, &iter, sk.k, |
367 | &op->res, |
368 | op->new_i_size, &op->i_sectors_delta, |
369 | op->flags & BCH_WRITE_CHECK_ENOSPC); |
370 | bch2_trans_iter_exit(trans, &iter); |
371 | |
372 | if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) |
373 | continue; |
374 | if (ret) |
375 | break; |
376 | |
377 | if (bkey_ge(iter.pos, k->k.p)) |
378 | bch2_keylist_pop_front(&op->insert_keys); |
379 | else |
380 | bch2_cut_front(iter.pos, k); |
381 | } while (!bch2_keylist_empty(keys)); |
382 | |
383 | bch2_trans_put(trans); |
384 | bch2_bkey_buf_exit(&sk, c); |
385 | |
386 | return ret; |
387 | } |
388 | |
389 | /* Writes */ |
390 | |
391 | void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, |
392 | enum bch_data_type type, |
393 | const struct bkey_i *k, |
394 | bool nocow) |
395 | { |
396 | struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); |
397 | struct bch_write_bio *n; |
398 | |
399 | BUG_ON(c->opts.nochanges); |
400 | |
401 | bkey_for_each_ptr(ptrs, ptr) { |
402 | BUG_ON(!bch2_dev_exists2(c, ptr->dev)); |
403 | |
404 | struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); |
405 | |
406 | if (to_entry(ptr + 1) < ptrs.end) { |
407 | n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, |
408 | GFP_NOFS, &ca->replica_set)); |
409 | |
410 | n->bio.bi_end_io = wbio->bio.bi_end_io; |
411 | n->bio.bi_private = wbio->bio.bi_private; |
412 | n->parent = wbio; |
413 | n->split = true; |
414 | n->bounce = false; |
415 | n->put_bio = true; |
416 | n->bio.bi_opf = wbio->bio.bi_opf; |
417 | bio_inc_remaining(&wbio->bio); |
418 | } else { |
419 | n = wbio; |
420 | n->split = false; |
421 | } |
422 | |
423 | n->c = c; |
424 | n->dev = ptr->dev; |
425 | n->have_ioref = nocow || bch2_dev_get_ioref(ca, |
426 | type == BCH_DATA_btree ? READ : WRITE); |
427 | n->nocow = nocow; |
428 | n->submit_time = local_clock(); |
429 | n->inode_offset = bkey_start_offset(&k->k); |
430 | n->bio.bi_iter.bi_sector = ptr->offset; |
431 | |
432 | if (likely(n->have_ioref)) { |
433 | this_cpu_add(ca->io_done->sectors[WRITE][type], |
434 | bio_sectors(&n->bio)); |
435 | |
436 | bio_set_dev(&n->bio, ca->disk_sb.bdev); |
437 | |
438 | if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { |
439 | bio_endio(&n->bio); |
440 | continue; |
441 | } |
442 | |
443 | submit_bio(&n->bio); |
444 | } else { |
445 | n->bio.bi_status = BLK_STS_REMOVED; |
446 | bio_endio(&n->bio); |
447 | } |
448 | } |
449 | } |
450 | |
451 | static void __bch2_write(struct bch_write_op *); |
452 | |
453 | static void bch2_write_done(struct closure *cl) |
454 | { |
455 | struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); |
456 | struct bch_fs *c = op->c; |
457 | |
458 | EBUG_ON(op->open_buckets.nr); |
459 | |
460 | bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); |
461 | bch2_disk_reservation_put(c, &op->res); |
462 | |
463 | if (!(op->flags & BCH_WRITE_MOVE)) |
464 | bch2_write_ref_put(c, BCH_WRITE_REF_write); |
465 | bch2_keylist_free(&op->insert_keys, op->inline_keys); |
466 | |
467 | EBUG_ON(cl->parent); |
468 | closure_debug_destroy(cl); |
469 | if (op->end_io) |
470 | op->end_io(op); |
471 | } |
472 | |
473 | static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) |
474 | { |
475 | struct keylist *keys = &op->insert_keys; |
476 | struct bch_extent_ptr *ptr; |
477 | struct bkey_i *src, *dst = keys->keys, *n; |
478 | |
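/* Drop pointers to devices that failed the write and compact the keylist in place; if a key loses all its pointers the whole write has failed: */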
479 | for (src = keys->keys; src != keys->top; src = n) { |
480 | n = bkey_next(src); |
481 | |
482 | if (bkey_extent_is_direct_data(&src->k)) { |
483 | bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, |
484 | test_bit(ptr->dev, op->failed.d)); |
485 | |
486 | if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) |
487 | return -EIO; |
488 | } |
489 | |
490 | if (dst != src) |
491 | memmove_u64s_down(dst, src, src->k.u64s); |
492 | dst = bkey_next(dst); |
493 | } |
494 | |
495 | keys->top = dst; |
496 | return 0; |
497 | } |
498 | |
499 | /** |
500 | * __bch2_write_index - after a write, update index to point to new data |
501 | * @op: bch_write_op to process |
502 | */ |
503 | static void __bch2_write_index(struct bch_write_op *op) |
504 | { |
505 | struct bch_fs *c = op->c; |
506 | struct keylist *keys = &op->insert_keys; |
507 | unsigned dev; |
508 | int ret = 0; |
509 | |
510 | if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { |
511 | ret = bch2_write_drop_io_error_ptrs(op); |
512 | if (ret) |
513 | goto err; |
514 | } |
515 | |
516 | if (!bch2_keylist_empty(keys)) { |
517 | u64 sectors_start = keylist_sectors(keys); |
518 | |
519 | ret = !(op->flags & BCH_WRITE_MOVE) |
520 | ? bch2_write_index_default(op) |
521 | : bch2_data_update_index_update(op); |
522 | |
523 | BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); |
524 | BUG_ON(keylist_sectors(keys) && !ret); |
525 | |
526 | op->written += sectors_start - keylist_sectors(keys); |
527 | |
528 | if (ret && !bch2_err_matches(ret, EROFS)) { |
529 | struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); |
530 | |
531 | bch_err_inum_offset_ratelimited(c, |
532 | insert->k.p.inode, insert->k.p.offset << 9, |
533 | "%s write error while doing btree update: %s" , |
534 | op->flags & BCH_WRITE_MOVE ? "move" : "user" , |
535 | bch2_err_str(ret)); |
536 | } |
537 | |
538 | if (ret) |
539 | goto err; |
540 | } |
541 | out: |
542 | /* If a bucket wasn't written, we can't erasure code it: */ |
543 | for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) |
544 | bch2_open_bucket_write_error(c, &op->open_buckets, dev); |
545 | |
546 | bch2_open_buckets_put(c, &op->open_buckets); |
547 | return; |
548 | err: |
549 | keys->top = keys->keys; |
550 | op->error = ret; |
551 | op->flags |= BCH_WRITE_DONE; |
552 | goto out; |
553 | } |
554 | |
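/* Account time spent in the previous write point state before switching to the new one: */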
555 | static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) |
556 | { |
557 | if (state != wp->state) { |
558 | u64 now = ktime_get_ns(); |
559 | |
560 | if (wp->last_state_change && |
561 | time_after64(now, wp->last_state_change)) |
562 | wp->time[wp->state] += now - wp->last_state_change; |
563 | wp->state = state; |
564 | wp->last_state_change = now; |
565 | } |
566 | } |
567 | |
568 | static inline void wp_update_state(struct write_point *wp, bool running) |
569 | { |
570 | enum write_point_state state; |
571 | |
572 | state = running ? WRITE_POINT_running : |
573 | !list_empty(&wp->writes) ? WRITE_POINT_waiting_io |
574 | : WRITE_POINT_stopped; |
575 | |
576 | __wp_update_state(wp, state); |
577 | } |
578 | |
579 | static CLOSURE_CALLBACK(bch2_write_index) |
580 | { |
581 | closure_type(op, struct bch_write_op, cl); |
582 | struct write_point *wp = op->wp; |
583 | struct workqueue_struct *wq = index_update_wq(op); |
584 | unsigned long flags; |
585 | |
586 | if ((op->flags & BCH_WRITE_DONE) && |
587 | (op->flags & BCH_WRITE_MOVE)) |
588 | bch2_bio_free_pages_pool(op->c, &op->wbio.bio); |
589 | |
590 | spin_lock_irqsave(&wp->writes_lock, flags); |
591 | if (wp->state == WRITE_POINT_waiting_io) |
592 | __wp_update_state(wp, WRITE_POINT_waiting_work); |
593 | list_add_tail(&op->wp_list, &wp->writes); |
594 | spin_unlock_irqrestore(&wp->writes_lock, flags); |
595 | |
596 | queue_work(wq, &wp->index_update_work); |
597 | } |
598 | |
599 | static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) |
600 | { |
601 | op->wp = wp; |
602 | |
603 | if (wp->state == WRITE_POINT_stopped) { |
604 | spin_lock_irq(&wp->writes_lock); |
605 | __wp_update_state(wp, WRITE_POINT_waiting_io); |
606 | spin_unlock_irq(&wp->writes_lock); |
607 | } |
608 | } |
609 | |
610 | void bch2_write_point_do_index_updates(struct work_struct *work) |
611 | { |
612 | struct write_point *wp = |
613 | container_of(work, struct write_point, index_update_work); |
614 | struct bch_write_op *op; |
615 | |
616 | while (1) { |
617 | spin_lock_irq(&wp->writes_lock); |
618 | op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); |
619 | if (op) |
620 | list_del(&op->wp_list); |
621 | wp_update_state(wp, op != NULL); |
622 | spin_unlock_irq(&wp->writes_lock); |
623 | |
624 | if (!op) |
625 | break; |
626 | |
627 | op->flags |= BCH_WRITE_IN_WORKER; |
628 | |
629 | __bch2_write_index(op); |
630 | |
631 | if (!(op->flags & BCH_WRITE_DONE)) |
632 | __bch2_write(op); |
633 | else |
634 | bch2_write_done(&op->cl); |
635 | } |
636 | } |
637 | |
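/* Completion handler for data write bios: records errors and latency, drops our device ref, then wakes the parent bio or closure: */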
638 | static void bch2_write_endio(struct bio *bio) |
639 | { |
640 | struct closure *cl = bio->bi_private; |
641 | struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); |
642 | struct bch_write_bio *wbio = to_wbio(bio); |
643 | struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; |
644 | struct bch_fs *c = wbio->c; |
645 | struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); |
646 | |
647 | if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, |
648 | op->pos.inode, |
649 | wbio->inode_offset << 9, |
650 | "data write error: %s" , |
651 | bch2_blk_status_to_str(bio->bi_status))) { |
652 | set_bit(wbio->dev, op->failed.d); |
653 | op->flags |= BCH_WRITE_IO_ERROR; |
654 | } |
655 | |
656 | if (wbio->nocow) |
657 | set_bit(wbio->dev, op->devs_need_flush->d); |
658 | |
659 | if (wbio->have_ioref) { |
660 | bch2_latency_acct(ca, wbio->submit_time, WRITE); |
661 | percpu_ref_put(&ca->io_ref); |
662 | } |
663 | |
664 | if (wbio->bounce) |
665 | bch2_bio_free_pages_pool(c, bio); |
666 | |
667 | if (wbio->put_bio) |
668 | bio_put(bio); |
669 | |
670 | if (parent) |
671 | bio_endio(&parent->bio); |
672 | else |
673 | closure_put(cl); |
674 | } |
675 | |
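/* Append an extent key for the sectors just allocated to op->insert_keys; the btree update happens later in the index update path: */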
676 | static void init_append_extent(struct bch_write_op *op, |
677 | struct write_point *wp, |
678 | struct bversion version, |
679 | struct bch_extent_crc_unpacked crc) |
680 | { |
681 | struct bkey_i_extent *e; |
682 | |
683 | op->pos.offset += crc.uncompressed_size; |
684 | |
685 | e = bkey_extent_init(op->insert_keys.top); |
686 | e->k.p = op->pos; |
687 | e->k.size = crc.uncompressed_size; |
688 | e->k.version = version; |
689 | |
690 | if (crc.csum_type || |
691 | crc.compression_type || |
692 | crc.nonce) |
693 | bch2_extent_crc_append(&e->k_i, crc); |
694 | |
695 | bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, |
696 | op->flags & BCH_WRITE_CACHED); |
697 | |
698 | bch2_keylist_push(&op->insert_keys); |
699 | } |
700 | |
701 | static struct bio *bch2_write_bio_alloc(struct bch_fs *c, |
702 | struct write_point *wp, |
703 | struct bio *src, |
704 | bool *page_alloc_failed, |
705 | void *buf) |
706 | { |
707 | struct bch_write_bio *wbio; |
708 | struct bio *bio; |
709 | unsigned output_available = |
710 | min(wp->sectors_free << 9, src->bi_iter.bi_size); |
711 | unsigned pages = DIV_ROUND_UP(output_available + |
712 | (buf |
713 | ? ((unsigned long) buf & (PAGE_SIZE - 1)) |
714 | : 0), PAGE_SIZE); |
715 | |
716 | pages = min(pages, BIO_MAX_VECS); |
717 | |
718 | bio = bio_alloc_bioset(NULL, pages, 0, |
719 | GFP_NOFS, &c->bio_write); |
720 | wbio = wbio_init(bio); |
721 | wbio->put_bio = true; |
722 | /* copy WRITE_SYNC flag */ |
723 | wbio->bio.bi_opf = src->bi_opf; |
724 | |
725 | if (buf) { |
726 | bch2_bio_map(bio, buf, output_available); |
727 | return bio; |
728 | } |
729 | |
730 | wbio->bounce = true; |
731 | |
732 | /* |
733 | * We can't use mempool for more than c->sb.encoded_extent_max |
734 | * worth of pages, but we'd like to allocate more if we can: |
735 | */ |
736 | bch2_bio_alloc_pages_pool(c, bio, |
737 | min_t(unsigned, output_available, |
738 | c->opts.encoded_extent_max)); |
739 | |
740 | if (bio->bi_iter.bi_size < output_available) |
741 | *page_alloc_failed = |
742 | bch2_bio_alloc_pages(bio, |
743 | output_available - |
744 | bio->bi_iter.bi_size, |
745 | GFP_NOFS) != 0; |
746 | |
747 | return bio; |
748 | } |
749 | |
750 | static int bch2_write_rechecksum(struct bch_fs *c, |
751 | struct bch_write_op *op, |
752 | unsigned new_csum_type) |
753 | { |
754 | struct bio *bio = &op->wbio.bio; |
755 | struct bch_extent_crc_unpacked new_crc; |
756 | int ret; |
757 | |
758 | /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ |
759 | |
760 | if (bch2_csum_type_is_encryption(op->crc.csum_type) != |
761 | bch2_csum_type_is_encryption(new_csum_type)) |
762 | new_csum_type = op->crc.csum_type; |
763 | |
764 | ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, |
765 | NULL, &new_crc, |
766 | op->crc.offset, op->crc.live_size, |
767 | new_csum_type); |
768 | if (ret) |
769 | return ret; |
770 | |
771 | bio_advance(bio, op->crc.offset << 9); |
772 | bio->bi_iter.bi_size = op->crc.live_size << 9; |
773 | op->crc = new_crc; |
774 | return 0; |
775 | } |
776 | |
777 | static int bch2_write_decrypt(struct bch_write_op *op) |
778 | { |
779 | struct bch_fs *c = op->c; |
780 | struct nonce nonce = extent_nonce(op->version, op->crc); |
781 | struct bch_csum csum; |
782 | int ret; |
783 | |
784 | if (!bch2_csum_type_is_encryption(op->crc.csum_type)) |
785 | return 0; |
786 | |
787 | /* |
788 | * If we need to decrypt data in the write path, we'll no longer be able |
789 | * to verify the existing checksum (poly1305 mac, in this case) after |
790 | * it's decrypted - this is the last point we'll be able to reverify the |
791 | * checksum: |
792 | */ |
793 | csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); |
794 | if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) |
795 | return -EIO; |
796 | |
797 | ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); |
798 | op->crc.csum_type = 0; |
799 | op->crc.csum = (struct bch_csum) { 0, 0 }; |
800 | return ret; |
801 | } |
802 | |
803 | static enum prep_encoded_ret { |
804 | PREP_ENCODED_OK, |
805 | PREP_ENCODED_ERR, |
806 | PREP_ENCODED_CHECKSUM_ERR, |
807 | PREP_ENCODED_DO_WRITE, |
808 | } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) |
809 | { |
810 | struct bch_fs *c = op->c; |
811 | struct bio *bio = &op->wbio.bio; |
812 | |
813 | if (!(op->flags & BCH_WRITE_DATA_ENCODED)) |
814 | return PREP_ENCODED_OK; |
815 | |
816 | BUG_ON(bio_sectors(bio) != op->crc.compressed_size); |
817 | |
818 | /* Can we just write the entire extent as is? */ |
819 | if (op->crc.uncompressed_size == op->crc.live_size && |
820 | op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 && |
821 | op->crc.compressed_size <= wp->sectors_free && |
822 | (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || |
823 | op->incompressible)) { |
824 | if (!crc_is_compressed(op->crc) && |
825 | op->csum_type != op->crc.csum_type && |
826 | bch2_write_rechecksum(c, op, op->csum_type) && |
827 | !c->opts.no_data_io) |
828 | return PREP_ENCODED_CHECKSUM_ERR; |
829 | |
830 | return PREP_ENCODED_DO_WRITE; |
831 | } |
832 | |
833 | /* |
834 | * If the data is compressed and we couldn't write the entire extent as |
835 | * is, we have to decompress it: |
836 | */ |
837 | if (crc_is_compressed(op->crc)) { |
838 | struct bch_csum csum; |
839 | |
840 | if (bch2_write_decrypt(op)) |
841 | return PREP_ENCODED_CHECKSUM_ERR; |
842 | |
843 | /* Last point we can still verify checksum: */ |
844 | csum = bch2_checksum_bio(c, op->crc.csum_type, |
845 | extent_nonce(op->version, op->crc), |
846 | bio); |
847 | if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) |
848 | return PREP_ENCODED_CHECKSUM_ERR; |
849 | |
850 | if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) |
851 | return PREP_ENCODED_ERR; |
852 | } |
853 | |
854 | /* |
855 | * No longer have compressed data after this point - data might be |
856 | * encrypted: |
857 | */ |
858 | |
859 | /* |
860 | * If the data is checksummed and we're only writing a subset, |
861 | * rechecksum and adjust bio to point to currently live data: |
862 | */ |
863 | if ((op->crc.live_size != op->crc.uncompressed_size || |
864 | op->crc.csum_type != op->csum_type) && |
865 | bch2_write_rechecksum(c, op, op->csum_type) && |
866 | !c->opts.no_data_io) |
867 | return PREP_ENCODED_CHECKSUM_ERR; |
868 | |
869 | /* |
870 | * If we want to compress the data, it has to be decrypted: |
871 | */ |
872 | if ((op->compression_opt || |
873 | bch2_csum_type_is_encryption(op->crc.csum_type) != |
874 | bch2_csum_type_is_encryption(op->csum_type)) && |
875 | bch2_write_decrypt(op)) |
876 | return PREP_ENCODED_CHECKSUM_ERR; |
877 | |
878 | return PREP_ENCODED_OK; |
879 | } |
880 | |
881 | static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, |
882 | struct bio **_dst) |
883 | { |
884 | struct bch_fs *c = op->c; |
885 | struct bio *src = &op->wbio.bio, *dst = src; |
886 | struct bvec_iter saved_iter; |
887 | void *ec_buf; |
888 | unsigned total_output = 0, total_input = 0; |
889 | bool bounce = false; |
890 | bool page_alloc_failed = false; |
891 | int ret, more = 0; |
892 | |
893 | BUG_ON(!bio_sectors(src)); |
894 | |
895 | ec_buf = bch2_writepoint_ec_buf(c, wp); |
896 | |
897 | switch (bch2_write_prep_encoded_data(op, wp)) { |
898 | case PREP_ENCODED_OK: |
899 | break; |
900 | case PREP_ENCODED_ERR: |
901 | ret = -EIO; |
902 | goto err; |
903 | case PREP_ENCODED_CHECKSUM_ERR: |
904 | goto csum_err; |
905 | case PREP_ENCODED_DO_WRITE: |
906 | /* XXX look for bug here */ |
907 | if (ec_buf) { |
908 | dst = bch2_write_bio_alloc(c, wp, src, |
909 | &page_alloc_failed, |
910 | ec_buf); |
911 | bio_copy_data(dst, src); |
912 | bounce = true; |
913 | } |
914 | init_append_extent(op, wp, op->version, op->crc); |
915 | goto do_write; |
916 | } |
917 | |
918 | if (ec_buf || |
919 | op->compression_opt || |
920 | (op->csum_type && |
921 | !(op->flags & BCH_WRITE_PAGES_STABLE)) || |
922 | (bch2_csum_type_is_encryption(op->csum_type) && |
923 | !(op->flags & BCH_WRITE_PAGES_OWNED))) { |
924 | dst = bch2_write_bio_alloc(c, wp, src, |
925 | &page_alloc_failed, |
926 | ec_buf); |
927 | bounce = true; |
928 | } |
929 | |
930 | saved_iter = dst->bi_iter; |
931 | |
932 | do { |
933 | struct bch_extent_crc_unpacked crc = { 0 }; |
934 | struct bversion version = op->version; |
935 | size_t dst_len = 0, src_len = 0; |
936 | |
937 | if (page_alloc_failed && |
938 | dst->bi_iter.bi_size < (wp->sectors_free << 9) && |
939 | dst->bi_iter.bi_size < c->opts.encoded_extent_max) |
940 | break; |
941 | |
942 | BUG_ON(op->compression_opt && |
943 | (op->flags & BCH_WRITE_DATA_ENCODED) && |
944 | bch2_csum_type_is_encryption(op->crc.csum_type)); |
945 | BUG_ON(op->compression_opt && !bounce); |
946 | |
947 | crc.compression_type = op->incompressible |
948 | ? BCH_COMPRESSION_TYPE_incompressible |
949 | : op->compression_opt |
950 | ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, |
951 | op->compression_opt) |
952 | : 0; |
953 | if (!crc_is_compressed(crc)) { |
954 | dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); |
955 | dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); |
956 | |
957 | if (op->csum_type) |
958 | dst_len = min_t(unsigned, dst_len, |
959 | c->opts.encoded_extent_max); |
960 | |
961 | if (bounce) { |
962 | swap(dst->bi_iter.bi_size, dst_len); |
963 | bio_copy_data(dst, src); |
964 | swap(dst->bi_iter.bi_size, dst_len); |
965 | } |
966 | |
967 | src_len = dst_len; |
968 | } |
969 | |
970 | BUG_ON(!src_len || !dst_len); |
971 | |
972 | if (bch2_csum_type_is_encryption(op->csum_type)) { |
973 | if (bversion_zero(version)) { |
974 | version.lo = atomic64_inc_return(&c->key_version); |
975 | } else { |
976 | crc.nonce = op->nonce; |
977 | op->nonce += src_len >> 9; |
978 | } |
979 | } |
980 | |
981 | if ((op->flags & BCH_WRITE_DATA_ENCODED) && |
982 | !crc_is_compressed(crc) && |
983 | bch2_csum_type_is_encryption(op->crc.csum_type) == |
984 | bch2_csum_type_is_encryption(op->csum_type)) { |
985 | u8 compression_type = crc.compression_type; |
986 | u16 nonce = crc.nonce; |
987 | /* |
988 | * Note: when we're using rechecksum(), we need to be |
989 | * checksumming @src because it has all the data our |
990 | * existing checksum covers - if we bounced (because we |
991 | * were trying to compress), @dst will only have the |
992 | * part of the data the new checksum will cover. |
993 | * |
994 | * But normally we want to be checksumming post bounce, |
995 | * because part of the reason for bouncing is so the |
996 | * data can't be modified (by userspace) while it's in |
997 | * flight. |
998 | */ |
999 | if (bch2_rechecksum_bio(c, src, version, op->crc, |
1000 | &crc, &op->crc, |
1001 | src_len >> 9, |
1002 | bio_sectors(src) - (src_len >> 9), |
1003 | op->csum_type)) |
1004 | goto csum_err; |
1005 | /* |
1006 | * bch2_rechecksum_bio() sets compression_type on crc from op->crc, |
1007 | * this isn't always correct as sometimes we're changing |
1008 | * an extent from uncompressed to incompressible. |
1009 | */ |
1010 | crc.compression_type = compression_type; |
1011 | crc.nonce = nonce; |
1012 | } else { |
1013 | if ((op->flags & BCH_WRITE_DATA_ENCODED) && |
1014 | bch2_rechecksum_bio(c, src, version, op->crc, |
1015 | NULL, &op->crc, |
1016 | src_len >> 9, |
1017 | bio_sectors(src) - (src_len >> 9), |
1018 | op->crc.csum_type)) |
1019 | goto csum_err; |
1020 | |
1021 | crc.compressed_size = dst_len >> 9; |
1022 | crc.uncompressed_size = src_len >> 9; |
1023 | crc.live_size = src_len >> 9; |
1024 | |
1025 | swap(dst->bi_iter.bi_size, dst_len); |
1026 | ret = bch2_encrypt_bio(c, op->csum_type, |
1027 | extent_nonce(version, crc), dst); |
1028 | if (ret) |
1029 | goto err; |
1030 | |
1031 | crc.csum = bch2_checksum_bio(c, op->csum_type, |
1032 | extent_nonce(version, crc), dst); |
1033 | crc.csum_type = op->csum_type; |
1034 | swap(dst->bi_iter.bi_size, dst_len); |
1035 | } |
1036 | |
1037 | init_append_extent(op, wp, version, crc); |
1038 | |
1039 | if (dst != src) |
1040 | bio_advance(dst, dst_len); |
1041 | bio_advance(src, src_len); |
1042 | total_output += dst_len; |
1043 | total_input += src_len; |
1044 | } while (dst->bi_iter.bi_size && |
1045 | src->bi_iter.bi_size && |
1046 | wp->sectors_free && |
1047 | !bch2_keylist_realloc(&op->insert_keys, |
1048 | op->inline_keys, |
1049 | ARRAY_SIZE(op->inline_keys), |
1050 | BKEY_EXTENT_U64s_MAX)); |
1051 | |
1052 | more = src->bi_iter.bi_size != 0; |
1053 | |
1054 | dst->bi_iter = saved_iter; |
1055 | |
1056 | if (dst == src && more) { |
1057 | BUG_ON(total_output != total_input); |
1058 | |
1059 | dst = bio_split(src, total_input >> 9, |
1060 | GFP_NOFS, &c->bio_write); |
1061 | wbio_init(dst)->put_bio = true; |
1062 | /* copy WRITE_SYNC flag */ |
1063 | dst->bi_opf = src->bi_opf; |
1064 | } |
1065 | |
1066 | dst->bi_iter.bi_size = total_output; |
1067 | do_write: |
1068 | *_dst = dst; |
1069 | return more; |
1070 | csum_err: |
1071 | bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)" , |
1072 | op->flags & BCH_WRITE_MOVE ? "move" : "user" ); |
1073 | ret = -EIO; |
1074 | err: |
1075 | if (to_wbio(dst)->bounce) |
1076 | bch2_bio_free_pages_pool(c, dst); |
1077 | if (to_wbio(dst)->put_bio) |
1078 | bio_put(dst); |
1079 | |
1080 | return ret; |
1081 | } |
1082 | |
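/* nocow path: an extent can only be overwritten in place if it isn't compressed/checksummed or erasure coded, and already has enough replicas: */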
1083 | static bool bch2_extent_is_writeable(struct bch_write_op *op, |
1084 | struct bkey_s_c k) |
1085 | { |
1086 | struct bch_fs *c = op->c; |
1087 | struct bkey_s_c_extent e; |
1088 | struct extent_ptr_decoded p; |
1089 | const union bch_extent_entry *entry; |
1090 | unsigned replicas = 0; |
1091 | |
1092 | if (k.k->type != KEY_TYPE_extent) |
1093 | return false; |
1094 | |
1095 | e = bkey_s_c_to_extent(k); |
1096 | extent_for_each_ptr_decode(e, p, entry) { |
1097 | if (crc_is_encoded(p.crc) || p.has_ec) |
1098 | return false; |
1099 | |
1100 | replicas += bch2_extent_ptr_durability(c, &p); |
1101 | } |
1102 | |
1103 | return replicas >= op->opts.data_replicas; |
1104 | } |
1105 | |
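/* Release the per-bucket nocow locks taken for every pointer we wrote to: */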
1106 | static inline void bch2_nocow_write_unlock(struct bch_write_op *op) |
1107 | { |
1108 | struct bch_fs *c = op->c; |
1109 | |
1110 | for_each_keylist_key(&op->insert_keys, k) { |
1111 | struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); |
1112 | |
1113 | bkey_for_each_ptr(ptrs, ptr) |
1114 | bch2_bucket_nocow_unlock(&c->nocow_locks, |
1115 | PTR_BUCKET_POS(c, ptr), |
1116 | BUCKET_NOCOW_LOCK_UPDATE); |
1117 | } |
1118 | } |
1119 | |
1120 | static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, |
1121 | struct btree_iter *iter, |
1122 | struct bkey_i *orig, |
1123 | struct bkey_s_c k, |
1124 | u64 new_i_size) |
1125 | { |
1126 | if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { |
1127 | /* trace this */ |
1128 | return 0; |
1129 | } |
1130 | |
1131 | struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); |
1132 | int ret = PTR_ERR_OR_ZERO(new); |
1133 | if (ret) |
1134 | return ret; |
1135 | |
1136 | bch2_cut_front(bkey_start_pos(&orig->k), new); |
1137 | bch2_cut_back(orig->k.p, new); |
1138 | |
1139 | struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); |
1140 | bkey_for_each_ptr(ptrs, ptr) |
1141 | ptr->unwritten = 0; |
1142 | |
1143 | /* |
1144 | * Note that we're not calling bch2_subvol_get_snapshot() in this path - |
1145 | * that was done when we kicked off the write, and here it's important |
1146 | * that we update the extent that we wrote to - even if a snapshot has |
1147 | * since been created. The write is still outstanding, so we're ok |
1148 | * w.r.t. snapshot atomicity: |
1149 | */ |
1150 | return bch2_extent_update_i_size_sectors(trans, iter, |
1151 | min(new->k.p.offset << 9, new_i_size), 0) ?: |
1152 | bch2_trans_update(trans, iter, new, |
1153 | BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); |
1154 | } |
1155 | |
1156 | static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) |
1157 | { |
1158 | struct bch_fs *c = op->c; |
1159 | struct btree_trans *trans = bch2_trans_get(c); |
1160 | |
1161 | for_each_keylist_key(&op->insert_keys, orig) { |
1162 | int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, |
1163 | bkey_start_pos(&orig->k), orig->k.p, |
1164 | BTREE_ITER_INTENT, k, |
1165 | NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ |
1166 | bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); |
1167 | })); |
1168 | |
1169 | if (ret && !bch2_err_matches(ret, EROFS)) { |
1170 | struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); |
1171 | |
1172 | bch_err_inum_offset_ratelimited(c, |
1173 | insert->k.p.inode, insert->k.p.offset << 9, |
1174 | "%s write error while doing btree update: %s" , |
1175 | op->flags & BCH_WRITE_MOVE ? "move" : "user" , |
1176 | bch2_err_str(ret)); |
1177 | } |
1178 | |
1179 | if (ret) { |
1180 | op->error = ret; |
1181 | break; |
1182 | } |
1183 | } |
1184 | |
1185 | bch2_trans_put(trans); |
1186 | } |
1187 | |
1188 | static void __bch2_nocow_write_done(struct bch_write_op *op) |
1189 | { |
1190 | bch2_nocow_write_unlock(op); |
1191 | |
1192 | if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { |
1193 | op->error = -EIO; |
1194 | } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) |
1195 | bch2_nocow_write_convert_unwritten(op); |
1196 | } |
1197 | |
1198 | static CLOSURE_CALLBACK(bch2_nocow_write_done) |
1199 | { |
1200 | closure_type(op, struct bch_write_op, cl); |
1201 | |
1202 | __bch2_nocow_write_done(op); |
1203 | bch2_write_done(cl); |
1204 | } |
1205 | |
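/* Bucket we need a nocow lock on, plus the pointer gen we saw so a stale (reused) bucket can be detected after relocking: */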
1206 | struct bucket_to_lock { |
1207 | struct bpos b; |
1208 | unsigned gen; |
1209 | struct nocow_lock_bucket *l; |
1210 | }; |
1211 | |
1212 | static void bch2_nocow_write(struct bch_write_op *op) |
1213 | { |
1214 | struct bch_fs *c = op->c; |
1215 | struct btree_trans *trans; |
1216 | struct btree_iter iter; |
1217 | struct bkey_s_c k; |
1218 | DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; |
1219 | u32 snapshot; |
1220 | struct bucket_to_lock *stale_at; |
1221 | int ret; |
1222 | |
1223 | if (op->flags & BCH_WRITE_MOVE) |
1224 | return; |
1225 | |
1226 | darray_init(&buckets); |
1227 | trans = bch2_trans_get(c); |
1228 | retry: |
1229 | bch2_trans_begin(trans); |
1230 | |
1231 | ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot); |
1232 | if (unlikely(ret)) |
1233 | goto err; |
1234 | |
1235 | bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, |
1236 | SPOS(op->pos.inode, op->pos.offset, snapshot), |
1237 | BTREE_ITER_SLOTS); |
1238 | while (1) { |
1239 | struct bio *bio = &op->wbio.bio; |
1240 | |
1241 | buckets.nr = 0; |
1242 | |
1243 | k = bch2_btree_iter_peek_slot(&iter); |
1244 | ret = bkey_err(k); |
1245 | if (ret) |
1246 | break; |
1247 | |
1248 | /* fall back to normal cow write path? */ |
1249 | if (unlikely(k.k->p.snapshot != snapshot || |
1250 | !bch2_extent_is_writeable(op, k))) |
1251 | break; |
1252 | |
1253 | if (bch2_keylist_realloc(&op->insert_keys, |
1254 | op->inline_keys, |
1255 | ARRAY_SIZE(op->inline_keys), |
1256 | k.k->u64s)) |
1257 | break; |
1258 | |
1259 | /* Get iorefs before dropping btree locks: */ |
1260 | struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); |
1261 | bkey_for_each_ptr(ptrs, ptr) { |
1262 | struct bpos b = PTR_BUCKET_POS(c, ptr); |
1263 | struct nocow_lock_bucket *l = |
1264 | bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b)); |
1265 | prefetch(l); |
1266 | |
1267 | if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) |
1268 | goto err_get_ioref; |
1269 | |
1270 | /* XXX allocating memory with btree locks held - rare */ |
1271 | darray_push_gfp(&buckets, ((struct bucket_to_lock) { |
1272 | .b = b, .gen = ptr->gen, .l = l, |
1273 | }), GFP_KERNEL|__GFP_NOFAIL); |
1274 | |
1275 | if (ptr->unwritten) |
1276 | op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; |
1277 | } |
1278 | |
1279 | /* Unlock before taking nocow locks, doing IO: */ |
1280 | bkey_reassemble(op->insert_keys.top, k); |
1281 | bch2_trans_unlock(trans); |
1282 | |
1283 | bch2_cut_front(op->pos, op->insert_keys.top); |
1284 | if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) |
1285 | bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); |
1286 | |
1287 | darray_for_each(buckets, i) { |
1288 | struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode); |
1289 | |
1290 | __bch2_bucket_nocow_lock(&c->nocow_locks, i->l, |
1291 | bucket_to_u64(i->b), |
1292 | BUCKET_NOCOW_LOCK_UPDATE); |
1293 | |
1294 | rcu_read_lock(); |
1295 | bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen); |
1296 | rcu_read_unlock(); |
1297 | |
1298 | if (unlikely(stale)) { |
1299 | stale_at = i; |
1300 | goto err_bucket_stale; |
1301 | } |
1302 | } |
1303 | |
1304 | bio = &op->wbio.bio; |
1305 | if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { |
1306 | bio = bio_split(bio, k.k->p.offset - op->pos.offset, |
1307 | GFP_KERNEL, &c->bio_write); |
1308 | wbio_init(bio)->put_bio = true; |
1309 | bio->bi_opf = op->wbio.bio.bi_opf; |
1310 | } else { |
1311 | op->flags |= BCH_WRITE_DONE; |
1312 | } |
1313 | |
1314 | op->pos.offset += bio_sectors(bio); |
1315 | op->written += bio_sectors(bio); |
1316 | |
1317 | bio->bi_end_io = bch2_write_endio; |
1318 | bio->bi_private = &op->cl; |
1319 | bio->bi_opf |= REQ_OP_WRITE; |
1320 | closure_get(&op->cl); |
1321 | bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, |
1322 | op->insert_keys.top, true); |
1323 | |
1324 | bch2_keylist_push(&op->insert_keys); |
1325 | if (op->flags & BCH_WRITE_DONE) |
1326 | break; |
1327 | bch2_btree_iter_advance(&iter); |
1328 | } |
1329 | out: |
1330 | bch2_trans_iter_exit(trans, &iter); |
1331 | err: |
1332 | if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) |
1333 | goto retry; |
1334 | |
1335 | if (ret) { |
1336 | bch_err_inum_offset_ratelimited(c, |
1337 | op->pos.inode, op->pos.offset << 9, |
1338 | "%s: btree lookup error %s" , __func__, bch2_err_str(ret)); |
1339 | op->error = ret; |
1340 | op->flags |= BCH_WRITE_DONE; |
1341 | } |
1342 | |
1343 | bch2_trans_put(trans); |
1344 | darray_exit(&buckets); |
1345 | |
1346 | /* fallback to cow write path? */ |
1347 | if (!(op->flags & BCH_WRITE_DONE)) { |
1348 | closure_sync(&op->cl); |
1349 | __bch2_nocow_write_done(op); |
1350 | op->insert_keys.top = op->insert_keys.keys; |
1351 | } else if (op->flags & BCH_WRITE_SYNC) { |
1352 | closure_sync(&op->cl); |
1353 | bch2_nocow_write_done(&op->cl.work); |
1354 | } else { |
1355 | /* |
1356 | * XXX |
1357 | * needs to run out of process context because ei_quota_lock is |
1358 | * a mutex |
1359 | */ |
1360 | continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); |
1361 | } |
1362 | return; |
1363 | err_get_ioref: |
1364 | darray_for_each(buckets, i) |
1365 | percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref); |
1366 | |
1367 | /* Fall back to COW path: */ |
1368 | goto out; |
1369 | err_bucket_stale: |
1370 | darray_for_each(buckets, i) { |
1371 | bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE); |
1372 | if (i == stale_at) |
1373 | break; |
1374 | } |
1375 | |
1376 | /* We can retry this: */ |
1377 | ret = -BCH_ERR_transaction_restart; |
1378 | goto err_get_ioref; |
1379 | } |
1380 | |
1381 | static void __bch2_write(struct bch_write_op *op) |
1382 | { |
1383 | struct bch_fs *c = op->c; |
1384 | struct write_point *wp = NULL; |
1385 | struct bio *bio = NULL; |
1386 | unsigned nofs_flags; |
1387 | int ret; |
1388 | |
1389 | nofs_flags = memalloc_nofs_save(); |
1390 | |
1391 | if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { |
1392 | bch2_nocow_write(op); |
1393 | if (op->flags & BCH_WRITE_DONE) |
1394 | goto out_nofs_restore; |
1395 | } |
1396 | again: |
1397 | memset(&op->failed, 0, sizeof(op->failed)); |
1398 | |
1399 | do { |
1400 | struct bkey_i *key_to_write; |
1401 | unsigned key_to_write_offset = op->insert_keys.top_p - |
1402 | op->insert_keys.keys_p; |
1403 | |
1404 | /* +1 for possible cache device: */ |
1405 | if (op->open_buckets.nr + op->nr_replicas + 1 > |
1406 | ARRAY_SIZE(op->open_buckets.v)) |
1407 | break; |
1408 | |
1409 | if (bch2_keylist_realloc(&op->insert_keys, |
1410 | op->inline_keys, |
1411 | ARRAY_SIZE(op->inline_keys), |
1412 | BKEY_EXTENT_U64s_MAX)) |
1413 | break; |
1414 | |
1415 | /* |
1416 | * The copygc thread is now global, which means it's no longer |
1417 | * freeing up space on specific disks, which means that |
1418 | * allocations for specific disks may hang arbitrarily long: |
1419 | */ |
1420 | ret = bch2_trans_do(c, NULL, NULL, 0, |
1421 | bch2_alloc_sectors_start_trans(trans, |
1422 | op->target, |
1423 | op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), |
1424 | op->write_point, |
1425 | &op->devs_have, |
1426 | op->nr_replicas, |
1427 | op->nr_replicas_required, |
1428 | op->watermark, |
1429 | op->flags, |
1430 | (op->flags & (BCH_WRITE_ALLOC_NOWAIT| |
1431 | BCH_WRITE_ONLY_SPECIFIED_DEVS)) |
1432 | ? NULL : &op->cl, &wp)); |
1433 | if (unlikely(ret)) { |
1434 | if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) |
1435 | break; |
1436 | |
1437 | goto err; |
1438 | } |
1439 | |
1440 | EBUG_ON(!wp); |
1441 | |
1442 | bch2_open_bucket_get(c, wp, &op->open_buckets); |
1443 | ret = bch2_write_extent(op, wp, &bio); |
1444 | |
1445 | bch2_alloc_sectors_done_inlined(c, wp); |
1446 | err: |
1447 | if (ret <= 0) { |
1448 | op->flags |= BCH_WRITE_DONE; |
1449 | |
1450 | if (ret < 0) { |
1451 | if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) |
1452 | bch_err_inum_offset_ratelimited(c, |
1453 | op->pos.inode, |
1454 | op->pos.offset << 9, |
1455 | "%s(): %s error: %s" , __func__, |
1456 | op->flags & BCH_WRITE_MOVE ? "move" : "user" , |
1457 | bch2_err_str(ret)); |
1458 | op->error = ret; |
1459 | break; |
1460 | } |
1461 | } |
1462 | |
1463 | bio->bi_end_io = bch2_write_endio; |
1464 | bio->bi_private = &op->cl; |
1465 | bio->bi_opf |= REQ_OP_WRITE; |
1466 | |
1467 | closure_get(bio->bi_private); |
1468 | |
1469 | key_to_write = (void *) (op->insert_keys.keys_p + |
1470 | key_to_write_offset); |
1471 | |
1472 | bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, |
1473 | key_to_write, false); |
1474 | } while (ret); |
1475 | |
1476 | /* |
1477 | * Sync or no? |
1478 | * |
1479 | * If we're running asynchronously, we may still want to block |
1480 | * synchronously here if we weren't able to submit all of the IO at |
1481 | * once, as that signals backpressure to the caller. |
1482 | */ |
1483 | if ((op->flags & BCH_WRITE_SYNC) || |
1484 | (!(op->flags & BCH_WRITE_DONE) && |
1485 | !(op->flags & BCH_WRITE_IN_WORKER))) { |
1486 | closure_sync(&op->cl); |
1487 | __bch2_write_index(op); |
1488 | |
1489 | if (!(op->flags & BCH_WRITE_DONE)) |
1490 | goto again; |
1491 | bch2_write_done(&op->cl); |
1492 | } else { |
1493 | bch2_write_queue(op, wp); |
1494 | continue_at(&op->cl, bch2_write_index, NULL); |
1495 | } |
1496 | out_nofs_restore: |
1497 | memalloc_nofs_restore(nofs_flags); |
1498 | } |
1499 | |
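/* Small writes can be stored entirely in the btree as an inline_data key, avoiding a separate data write: */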
1500 | static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) |
1501 | { |
1502 | struct bio *bio = &op->wbio.bio; |
1503 | struct bvec_iter iter; |
1504 | struct bkey_i_inline_data *id; |
1505 | unsigned sectors; |
1506 | int ret; |
1507 | |
1508 | op->flags |= BCH_WRITE_WROTE_DATA_INLINE; |
1509 | op->flags |= BCH_WRITE_DONE; |
1510 | |
1511 | bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); |
1512 | |
1513 | ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, |
1514 | ARRAY_SIZE(op->inline_keys), |
1515 | BKEY_U64s + DIV_ROUND_UP(data_len, 8)); |
1516 | if (ret) { |
1517 | op->error = ret; |
1518 | goto err; |
1519 | } |
1520 | |
1521 | sectors = bio_sectors(bio); |
1522 | op->pos.offset += sectors; |
1523 | |
1524 | id = bkey_inline_data_init(op->insert_keys.top); |
1525 | id->k.p = op->pos; |
1526 | id->k.version = op->version; |
1527 | id->k.size = sectors; |
1528 | |
1529 | iter = bio->bi_iter; |
1530 | iter.bi_size = data_len; |
1531 | memcpy_from_bio(id->v.data, bio, iter); |
1532 | |
1533 | while (data_len & 7) |
1534 | id->v.data[data_len++] = '\0'; |
1535 | set_bkey_val_bytes(&id->k, data_len); |
1536 | bch2_keylist_push(&op->insert_keys); |
1537 | |
1538 | __bch2_write_index(op); |
1539 | err: |
1540 | bch2_write_done(&op->cl); |
1541 | } |
1542 | |
1543 | /** |
1544 | * bch2_write() - handle a write to a cache device or flash only volume |
1545 | * @cl: &bch_write_op->cl |
1546 | * |
1547 | * This is the starting point for any data to end up in a cache device; it could |
1548 | * be from a normal write, or a writeback write, or a write to a flash only |
1549 | * volume - it's also used by the moving garbage collector to compact data in |
1550 | * mostly empty buckets. |
1551 | * |
1552 | * It first writes the data to the cache, creating a list of keys to be inserted |
1553 | * (if the data won't fit in a single open bucket, there will be multiple keys); |
1554 | * after the data is written it calls bch_journal, and after the keys have been |
1555 | * added to the next journal write they're inserted into the btree. |
1556 | * |
1557 | * If op->discard is true, instead of inserting the data it invalidates the |
1558 | * region of the cache represented by op->bio and op->inode. |
1559 | */ |
1560 | CLOSURE_CALLBACK(bch2_write) |
1561 | { |
1562 | closure_type(op, struct bch_write_op, cl); |
1563 | struct bio *bio = &op->wbio.bio; |
1564 | struct bch_fs *c = op->c; |
1565 | unsigned data_len; |
1566 | |
1567 | EBUG_ON(op->cl.parent); |
1568 | BUG_ON(!op->nr_replicas); |
1569 | BUG_ON(!op->write_point.v); |
1570 | BUG_ON(bkey_eq(op->pos, POS_MAX)); |
1571 | |
1572 | op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); |
1573 | op->start_time = local_clock(); |
1574 | bch2_keylist_init(&op->insert_keys, op->inline_keys); |
1575 | wbio_init(bio)->put_bio = false; |
1576 | |
1577 | if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { |
1578 | bch_err_inum_offset_ratelimited(c, |
1579 | op->pos.inode, |
1580 | op->pos.offset << 9, |
1581 | "%s write error: misaligned write" , |
1582 | op->flags & BCH_WRITE_MOVE ? "move" : "user" ); |
1583 | op->error = -EIO; |
1584 | goto err; |
1585 | } |
1586 | |
1587 | if (c->opts.nochanges) { |
1588 | op->error = -BCH_ERR_erofs_no_writes; |
1589 | goto err; |
1590 | } |
1591 | |
1592 | if (!(op->flags & BCH_WRITE_MOVE) && |
1593 | !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { |
1594 | op->error = -BCH_ERR_erofs_no_writes; |
1595 | goto err; |
1596 | } |
1597 | |
1598 | this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); |
1599 | bch2_increment_clock(c, bio_sectors(bio), WRITE); |
1600 | |
1601 | data_len = min_t(u64, bio->bi_iter.bi_size, |
1602 | op->new_i_size - (op->pos.offset << 9)); |
1603 | |
1604 | if (c->opts.inline_data && |
1605 | data_len <= min(block_bytes(c) / 2, 1024U)) { |
1606 | bch2_write_data_inline(op, data_len); |
1607 | return; |
1608 | } |
1609 | |
1610 | __bch2_write(op); |
1611 | return; |
1612 | err: |
1613 | bch2_disk_reservation_put(c, &op->res); |
1614 | |
1615 | closure_debug_destroy(&op->cl); |
1616 | if (op->end_io) |
1617 | op->end_io(op); |
1618 | } |
1619 | |
1620 | static const char * const bch2_write_flags[] = { |
1621 | #define x(f) #f, |
1622 | BCH_WRITE_FLAGS() |
1623 | #undef x |
1624 | NULL |
1625 | }; |
1626 | |
1627 | void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) |
1628 | { |
1629 | prt_str(out, "pos: "); |
1630 | bch2_bpos_to_text(out, op->pos); |
1631 | prt_newline(out); |
1632 | printbuf_indent_add(out, 2); |
1633 | |
1634 | prt_str(out, "started: "); |
1635 | bch2_pr_time_units(out, local_clock() - op->start_time); |
1636 | prt_newline(out); |
1637 | |
1638 | prt_str(out, "flags: "); |
1639 | prt_bitflags(out, bch2_write_flags, op->flags); |
1640 | prt_newline(out); |
1641 | |
1642 | prt_printf(out, "ref: %u" , closure_nr_remaining(&op->cl)); |
1643 | prt_newline(out); |
1644 | |
1645 | printbuf_indent_sub(out, 2); |
1646 | } |
1647 | |
1648 | void bch2_fs_io_write_exit(struct bch_fs *c) |
1649 | { |
1650 | mempool_exit(&c->bio_bounce_pages); |
1651 | bioset_exit(&c->bio_write); |
1652 | } |
1653 | |
1654 | int bch2_fs_io_write_init(struct bch_fs *c) |
1655 | { |
1656 | if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), |
1657 | BIOSET_NEED_BVECS)) |
1658 | return -BCH_ERR_ENOMEM_bio_write_init; |
1659 | |
1660 | if (mempool_init_page_pool(&c->bio_bounce_pages, |
1661 | max_t(unsigned, |
1662 | c->opts.btree_node_size, |
1663 | c->opts.encoded_extent_max) / |
1664 | PAGE_SIZE, 0)) |
1665 | return -BCH_ERR_ENOMEM_bio_bounce_pages_init; |
1666 | |
1667 | return 0; |
1668 | } |
1669 | |