1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Copyright (c) 2000-2005 Silicon Graphics, Inc. |
4 | * All Rights Reserved. |
5 | */ |
6 | #include "xfs.h" |
7 | #include "xfs_fs.h" |
8 | #include "xfs_shared.h" |
9 | #include "xfs_format.h" |
10 | #include "xfs_log_format.h" |
11 | #include "xfs_trans_resv.h" |
12 | #include "xfs_sb.h" |
13 | #include "xfs_mount.h" |
14 | #include "xfs_trans.h" |
15 | #include "xfs_error.h" |
16 | #include "xfs_alloc.h" |
17 | #include "xfs_fsops.h" |
18 | #include "xfs_trans_space.h" |
19 | #include "xfs_log.h" |
20 | #include "xfs_log_priv.h" |
21 | #include "xfs_ag.h" |
22 | #include "xfs_ag_resv.h" |
23 | #include "xfs_trace.h" |
24 | |
25 | /* |
26 | * Write new AG headers to disk. Non-transactional, but need to be |
27 | * written and completed prior to the growfs transaction being logged. |
28 | * To do this, we use a delayed write buffer list and wait for |
29 | * submission and IO completion of the list as a whole. This allows the |
30 | * IO subsystem to merge all the AG headers in a single AG into a single |
31 | * IO and hide most of the latency of the IO from us. |
32 | * |
33 | * This also means that if we get an error whilst building the buffer |
34 | * list to write, we can cancel the entire list without having written |
35 | * anything. |
36 | */ |
37 | static int |
38 | xfs_resizefs_init_new_ags( |
39 | struct xfs_trans *tp, |
40 | struct aghdr_init_data *id, |
41 | xfs_agnumber_t oagcount, |
42 | xfs_agnumber_t nagcount, |
43 | xfs_rfsblock_t delta, |
44 | struct xfs_perag *last_pag, |
45 | bool *lastag_extended) |
46 | { |
47 | struct xfs_mount *mp = tp->t_mountp; |
48 | xfs_rfsblock_t nb = mp->m_sb.sb_dblocks + delta; |
49 | int error; |
50 | |
51 | *lastag_extended = false; |
52 | |
53 | INIT_LIST_HEAD(list: &id->buffer_list); |
54 | for (id->agno = nagcount - 1; |
55 | id->agno >= oagcount; |
56 | id->agno--, delta -= id->agsize) { |
57 | |
58 | if (id->agno == nagcount - 1) |
59 | id->agsize = nb - (id->agno * |
60 | (xfs_rfsblock_t)mp->m_sb.sb_agblocks); |
61 | else |
62 | id->agsize = mp->m_sb.sb_agblocks; |
63 | |
64 | error = xfs_ag_init_headers(mp, id); |
65 | if (error) { |
66 | xfs_buf_delwri_cancel(&id->buffer_list); |
67 | return error; |
68 | } |
69 | } |
70 | |
71 | error = xfs_buf_delwri_submit(&id->buffer_list); |
72 | if (error) |
73 | return error; |
74 | |
75 | if (delta) { |
76 | *lastag_extended = true; |
77 | error = xfs_ag_extend_space(last_pag, tp, delta); |
78 | } |
79 | return error; |
80 | } |
81 | |
82 | /* |
83 | * growfs operations |
84 | */ |
85 | static int |
86 | xfs_growfs_data_private( |
87 | struct xfs_mount *mp, /* mount point for filesystem */ |
88 | struct xfs_growfs_data *in) /* growfs data input struct */ |
89 | { |
90 | struct xfs_buf *bp; |
91 | int error; |
92 | xfs_agnumber_t nagcount; |
93 | xfs_agnumber_t nagimax = 0; |
94 | xfs_rfsblock_t nb, nb_div, nb_mod; |
95 | int64_t delta; |
96 | bool lastag_extended = false; |
97 | xfs_agnumber_t oagcount; |
98 | struct xfs_trans *tp; |
99 | struct aghdr_init_data id = {}; |
100 | struct xfs_perag *last_pag; |
101 | |
102 | nb = in->newblocks; |
103 | error = xfs_sb_validate_fsb_count(&mp->m_sb, nb); |
104 | if (error) |
105 | return error; |
106 | |
107 | if (nb > mp->m_sb.sb_dblocks) { |
108 | error = xfs_buf_read_uncached(mp->m_ddev_targp, |
109 | XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), |
110 | XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); |
111 | if (error) |
112 | return error; |
113 | xfs_buf_relse(bp); |
114 | } |
115 | |
116 | nb_div = nb; |
117 | nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks); |
118 | if (nb_mod && nb_mod >= XFS_MIN_AG_BLOCKS) |
119 | nb_div++; |
120 | else if (nb_mod) |
121 | nb = nb_div * mp->m_sb.sb_agblocks; |
122 | |
123 | if (nb_div > XFS_MAX_AGNUMBER + 1) { |
124 | nb_div = XFS_MAX_AGNUMBER + 1; |
125 | nb = nb_div * mp->m_sb.sb_agblocks; |
126 | } |
127 | nagcount = nb_div; |
128 | delta = nb - mp->m_sb.sb_dblocks; |
129 | /* |
130 | * Reject filesystems with a single AG because they are not |
131 | * supported, and reject a shrink operation that would cause a |
132 | * filesystem to become unsupported. |
133 | */ |
134 | if (delta < 0 && nagcount < 2) |
135 | return -EINVAL; |
136 | |
137 | /* No work to do */ |
138 | if (delta == 0) |
139 | return 0; |
140 | |
141 | oagcount = mp->m_sb.sb_agcount; |
142 | /* allocate the new per-ag structures */ |
143 | if (nagcount > oagcount) { |
144 | error = xfs_initialize_perag(mp, nagcount, nb, &nagimax); |
145 | if (error) |
146 | return error; |
147 | } else if (nagcount < oagcount) { |
148 | /* TODO: shrinking the entire AGs hasn't yet completed */ |
149 | return -EINVAL; |
150 | } |
151 | |
152 | if (delta > 0) |
153 | error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, |
154 | XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, |
155 | &tp); |
156 | else |
157 | error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_growdata, blocks: -delta, rtextents: 0, |
158 | flags: 0, tpp: &tp); |
159 | if (error) |
160 | goto out_free_unused_perag; |
161 | |
162 | last_pag = xfs_perag_get(mp, oagcount - 1); |
163 | if (delta > 0) { |
164 | error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount, |
165 | delta, last_pag, &lastag_extended); |
166 | } else { |
167 | xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK, |
168 | "EXPERIMENTAL online shrink feature in use. Use at your own risk!" ); |
169 | |
170 | error = xfs_ag_shrink_space(last_pag, &tp, -delta); |
171 | } |
172 | xfs_perag_put(last_pag); |
173 | if (error) |
174 | goto out_trans_cancel; |
175 | |
176 | /* |
177 | * Update changed superblock fields transactionally. These are not |
178 | * seen by the rest of the world until the transaction commit applies |
179 | * them atomically to the superblock. |
180 | */ |
181 | if (nagcount > oagcount) |
182 | xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); |
183 | if (delta) |
184 | xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, delta); |
185 | if (id.nfree) |
186 | xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree); |
187 | |
188 | /* |
189 | * Sync sb counters now to reflect the updated values. This is |
190 | * particularly important for shrink because the write verifier |
191 | * will fail if sb_fdblocks is ever larger than sb_dblocks. |
192 | */ |
193 | if (xfs_has_lazysbcount(mp)) |
194 | xfs_log_sb(tp); |
195 | |
196 | xfs_trans_set_sync(tp); |
197 | error = xfs_trans_commit(tp); |
198 | if (error) |
199 | return error; |
200 | |
201 | /* New allocation groups fully initialized, so update mount struct */ |
202 | if (nagimax) |
203 | mp->m_maxagi = nagimax; |
204 | xfs_set_low_space_thresholds(mp); |
205 | mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); |
206 | |
207 | if (delta > 0) { |
208 | /* |
209 | * If we expanded the last AG, free the per-AG reservation |
210 | * so we can reinitialize it with the new size. |
211 | */ |
212 | if (lastag_extended) { |
213 | struct xfs_perag *pag; |
214 | |
215 | pag = xfs_perag_get(mp, id.agno); |
216 | error = xfs_ag_resv_free(pag); |
217 | xfs_perag_put(pag); |
218 | if (error) |
219 | return error; |
220 | } |
221 | /* |
222 | * Reserve AG metadata blocks. ENOSPC here does not mean there |
223 | * was a growfs failure, just that there still isn't space for |
224 | * new user data after the grow has been run. |
225 | */ |
226 | error = xfs_fs_reserve_ag_blocks(mp); |
227 | if (error == -ENOSPC) |
228 | error = 0; |
229 | } |
230 | return error; |
231 | |
232 | out_trans_cancel: |
233 | xfs_trans_cancel(tp); |
234 | out_free_unused_perag: |
235 | if (nagcount > oagcount) |
236 | xfs_free_unused_perag_range(mp, oagcount, nagcount); |
237 | return error; |
238 | } |
239 | |
240 | static int |
241 | xfs_growfs_log_private( |
242 | struct xfs_mount *mp, /* mount point for filesystem */ |
243 | struct xfs_growfs_log *in) /* growfs log input struct */ |
244 | { |
245 | xfs_extlen_t nb; |
246 | |
247 | nb = in->newblocks; |
248 | if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES)) |
249 | return -EINVAL; |
250 | if (nb == mp->m_sb.sb_logblocks && |
251 | in->isint == (mp->m_sb.sb_logstart != 0)) |
252 | return -EINVAL; |
253 | /* |
254 | * Moving the log is hard, need new interfaces to sync |
255 | * the log first, hold off all activity while moving it. |
256 | * Can have shorter or longer log in the same space, |
257 | * or transform internal to external log or vice versa. |
258 | */ |
259 | return -ENOSYS; |
260 | } |
261 | |
262 | static int |
263 | xfs_growfs_imaxpct( |
264 | struct xfs_mount *mp, |
265 | __u32 imaxpct) |
266 | { |
267 | struct xfs_trans *tp; |
268 | int dpct; |
269 | int error; |
270 | |
271 | if (imaxpct > 100) |
272 | return -EINVAL; |
273 | |
274 | error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, |
275 | XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); |
276 | if (error) |
277 | return error; |
278 | |
279 | dpct = imaxpct - mp->m_sb.sb_imax_pct; |
280 | xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); |
281 | xfs_trans_set_sync(tp); |
282 | return xfs_trans_commit(tp); |
283 | } |
284 | |
285 | /* |
286 | * protected versions of growfs function acquire and release locks on the mount |
287 | * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG, |
288 | * XFS_IOC_FSGROWFSRT |
289 | */ |
290 | int |
291 | xfs_growfs_data( |
292 | struct xfs_mount *mp, |
293 | struct xfs_growfs_data *in) |
294 | { |
295 | int error = 0; |
296 | |
297 | if (!capable(CAP_SYS_ADMIN)) |
298 | return -EPERM; |
299 | if (!mutex_trylock(lock: &mp->m_growlock)) |
300 | return -EWOULDBLOCK; |
301 | |
302 | /* update imaxpct separately to the physical grow of the filesystem */ |
303 | if (in->imaxpct != mp->m_sb.sb_imax_pct) { |
304 | error = xfs_growfs_imaxpct(mp, imaxpct: in->imaxpct); |
305 | if (error) |
306 | goto out_error; |
307 | } |
308 | |
309 | if (in->newblocks != mp->m_sb.sb_dblocks) { |
310 | error = xfs_growfs_data_private(mp, in); |
311 | if (error) |
312 | goto out_error; |
313 | } |
314 | |
315 | /* Post growfs calculations needed to reflect new state in operations */ |
316 | if (mp->m_sb.sb_imax_pct) { |
317 | uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; |
318 | do_div(icount, 100); |
319 | M_IGEO(mp)->maxicount = XFS_FSB_TO_INO(mp, icount); |
320 | } else |
321 | M_IGEO(mp)->maxicount = 0; |
322 | |
323 | /* Update secondary superblocks now the physical grow has completed */ |
324 | error = xfs_update_secondary_sbs(mp); |
325 | |
326 | out_error: |
327 | /* |
328 | * Increment the generation unconditionally, the error could be from |
329 | * updating the secondary superblocks, in which case the new size |
330 | * is live already. |
331 | */ |
332 | mp->m_generation++; |
333 | mutex_unlock(lock: &mp->m_growlock); |
334 | return error; |
335 | } |
336 | |
337 | int |
338 | xfs_growfs_log( |
339 | xfs_mount_t *mp, |
340 | struct xfs_growfs_log *in) |
341 | { |
342 | int error; |
343 | |
344 | if (!capable(CAP_SYS_ADMIN)) |
345 | return -EPERM; |
346 | if (!mutex_trylock(lock: &mp->m_growlock)) |
347 | return -EWOULDBLOCK; |
348 | error = xfs_growfs_log_private(mp, in); |
349 | mutex_unlock(lock: &mp->m_growlock); |
350 | return error; |
351 | } |
352 | |
353 | /* |
354 | * Reserve the requested number of blocks if available. Otherwise return |
355 | * as many as possible to satisfy the request. The actual number |
356 | * reserved are returned in outval. |
357 | */ |
358 | int |
359 | xfs_reserve_blocks( |
360 | struct xfs_mount *mp, |
361 | uint64_t request) |
362 | { |
363 | int64_t lcounter, delta; |
364 | int64_t fdblks_delta = 0; |
365 | int64_t free; |
366 | int error = 0; |
367 | |
368 | /* |
369 | * With per-cpu counters, this becomes an interesting problem. we need |
370 | * to work out if we are freeing or allocation blocks first, then we can |
371 | * do the modification as necessary. |
372 | * |
373 | * We do this under the m_sb_lock so that if we are near ENOSPC, we will |
374 | * hold out any changes while we work out what to do. This means that |
375 | * the amount of free space can change while we do this, so we need to |
376 | * retry if we end up trying to reserve more space than is available. |
377 | */ |
378 | spin_lock(lock: &mp->m_sb_lock); |
379 | |
380 | /* |
381 | * If our previous reservation was larger than the current value, |
382 | * then move any unused blocks back to the free pool. Modify the resblks |
383 | * counters directly since we shouldn't have any problems unreserving |
384 | * space. |
385 | */ |
386 | if (mp->m_resblks > request) { |
387 | lcounter = mp->m_resblks_avail - request; |
388 | if (lcounter > 0) { /* release unused blocks */ |
389 | fdblks_delta = lcounter; |
390 | mp->m_resblks_avail -= lcounter; |
391 | } |
392 | mp->m_resblks = request; |
393 | if (fdblks_delta) { |
394 | spin_unlock(lock: &mp->m_sb_lock); |
395 | error = xfs_mod_fdblocks(mp, delta: fdblks_delta, reserved: 0); |
396 | spin_lock(lock: &mp->m_sb_lock); |
397 | } |
398 | |
399 | goto out; |
400 | } |
401 | |
402 | /* |
403 | * If the request is larger than the current reservation, reserve the |
404 | * blocks before we update the reserve counters. Sample m_fdblocks and |
405 | * perform a partial reservation if the request exceeds free space. |
406 | * |
407 | * The code below estimates how many blocks it can request from |
408 | * fdblocks to stash in the reserve pool. This is a classic TOCTOU |
409 | * race since fdblocks updates are not always coordinated via |
410 | * m_sb_lock. Set the reserve size even if there's not enough free |
411 | * space to fill it because mod_fdblocks will refill an undersized |
412 | * reserve when it can. |
413 | */ |
414 | free = percpu_counter_sum(fbc: &mp->m_fdblocks) - |
415 | xfs_fdblocks_unavailable(mp); |
416 | delta = request - mp->m_resblks; |
417 | mp->m_resblks = request; |
418 | if (delta > 0 && free > 0) { |
419 | /* |
420 | * We'll either succeed in getting space from the free block |
421 | * count or we'll get an ENOSPC. Don't set the reserved flag |
422 | * here - we don't want to reserve the extra reserve blocks |
423 | * from the reserve. |
424 | * |
425 | * The desired reserve size can change after we drop the lock. |
426 | * Use mod_fdblocks to put the space into the reserve or into |
427 | * fdblocks as appropriate. |
428 | */ |
429 | fdblks_delta = min(free, delta); |
430 | spin_unlock(lock: &mp->m_sb_lock); |
431 | error = xfs_mod_fdblocks(mp, delta: -fdblks_delta, reserved: 0); |
432 | if (!error) |
433 | xfs_mod_fdblocks(mp, delta: fdblks_delta, reserved: 0); |
434 | spin_lock(lock: &mp->m_sb_lock); |
435 | } |
436 | out: |
437 | spin_unlock(lock: &mp->m_sb_lock); |
438 | return error; |
439 | } |
440 | |
441 | int |
442 | xfs_fs_goingdown( |
443 | xfs_mount_t *mp, |
444 | uint32_t inflags) |
445 | { |
446 | switch (inflags) { |
447 | case XFS_FSOP_GOING_FLAGS_DEFAULT: { |
448 | if (!bdev_freeze(bdev: mp->m_super->s_bdev)) { |
449 | xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); |
450 | bdev_thaw(bdev: mp->m_super->s_bdev); |
451 | } |
452 | break; |
453 | } |
454 | case XFS_FSOP_GOING_FLAGS_LOGFLUSH: |
455 | xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); |
456 | break; |
457 | case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH: |
458 | xfs_force_shutdown(mp, |
459 | SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR); |
460 | break; |
461 | default: |
462 | return -EINVAL; |
463 | } |
464 | |
465 | return 0; |
466 | } |
467 | |
468 | /* |
469 | * Force a shutdown of the filesystem instantly while keeping the filesystem |
470 | * consistent. We don't do an unmount here; just shutdown the shop, make sure |
471 | * that absolutely nothing persistent happens to this filesystem after this |
472 | * point. |
473 | * |
474 | * The shutdown state change is atomic, resulting in the first and only the |
475 | * first shutdown call processing the shutdown. This means we only shutdown the |
476 | * log once as it requires, and we don't spam the logs when multiple concurrent |
477 | * shutdowns race to set the shutdown flags. |
478 | */ |
479 | void |
480 | xfs_do_force_shutdown( |
481 | struct xfs_mount *mp, |
482 | uint32_t flags, |
483 | char *fname, |
484 | int lnnum) |
485 | { |
486 | int tag; |
487 | const char *why; |
488 | |
489 | |
490 | if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, addr: &mp->m_opstate)) { |
491 | xlog_shutdown_wait(log: mp->m_log); |
492 | return; |
493 | } |
494 | if (mp->m_sb_bp) |
495 | mp->m_sb_bp->b_flags |= XBF_DONE; |
496 | |
497 | if (flags & SHUTDOWN_FORCE_UMOUNT) |
498 | xfs_alert(mp, "User initiated shutdown received." ); |
499 | |
500 | if (xlog_force_shutdown(log: mp->m_log, shutdown_flags: flags)) { |
501 | tag = XFS_PTAG_SHUTDOWN_LOGERROR; |
502 | why = "Log I/O Error" ; |
503 | } else if (flags & SHUTDOWN_CORRUPT_INCORE) { |
504 | tag = XFS_PTAG_SHUTDOWN_CORRUPT; |
505 | why = "Corruption of in-memory data" ; |
506 | } else if (flags & SHUTDOWN_CORRUPT_ONDISK) { |
507 | tag = XFS_PTAG_SHUTDOWN_CORRUPT; |
508 | why = "Corruption of on-disk metadata" ; |
509 | } else if (flags & SHUTDOWN_DEVICE_REMOVED) { |
510 | tag = XFS_PTAG_SHUTDOWN_IOERROR; |
511 | why = "Block device removal" ; |
512 | } else { |
513 | tag = XFS_PTAG_SHUTDOWN_IOERROR; |
514 | why = "Metadata I/O Error" ; |
515 | } |
516 | |
517 | trace_xfs_force_shutdown(mp, ptag: tag, flags, fname, line_num: lnnum); |
518 | |
519 | xfs_alert_tag(mp, tag, |
520 | "%s (0x%x) detected at %pS (%s:%d). Shutting down filesystem." , |
521 | why, flags, __return_address, fname, lnnum); |
522 | xfs_alert(mp, |
523 | "Please unmount the filesystem and rectify the problem(s)" ); |
524 | if (xfs_error_level >= XFS_ERRLEVEL_HIGH) |
525 | xfs_stack_trace(); |
526 | } |
527 | |
528 | /* |
529 | * Reserve free space for per-AG metadata. |
530 | */ |
531 | int |
532 | xfs_fs_reserve_ag_blocks( |
533 | struct xfs_mount *mp) |
534 | { |
535 | xfs_agnumber_t agno; |
536 | struct xfs_perag *pag; |
537 | int error = 0; |
538 | int err2; |
539 | |
540 | mp->m_finobt_nores = false; |
541 | for_each_perag(mp, agno, pag) { |
542 | err2 = xfs_ag_resv_init(pag, NULL); |
543 | if (err2 && !error) |
544 | error = err2; |
545 | } |
546 | |
547 | if (error && error != -ENOSPC) { |
548 | xfs_warn(mp, |
549 | "Error %d reserving per-AG metadata reserve pool." , error); |
550 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); |
551 | } |
552 | |
553 | return error; |
554 | } |
555 | |
556 | /* |
557 | * Free space reserved for per-AG metadata. |
558 | */ |
559 | int |
560 | xfs_fs_unreserve_ag_blocks( |
561 | struct xfs_mount *mp) |
562 | { |
563 | xfs_agnumber_t agno; |
564 | struct xfs_perag *pag; |
565 | int error = 0; |
566 | int err2; |
567 | |
568 | for_each_perag(mp, agno, pag) { |
569 | err2 = xfs_ag_resv_free(pag); |
570 | if (err2 && !error) |
571 | error = err2; |
572 | } |
573 | |
574 | if (error) |
575 | xfs_warn(mp, |
576 | "Error %d freeing per-AG metadata reserve pool." , error); |
577 | |
578 | return error; |
579 | } |
580 | |