| 1 | // SPDX-License-Identifier: GPL-2.0-or-later |
| 2 | /* |
| 3 | * Copyright (c) 2023-2024 Oracle. All Rights Reserved. |
| 4 | * Author: Darrick J. Wong <djwong@kernel.org> |
| 5 | */ |
| 6 | #include "xfs.h" |
| 7 | #include "xfs_fs.h" |
| 8 | #include "xfs_buf.h" |
| 9 | #include "xfs_buf_mem.h" |
| 10 | #include "xfs_trace.h" |
| 11 | #include <linux/shmem_fs.h> |
| 12 | #include "xfs_log_format.h" |
| 13 | #include "xfs_trans.h" |
| 14 | #include "xfs_buf_item.h" |
| 15 | #include "xfs_error.h" |
| 16 | |
| 17 | /* |
| 18 | * Buffer Cache for In-Memory Files |
| 19 | * ================================ |
| 20 | * |
| 21 | * Online fsck wants to create ephemeral ordered recordsets. The existing |
| 22 | * btree infrastructure can do this, but we need the buffer cache to target |
| 23 | * memory instead of block devices. |
| 24 | * |
| 25 | * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those |
| 26 | * requirements. Therefore, the xmbuf mechanism uses an unlinked shmem file to |
| 27 | * store our staging data. This file is not installed in the file descriptor |
| 28 | * table so that user programs cannot access the data, which means that the |
| 29 | * xmbuf must be freed with xmbuf_free. |
| 30 | * |
| 31 | * xmbufs assume that the caller will handle all required concurrency |
| 32 | * management; standard vfs locks (freezer and inode) are not taken. Reads |
| 33 | * and writes are satisfied directly from the page cache. |
| 34 | * |
| 35 | * The only supported block size is PAGE_SIZE, and we cannot use highmem. |
| 36 | */ |
| 37 | |
| 38 | /* |
| 39 | * shmem files used to back an in-memory buffer cache must not be exposed to |
| 40 | * userspace. Upper layers must coordinate access to the one handle returned |
| 41 | * by the constructor, so establish a separate lock class for xmbufs to avoid |
| 42 | * confusing lockdep. xmbuf_alloc assigns this class to the inode's i_rwsem. |
| 43 | */ |
| 44 | static struct lock_class_key xmbuf_i_mutex_key; |
| 45 | |
| 46 | /* |
| 47 | * Allocate a buffer cache target for a memory-backed file and set up the |
| 48 | * buffer target. |
| 49 | */ |
| 50 | int |
| 51 | xmbuf_alloc( |
| 52 | struct xfs_mount *mp, |
| 53 | const char *descr, |
| 54 | struct xfs_buftarg **btpp) |
| 55 | { |
| 56 | struct file *file; |
| 57 | struct inode *inode; |
| 58 | struct xfs_buftarg *btp; |
| 59 | int error; |
| 60 | |
| 61 | btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL); |
| 62 | if (!btp) |
| 63 | return -ENOMEM; |
| 64 | |
| 65 | file = shmem_kernel_file_setup(name: descr, size: 0, flags: 0); |
| 66 | if (IS_ERR(ptr: file)) { |
| 67 | error = PTR_ERR(ptr: file); |
| 68 | goto out_free_btp; |
| 69 | } |
| 70 | inode = file_inode(f: file); |
| 71 | |
| 72 | /* private file, private locking */ |
| 73 | lockdep_set_class(&inode->i_rwsem, &xmbuf_i_mutex_key); |
| 74 | |
| 75 | /* |
| 76 | * We don't want to bother with kmapping data during repair, so don't |
| 77 | * allow highmem folios to back this mapping. |
| 78 | */ |
| 79 | mapping_set_gfp_mask(m: inode->i_mapping, GFP_KERNEL); |
| 80 | |
| 81 | /* ensure all writes are below EOF to avoid pagecache zeroing */ |
| 82 | i_size_write(inode, i_size: inode->i_sb->s_maxbytes); |
| 83 | |
| 84 | error = xfs_buf_cache_init(bch: btp->bt_cache); |
| 85 | if (error) |
| 86 | goto out_file; |
| 87 | |
| 88 | /* Initialize buffer target */ |
| 89 | btp->bt_mount = mp; |
| 90 | btp->bt_dev = (dev_t)-1U; |
| 91 | btp->bt_bdev = NULL; /* in-memory buftargs have no bdev */ |
| 92 | btp->bt_file = file; |
| 93 | btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE; |
| 94 | btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1; |
| 95 | |
| 96 | error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr); |
| 97 | if (error) |
| 98 | goto out_bcache; |
| 99 | |
| 100 | trace_xmbuf_create(btp); |
| 101 | |
| 102 | *btpp = btp; |
| 103 | return 0; |
| 104 | |
| 105 | out_bcache: |
| 106 | xfs_buf_cache_destroy(bch: btp->bt_cache); |
| 107 | out_file: |
| 108 | fput(file); |
| 109 | out_free_btp: |
| 110 | kfree(objp: btp); |
| 111 | return error; |
| 112 | } |
| 113 | |
| 114 | /* Free a buffer cache target for a memory-backed buffer cache. */ |
| 115 | void |
| 116 | xmbuf_free( |
| 117 | struct xfs_buftarg *btp) |
| 118 | { |
| 119 | ASSERT(xfs_buftarg_is_mem(btp)); |
| 120 | ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); |
| 121 | |
| 122 | trace_xmbuf_free(btp); |
| 123 | |
| 124 | xfs_destroy_buftarg(btp); |
| 125 | xfs_buf_cache_destroy(bch: btp->bt_cache); |
| 126 | fput(btp->bt_file); |
| 127 | kfree(objp: btp); |
| 128 | } |
| 129 | |
| 130 | /* Directly map a shmem folio into the buffer cache. */ |
| 131 | int |
| 132 | xmbuf_map_backing_mem( |
| 133 | struct xfs_buf *bp) |
| 134 | { |
| 135 | struct inode *inode = file_inode(f: bp->b_target->bt_file); |
| 136 | struct folio *folio = NULL; |
| 137 | loff_t pos = BBTOB(xfs_buf_daddr(bp)); |
| 138 | int error; |
| 139 | |
| 140 | ASSERT(xfs_buftarg_is_mem(bp->b_target)); |
| 141 | |
| 142 | if (bp->b_map_count != 1) |
| 143 | return -ENOMEM; |
| 144 | if (BBTOB(bp->b_length) != XMBUF_BLOCKSIZE) |
| 145 | return -ENOMEM; |
| 146 | if (offset_in_page(pos) != 0) { |
| 147 | ASSERT(offset_in_page(pos)); |
| 148 | return -ENOMEM; |
| 149 | } |
| 150 | |
| 151 | error = shmem_get_folio(inode, index: pos >> PAGE_SHIFT, write_end: 0, foliop: &folio, sgp: SGP_CACHE); |
| 152 | if (error) |
| 153 | return error; |
| 154 | |
| 155 | if (filemap_check_wb_err(mapping: inode->i_mapping, since: 0)) { |
| 156 | folio_unlock(folio); |
| 157 | folio_put(folio); |
| 158 | return -EIO; |
| 159 | } |
| 160 | |
| 161 | /* |
| 162 | * Mark the folio dirty so that it won't be reclaimed once we drop the |
| 163 | * (potentially last) reference in xfs_buf_free. |
| 164 | */ |
| 165 | folio_set_dirty(folio); |
| 166 | folio_unlock(folio); |
| 167 | |
| 168 | bp->b_addr = folio_address(folio) + offset_in_folio(folio, pos); |
| 169 | return 0; |
| 170 | } |
| 171 | |
| 172 | /* Is this a valid daddr within the buftarg? */ |
| 173 | bool |
| 174 | xmbuf_verify_daddr( |
| 175 | struct xfs_buftarg *btp, |
| 176 | xfs_daddr_t daddr) |
| 177 | { |
| 178 | struct inode *inode = file_inode(f: btp->bt_file); |
| 179 | |
| 180 | ASSERT(xfs_buftarg_is_mem(btp)); |
| 181 | |
| 182 | return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT); |
| 183 | } |
| 184 | |
| 185 | /* Discard the folio backing this buffer. */ |
| 186 | static void |
| 187 | xmbuf_stale( |
| 188 | struct xfs_buf *bp) |
| 189 | { |
| 190 | struct inode *inode = file_inode(f: bp->b_target->bt_file); |
| 191 | loff_t pos; |
| 192 | |
| 193 | ASSERT(xfs_buftarg_is_mem(bp->b_target)); |
| 194 | |
| 195 | pos = BBTOB(xfs_buf_daddr(bp)); |
| 196 | shmem_truncate_range(inode, start: pos, end: pos + BBTOB(bp->b_length) - 1); |
| 197 | } |
| 198 | |
| 199 | /* |
| 200 | * Finalize a buffer -- discard the backing folio if it's stale, or run the |
| 201 | * write verifier to detect problems. |
| 202 | */ |
| 203 | int |
| 204 | xmbuf_finalize( |
| 205 | struct xfs_buf *bp) |
| 206 | { |
| 207 | xfs_failaddr_t fa; |
| 208 | int error = 0; |
| 209 | |
| 210 | if (bp->b_flags & XBF_STALE) { |
| 211 | xmbuf_stale(bp); |
| 212 | return 0; |
| 213 | } |
| 214 | |
| 215 | /* |
| 216 | * Although this btree is ephemeral, validate the buffer structure so |
| 217 | * that we can detect memory corruption errors and software bugs. |
| 218 | */ |
| 219 | fa = bp->b_ops->verify_struct(bp); |
| 220 | if (fa) { |
| 221 | error = -EFSCORRUPTED; |
| 222 | xfs_verifier_error(bp, error, fa); |
| 223 | } |
| 224 | |
| 225 | return error; |
| 226 | } |
| 227 | |
| 228 | /* |
| 229 | * Detach this xmbuf buffer from the transaction by any means necessary. |
| 230 | * All buffers are direct-mapped, so they do not need bwrite. |
| 231 | */ |
| 232 | void |
| 233 | xmbuf_trans_bdetach( |
| 234 | struct xfs_trans *tp, |
| 235 | struct xfs_buf *bp) |
| 236 | { |
| 237 | struct xfs_buf_log_item *bli = bp->b_log_item; |
| 238 | |
| 239 | ASSERT(bli != NULL); |
| 240 | |
| 241 | bli->bli_flags &= ~(XFS_BLI_DIRTY | XFS_BLI_ORDERED | |
| 242 | XFS_BLI_LOGGED | XFS_BLI_STALE); |
| 243 | clear_bit(XFS_LI_DIRTY, addr: &bli->bli_item.li_flags); |
| 244 | |
| 245 | while (bp->b_log_item != NULL) |
| 246 | xfs_trans_bdetach(tp, bp); |
| 247 | } |
| 248 | |