// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/trace.h"
#include <linux/shmem_fs.h>

/*
 * Swappable Temporary Memory
 * ==========================
 *
 * Online checking sometimes needs to be able to stage a large amount of data
 * in memory. This information might not fit in the available memory and it
 * doesn't all need to be accessible at all times. In other words, we want an
 * indexed data buffer to store data that can be paged out.
 *
 * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
 * requirements. Therefore, the xfile mechanism uses an unlinked shmem file to
 * store our staging data. This file is not installed in the file descriptor
 * table so that user programs cannot access the data, which means that the
 * xfile must be freed with xfile_destroy.
 *
 * xfiles assume that the caller will handle all required concurrency
 * management; standard vfs locks (freezer and inode) are not taken. Reads
 * and writes are satisfied directly from the page cache.
 */
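
/*
 * Example (an illustrative sketch, not part of this file): a hypothetical
 * caller stages a fixed-size record in an xfile and reads it back.  The
 * record type "struct xexample_rec" and the offset math are assumptions
 * for illustration only; real users such as the xfarray code included
 * above layer structured records on top of this interface.
 *
 *	struct xexample_rec	rec = { .magic = 123 };
 *	struct xfile		*xf;
 *	int			error;
 *
 *	error = xfile_create("example staging", 0, &xf);
 *	if (error)
 *		return error;
 *
 *	error = xfile_store(xf, &rec, sizeof(rec), 5 * sizeof(rec));
 *	if (!error)
 *		error = xfile_load(xf, &rec, sizeof(rec), 5 * sizeof(rec));
 *
 *	xfile_destroy(xf);
 *	return error;
 */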

/*
 * xfiles must not be exposed to userspace and require upper layers to
 * coordinate access to the one handle returned by the constructor, so
 * establish a separate lock class for xfiles to avoid confusing lockdep.
 */
static struct lock_class_key xfile_i_mutex_key;

/*
 * Create an xfile of the given size.  The description will be used in the
 * trace output.
 */
int
xfile_create(
	const char		*description,
	loff_t			isize,
	struct xfile		**xfilep)
{
	struct inode		*inode;
	struct xfile		*xf;
	int			error;

	xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
	if (!xf)
		return -ENOMEM;

	xf->file = shmem_kernel_file_setup(description, isize, VM_NORESERVE);
	if (IS_ERR(xf->file)) {
		error = PTR_ERR(xf->file);
		goto out_xfile;
	}

	inode = file_inode(xf->file);
	lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);

	/*
	 * We don't want to bother with kmapping data during repair, so don't
	 * allow highmem pages to back this mapping.
	 */
	mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);

	trace_xfile_create(xf);

	*xfilep = xf;
	return 0;
out_xfile:
	kfree(xf);
	return error;
}

/* Close the file and release all resources. */
void
xfile_destroy(
	struct xfile		*xf)
{
	struct inode		*inode = file_inode(xf->file);

	trace_xfile_destroy(xf);

	lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
	fput(xf->file);
	kfree(xf);
}

/*
 * Load an object.  Since we're treating this file as "memory", any error or
 * short IO is treated as a failure to allocate memory.
 */
int
xfile_load(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	unsigned int		pflags;

	if (count > MAX_RW_COUNT)
		return -ENOMEM;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -ENOMEM;

	trace_xfile_load(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		struct folio	*folio;
		unsigned int	len;
		unsigned int	offset;

		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
				SGP_READ) < 0)
			break;
		if (!folio) {
			/*
			 * No data stored at this offset, just zero the output
			 * buffer until the next page boundary.
			 */
			len = min_t(ssize_t, count,
					PAGE_SIZE - offset_in_page(pos));
			memset(buf, 0, len);
		} else {
			if (filemap_check_wb_err(inode->i_mapping, 0)) {
				folio_unlock(folio);
				folio_put(folio);
				break;
			}

			offset = offset_in_folio(folio, pos);
			len = min_t(ssize_t, count, folio_size(folio) - offset);
			memcpy(buf, folio_address(folio) + offset, len);

			folio_unlock(folio);
			folio_put(folio);
		}
		count -= len;
		pos += len;
		buf += len;
	}
	memalloc_nofs_restore(pflags);

	if (count)
		return -ENOMEM;
	return 0;
}

/*
 * Store an object.  Since we're treating this file as "memory", any error or
 * short IO is treated as a failure to allocate memory.
 */
int
xfile_store(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	unsigned int		pflags;

	if (count > MAX_RW_COUNT)
		return -ENOMEM;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -ENOMEM;

	trace_xfile_store(xf, pos, count);

	/*
	 * Increase the file size first so that shmem_get_folio(..., SGP_CACHE)
	 * actually allocates a folio instead of erroring out.
	 */
	if (pos + count > i_size_read(inode))
		i_size_write(inode, pos + count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		struct folio	*folio;
		unsigned int	len;
		unsigned int	offset;

		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
				SGP_CACHE) < 0)
			break;
		if (filemap_check_wb_err(inode->i_mapping, 0)) {
			folio_unlock(folio);
			folio_put(folio);
			break;
		}

		offset = offset_in_folio(folio, pos);
		len = min_t(ssize_t, count, folio_size(folio) - offset);
		memcpy(folio_address(folio) + offset, buf, len);

		folio_mark_dirty(folio);
		folio_unlock(folio);
		folio_put(folio);

		count -= len;
		pos += len;
		buf += len;
	}
	memalloc_nofs_restore(pflags);

	if (count)
		return -ENOMEM;
	return 0;
}
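
/*
 * Illustrative sketch (assuming a freshly created, empty xfile "xf"):
 * loading from a hole allocates nothing and simply zero-fills the
 * caller's buffer, while storing to the same range allocates folios and
 * extends the file size.
 *
 *	char	data[16];
 *	int	error;
 *
 *	error = xfile_load(xf, data, sizeof(data), 4096);
 *	// data[] is now all zeroes; no folio was allocated
 *
 *	error = xfile_store(xf, data, sizeof(data), 4096);
 *	// the file size now covers at least bytes 0..4111
 */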

/* Find the next written area in the xfile data for a given offset. */
loff_t
xfile_seek_data(
	struct xfile		*xf,
	loff_t			pos)
{
	loff_t			ret;

	ret = vfs_llseek(xf->file, pos, SEEK_DATA);
	trace_xfile_seek_data(xf, pos, ret);
	return ret;
}
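
/*
 * Illustrative sketch: walking only the written regions of a sparse
 * xfile with xfile_seek_data.  SEEK_DATA returns a negative errno (e.g.
 * -ENXIO) once there is no more data past @pos, which ends the loop.
 * The helper process_data_at() and the PAGE_SIZE stride are assumptions
 * for this example.
 *
 *	loff_t	pos = 0;
 *
 *	while ((pos = xfile_seek_data(xf, pos)) >= 0) {
 *		process_data_at(xf, pos);
 *		pos = round_up(pos + 1, PAGE_SIZE);
 *	}
 */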

/*
 * Grab the (locked) folio for a memory object.  The object cannot span a folio
 * boundary.  Returns the locked folio if successful, NULL if there was no
 * folio or it didn't cover the range requested, or an ERR_PTR on failure.
 */
struct folio *
xfile_get_folio(
	struct xfile		*xf,
	loff_t			pos,
	size_t			len,
	unsigned int		flags)
{
	struct inode		*inode = file_inode(xf->file);
	struct folio		*folio = NULL;
	unsigned int		pflags;
	int			error;

	if (inode->i_sb->s_maxbytes - pos < len)
		return ERR_PTR(-ENOMEM);

	trace_xfile_get_folio(xf, pos, len);

	/*
	 * Increase the file size first so that shmem_get_folio(..., SGP_CACHE)
	 * actually allocates a folio instead of erroring out.
	 */
	if ((flags & XFILE_ALLOC) && pos + len > i_size_read(inode))
		i_size_write(inode, pos + len);

	pflags = memalloc_nofs_save();
	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
			(flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
	memalloc_nofs_restore(pflags);
	if (error)
		return ERR_PTR(error);

	if (!folio)
		return NULL;

	if (len > folio_size(folio) - offset_in_folio(folio, pos)) {
		folio_unlock(folio);
		folio_put(folio);
		return NULL;
	}

	if (filemap_check_wb_err(inode->i_mapping, 0)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-EIO);
	}

	/*
	 * Mark the folio dirty so that it won't be reclaimed once we drop the
	 * (potentially last) reference in xfile_put_folio.
	 */
	if (flags & XFILE_ALLOC)
		folio_mark_dirty(folio);
	return folio;
}

/*
 * Release the (locked) folio for a memory object.
 */
void
xfile_put_folio(
	struct xfile		*xf,
	struct folio		*folio)
{
	trace_xfile_put_folio(xf, folio_pos(folio), folio_size(folio));

	folio_unlock(folio);
	folio_put(folio);
}
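
/*
 * Illustrative sketch: mapping an object in place with
 * xfile_get_folio/xfile_put_folio instead of copying it through
 * xfile_load/xfile_store.  "struct xexample_rec" is the same
 * hypothetical record type as in the example at the top of this file;
 * note the three-way return contract: ERR_PTR on failure, NULL when no
 * folio covers the requested range, and a locked folio on success.
 *
 *	struct folio		*folio;
 *	struct xexample_rec	*p;
 *
 *	folio = xfile_get_folio(xf, pos, sizeof(*p), XFILE_ALLOC);
 *	if (IS_ERR(folio))
 *		return PTR_ERR(folio);
 *	if (!folio)
 *		return -ENOMEM;
 *
 *	p = folio_address(folio) + offset_in_folio(folio, pos);
 *	p->magic = 456;
 *	xfile_put_folio(xf, folio);
 */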

/* Discard the page cache that's backing a range of the xfile. */
void
xfile_discard(
	struct xfile		*xf,
	loff_t			pos,
	u64			count)
{
	trace_xfile_discard(xf, pos, count);

	shmem_truncate_range(file_inode(xf->file), pos, pos + count - 1);
}