// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
 * Copyright 2019 Marvell. All rights reserved.
 */
#include <linux/xarray.h>
#include "uverbs.h"
#include "core_priv.h"

/**
 * rdma_umap_priv_init() - Initialize the private data of a vma
 *
 * @priv: The already allocated private data
 * @vma: The vm area struct that needs private data
 * @entry: entry into the mmap_xa that needs to be linked with
 *         this vma
 *
 * Each time we map IO memory into user space this keeps track of the
 * mapping. When the device is hot-unplugged we 'zap' the mmaps in user space
 * to point to the zero page and allow the hot unplug to proceed.
 *
 * This is necessary for cases like PCI physical hot unplug as the actual BAR
 * memory may vanish after this and access to it from userspace could MCE.
 *
 * RDMA drivers supporting disassociation must have their user space designed
 * to cope in some way with their IO pages going to the zero page.
 */
void rdma_umap_priv_init(struct rdma_umap_priv *priv,
			 struct vm_area_struct *vma,
			 struct rdma_user_mmap_entry *entry)
{
	struct ib_uverbs_file *ufile = vma->vm_file->private_data;

	priv->vma = vma;
	if (entry) {
		kref_get(&entry->ref);
		priv->entry = entry;
	}
	vma->vm_private_data = priv;
	/* vm_ops is setup in ib_uverbs_mmap() to avoid module dependencies */

	mutex_lock(&ufile->umap_lock);
	list_add(&priv->list, &ufile->umaps);
	mutex_unlock(&ufile->umap_lock);
}
EXPORT_SYMBOL(rdma_umap_priv_init);
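
/*
 * Usage sketch (illustrative only, not taken from an in-tree driver): a
 * driver that performs its own remap can still register the resulting VMA
 * for zap-on-disassociate. The names my_drv_mmap_page and my_entry are
 * hypothetical.
 *
 *	static int my_drv_mmap_page(struct ib_ucontext *ucontext,
 *				    struct vm_area_struct *vma,
 *				    struct rdma_user_mmap_entry *my_entry,
 *				    unsigned long pfn)
 *	{
 *		struct rdma_umap_priv *priv;
 *
 *		priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 *		if (!priv)
 *			return -ENOMEM;
 *		if (io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE,
 *				       vma->vm_page_prot)) {
 *			kfree(priv);
 *			return -EAGAIN;
 *		}
 *		// Track the VMA so disassociation can zap this mapping too
 *		rdma_umap_priv_init(priv, vma, my_entry);
 *		return 0;
 *	}
 */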

/**
 * rdma_user_mmap_io() - Map IO memory into a process
 *
 * @ucontext: associated user context
 * @vma: the vma related to the current mmap call
 * @pfn: pfn to map
 * @size: size to map
 * @prot: pgprot to use in the remap call
 * @entry: mmap_entry retrieved from rdma_user_mmap_entry_get(), or NULL
 *         if mmap_entry is not used by the driver
 *
 * This is to be called by drivers as part of their mmap() functions if they
 * wish to send something like PCI-E BAR memory to userspace.
 *
 * Return: -EINVAL on wrong flags or size, -EAGAIN on failure to map, 0 on
 * success.
 */
int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
		      unsigned long pfn, unsigned long size, pgprot_t prot,
		      struct rdma_user_mmap_entry *entry)
{
	struct ib_uverbs_file *ufile = ucontext->ufile;
	struct rdma_umap_priv *priv;

	if (!(vma->vm_flags & VM_SHARED))
		return -EINVAL;

	if (vma->vm_end - vma->vm_start != size)
		return -EINVAL;

	/* Driver is using this wrong, must be called by ib_uverbs_mmap */
	if (WARN_ON(!vma->vm_file ||
		    vma->vm_file->private_data != ufile))
		return -EINVAL;
	lockdep_assert_held(&ufile->device->disassociate_srcu);

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	vma->vm_page_prot = prot;
	if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
		kfree(priv);
		return -EAGAIN;
	}

	rdma_umap_priv_init(priv, vma, entry);
	return 0;
}
EXPORT_SYMBOL(rdma_user_mmap_io);
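
/*
 * Usage sketch (a minimal illustration; my_mmap and my_entry_pfn are
 * hypothetical driver names): a driver's ->mmap() method looks up the entry
 * for the user-supplied offset, maps the BAR page it describes, then drops
 * the lookup reference (the VMA holds its own reference via the umap priv).
 *
 *	static int my_mmap(struct ib_ucontext *ucontext,
 *			   struct vm_area_struct *vma)
 *	{
 *		struct rdma_user_mmap_entry *entry;
 *		int ret;
 *
 *		entry = rdma_user_mmap_entry_get(ucontext, vma);
 *		if (!entry)
 *			return -EINVAL;
 *
 *		ret = rdma_user_mmap_io(ucontext, vma, my_entry_pfn(entry),
 *					vma->vm_end - vma->vm_start,
 *					pgprot_noncached(vma->vm_page_prot),
 *					entry);
 *		rdma_user_mmap_entry_put(entry);
 *		return ret;
 *	}
 */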

/**
 * rdma_user_mmap_entry_get_pgoff() - Get an entry from the mmap_xa
 *
 * @ucontext: associated user context
 * @pgoff: The mmap offset >> PAGE_SHIFT
 *
 * This function is called when a user tries to mmap with an offset (returned
 * by rdma_user_mmap_get_offset()) it initially received from the driver. The
 * rdma_user_mmap_entry was created by the function
 * rdma_user_mmap_entry_insert(). This function increases the refcnt of the
 * entry so that it won't be deleted from the xarray in the meantime.
 *
 * Return: a reference to the entry if it exists, or NULL if there is no
 * match. rdma_user_mmap_entry_put() must be called to put the reference.
 */
struct rdma_user_mmap_entry *
rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext,
			       unsigned long pgoff)
{
	struct rdma_user_mmap_entry *entry;

	if (pgoff > U32_MAX)
		return NULL;

	xa_lock(&ucontext->mmap_xa);

	entry = xa_load(&ucontext->mmap_xa, pgoff);

	/*
	 * If the refcount is zero, the entry is already being deleted;
	 * driver_removed indicates that no further mmaps are possible and
	 * we are waiting for the active VMAs to be closed.
	 */
	if (!entry || entry->start_pgoff != pgoff || entry->driver_removed ||
	    !kref_get_unless_zero(&entry->ref))
		goto err;

	xa_unlock(&ucontext->mmap_xa);

	ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] returned\n",
		  pgoff, entry->npages);

	return entry;

err:
	xa_unlock(&ucontext->mmap_xa);
	return NULL;
}
EXPORT_SYMBOL(rdma_user_mmap_entry_get_pgoff);
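
/*
 * Usage sketch (illustrative; the size check mirrors what
 * rdma_user_mmap_entry_get() does): looking an entry up by raw pgoff when
 * the driver wants to validate the VMA itself.
 *
 *	entry = rdma_user_mmap_entry_get_pgoff(ucontext, vma->vm_pgoff);
 *	if (!entry)
 *		return -EINVAL;
 *	if (entry->npages * PAGE_SIZE != vma->vm_end - vma->vm_start) {
 *		rdma_user_mmap_entry_put(entry);
 *		return -EINVAL;
 *	}
 *	// ... perform the mapping, then drop the lookup reference ...
 *	rdma_user_mmap_entry_put(entry);
 */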

/**
 * rdma_user_mmap_entry_get() - Get an entry from the mmap_xa
 *
 * @ucontext: associated user context
 * @vma: the vma being mmap'd into
 *
 * This function is like rdma_user_mmap_entry_get_pgoff() except that it also
 * checks that the VMA is correct.
 */
struct rdma_user_mmap_entry *
rdma_user_mmap_entry_get(struct ib_ucontext *ucontext,
			 struct vm_area_struct *vma)
{
	struct rdma_user_mmap_entry *entry;

	if (!(vma->vm_flags & VM_SHARED))
		return NULL;
	entry = rdma_user_mmap_entry_get_pgoff(ucontext, vma->vm_pgoff);
	if (!entry)
		return NULL;
	if (entry->npages * PAGE_SIZE != vma->vm_end - vma->vm_start) {
		rdma_user_mmap_entry_put(entry);
		return NULL;
	}
	return entry;
}
EXPORT_SYMBOL(rdma_user_mmap_entry_get);
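
/*
 * Usage sketch (struct my_user_mmap_entry is a hypothetical driver type):
 * drivers typically embed struct rdma_user_mmap_entry in their own entry
 * structure and recover it with container_of() after a successful lookup.
 *
 *	struct my_user_mmap_entry {
 *		struct rdma_user_mmap_entry rdma_entry;
 *		u64 address;
 *		u8 mmap_flag;
 *	};
 *
 *	entry = rdma_user_mmap_entry_get(ucontext, vma);
 *	if (!entry)
 *		return -EINVAL;
 *	my = container_of(entry, struct my_user_mmap_entry, rdma_entry);
 *	// ... choose pfn/pgprot based on my->address and my->mmap_flag ...
 *	rdma_user_mmap_entry_put(entry);
 */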

static void rdma_user_mmap_entry_free(struct kref *kref)
{
	struct rdma_user_mmap_entry *entry =
		container_of(kref, struct rdma_user_mmap_entry, ref);
	struct ib_ucontext *ucontext = entry->ucontext;
	unsigned long i;

	/*
	 * Erase all xarray indices occupied by this single entry. This is
	 * deferred until all VMAs are closed so that the mmap offsets remain
	 * unique.
	 */
	xa_lock(&ucontext->mmap_xa);
	for (i = 0; i < entry->npages; i++)
		__xa_erase(&ucontext->mmap_xa, entry->start_pgoff + i);
	xa_unlock(&ucontext->mmap_xa);

	ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] removed\n",
		  entry->start_pgoff, entry->npages);

	if (ucontext->device->ops.mmap_free)
		ucontext->device->ops.mmap_free(entry);
}
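
/*
 * Sketch of a driver's ops.mmap_free callback (my_user_mmap_entry and
 * my_mmap_free are hypothetical names): since the core defers this call
 * until the last reference is gone, the callback can simply free the
 * embedding structure.
 *
 *	static void my_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
 *	{
 *		struct my_user_mmap_entry *my =
 *			container_of(rdma_entry, struct my_user_mmap_entry,
 *				     rdma_entry);
 *
 *		kfree(my);
 *	}
 */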

/**
 * rdma_user_mmap_entry_put() - Drop reference to the mmap entry
 *
 * @entry: an entry in the mmap_xa
 *
 * This function is called when the mapping is closed if it was
 * an io mapping, or when the driver is done with the entry for
 * some other reason.
 * It should be called after rdma_user_mmap_entry_get() once the
 * entry is no longer needed. This function will erase the entry
 * and free it if its refcnt reaches zero.
 */
void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry)
{
	kref_put(&entry->ref, rdma_user_mmap_entry_free);
}
EXPORT_SYMBOL(rdma_user_mmap_entry_put);

/**
 * rdma_user_mmap_entry_remove() - Drop reference to entry and
 *				   mark it as unmappable
 *
 * @entry: the entry to remove from the mmap_xa
 *
 * Drivers can call this to prevent userspace from creating more mappings for
 * the entry; however, existing mmaps continue to exist and ops->mmap_free()
 * will not be called until all user mmaps are destroyed.
 */
void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry)
{
	if (!entry)
		return;

	xa_lock(&entry->ucontext->mmap_xa);
	entry->driver_removed = true;
	xa_unlock(&entry->ucontext->mmap_xa);
	kref_put(&entry->ref, rdma_user_mmap_entry_free);
}
EXPORT_SYMBOL(rdma_user_mmap_entry_remove);
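
/*
 * Usage sketch (qp and db_mmap_entry are hypothetical driver fields): a
 * typical teardown path revokes the entry when the object that owns the
 * mapping is destroyed; ops.mmap_free runs once the last user mmap is gone.
 *
 *	rdma_user_mmap_entry_remove(qp->db_mmap_entry);
 *	qp->db_mmap_entry = NULL;
 */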

/**
 * rdma_user_mmap_entry_insert_range() - Insert an entry to the mmap_xa
 *				         in a given range
 *
 * @ucontext: associated user context.
 * @entry: the entry to insert into the mmap_xa
 * @length: length of the address that will be mmapped
 * @min_pgoff: minimum pgoff to be returned
 * @max_pgoff: maximum pgoff to be returned
 *
 * This function should be called by drivers that use the rdma_user_mmap
 * interface for implementing their mmap syscall. A database of mmap offsets
 * is handled in the core and helper functions are provided to insert entries
 * into the database and extract entries when the user calls mmap with the
 * given offset. The function allocates a unique page offset in the given
 * range that should be provided to the user; the user will use the offset to
 * retrieve information such as the address to be mapped and how.
 *
 * Return: 0 on success and -ENOMEM on failure
 */
int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext,
				      struct rdma_user_mmap_entry *entry,
				      size_t length, u32 min_pgoff,
				      u32 max_pgoff)
{
	struct ib_uverbs_file *ufile = ucontext->ufile;
	XA_STATE(xas, &ucontext->mmap_xa, min_pgoff);
	u32 xa_first, xa_last, npages;
	int err;
	u32 i;

	if (!entry)
		return -EINVAL;

	kref_init(&entry->ref);
	entry->ucontext = ucontext;

	/*
	 * We want the whole allocation to be done without interruption from a
	 * different thread. The allocation requires finding a free range and
	 * storing. During the xa_insert the lock could be released, possibly
	 * allowing another thread to choose the same range.
	 */
	mutex_lock(&ufile->umap_lock);

	xa_lock(&ucontext->mmap_xa);

	/* We want to find an empty range */
	npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE);
	entry->npages = npages;
	while (true) {
		/* First find an empty index */
		xas_find_marked(&xas, max_pgoff, XA_FREE_MARK);
		if (xas.xa_node == XAS_RESTART)
			goto err_unlock;

		xa_first = xas.xa_index;

		/* Is there enough room to have the range? */
		if (check_add_overflow(xa_first, npages, &xa_last))
			goto err_unlock;

		/*
		 * Now look for the next present entry. If an entry doesn't
		 * exist, we found an empty range and can proceed.
		 */
		xas_next_entry(&xas, xa_last - 1);
		if (xas.xa_node == XAS_BOUNDS || xas.xa_index >= xa_last)
			break;
	}

	for (i = xa_first; i < xa_last; i++) {
		err = __xa_insert(&ucontext->mmap_xa, i, entry, GFP_KERNEL);
		if (err)
			goto err_undo;
	}

	/*
	 * Internally the kernel uses a page offset, in libc this is a byte
	 * offset. Drivers should not return pgoff to userspace.
	 */
	entry->start_pgoff = xa_first;
	xa_unlock(&ucontext->mmap_xa);
	mutex_unlock(&ufile->umap_lock);

	ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#x] inserted\n",
		  entry->start_pgoff, npages);

	return 0;

err_undo:
	for (; i > xa_first; i--)
		__xa_erase(&ucontext->mmap_xa, i - 1);

err_unlock:
	xa_unlock(&ucontext->mmap_xa);
	mutex_unlock(&ufile->umap_lock);
	return -ENOMEM;
}
EXPORT_SYMBOL(rdma_user_mmap_entry_insert_range);
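
/*
 * Usage sketch (MY_DB_PGOFF_FIRST and MY_DB_PGOFF_LAST are hypothetical
 * constants): a driver can carve out distinct pgoff ranges per mapping type
 * so the offset itself encodes what is being mapped.
 *
 *	err = rdma_user_mmap_entry_insert_range(ucontext, &my->rdma_entry,
 *						PAGE_SIZE,
 *						MY_DB_PGOFF_FIRST,
 *						MY_DB_PGOFF_LAST);
 *	if (err)
 *		return err;
 */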

/**
 * rdma_user_mmap_entry_insert() - Insert an entry to the mmap_xa
 *
 * @ucontext: associated user context.
 * @entry: the entry to insert into the mmap_xa
 * @length: length of the address that will be mmapped
 *
 * This function should be called by drivers that use the rdma_user_mmap
 * interface for handling user mmapped addresses. The database is handled in
 * the core and helper functions are provided to insert entries into the
 * database and extract entries when the user calls mmap with the given
 * offset. The function allocates a unique page offset that should be
 * provided to the user; the user will use the offset to retrieve information
 * such as the address to be mapped and how.
 *
 * Return: 0 on success and -ENOMEM on failure
 */

int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
				struct rdma_user_mmap_entry *entry,
				size_t length)
{
	return rdma_user_mmap_entry_insert_range(ucontext, entry, length, 0,
						 U32_MAX);
}
EXPORT_SYMBOL(rdma_user_mmap_entry_insert);
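
/*
 * Usage sketch (my_user_mmap_entry and resp are hypothetical): allocate the
 * embedding structure, insert it, and hand the byte offset back to userspace
 * with rdma_user_mmap_get_offset().
 *
 *	my = kzalloc(sizeof(*my), GFP_KERNEL);
 *	if (!my)
 *		return -ENOMEM;
 *	err = rdma_user_mmap_entry_insert(ucontext, &my->rdma_entry,
 *					  PAGE_SIZE);
 *	if (err) {
 *		kfree(my);
 *		return err;
 *	}
 *	resp->mmap_offset = rdma_user_mmap_get_offset(&my->rdma_entry);
 */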