// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Devmem TCP
 *
 *	Authors:	Mina Almasry <almasrymina@google.com>
 *			Willem de Bruijn <willemdebruijn.kernel@gmail.com>
 *			Kaiyuan Zhang <kaiyuanz@google.com>
 */

#include <linux/dma-buf.h>
#include <linux/genalloc.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/types.h>
#include <net/netdev_queues.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <trace/events/page_pool.h>

#include "devmem.h"
#include "mp_dmabuf_devmem.h"
#include "page_pool_priv.h"

/* Device memory support */

static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);

static const struct memory_provider_ops dmabuf_devmem_ops;

bool net_is_devmem_iov(struct net_iov *niov)
{
	return niov->type == NET_IOV_DMABUF;
}

static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
						struct gen_pool_chunk *chunk,
						void *not_used)
{
	struct dmabuf_genpool_chunk_owner *owner = chunk->owner;

	kvfree(owner->area.niovs);
	kfree(owner);
}

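/* Recover the device DMA address backing a net_iov: the chunk owner records
 * the base DMA address of its scatterlist entry, and the niov's index within
 * the owner selects the PAGE_SIZE slot inside that entry
 * (dma = base_dma_addr + idx << PAGE_SHIFT).
 */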
static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
{
	struct dmabuf_genpool_chunk_owner *owner;

	owner = net_devmem_iov_to_chunk_owner(niov);
	return owner->base_dma_addr +
	       ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
}

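/* Deferred destructor for a dma-buf binding. Runs from the unbind work item
 * once the last reference is gone: it frees every chunk owner, destroys the
 * genpool (warning if allocations are still outstanding), then unmaps and
 * detaches the dma-buf before freeing the binding itself.
 */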
void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
{
	struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding),
								 unbind_w);
	size_t size, avail;

	gen_pool_for_each_chunk(binding->chunk_pool,
				net_devmem_dmabuf_free_chunk_owner, NULL);

	size = gen_pool_size(binding->chunk_pool);
	avail = gen_pool_avail(binding->chunk_pool);

	if (!WARN(size != avail, "can't destroy genpool. size=%zu, avail=%zu",
		  size, avail))
		gen_pool_destroy(binding->chunk_pool);

	dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
					  binding->direction);
	dma_buf_detach(binding->dmabuf, binding->attachment);
	dma_buf_put(binding->dmabuf);
	xa_destroy(&binding->bound_rxqs);
	kvfree(binding->tx_vec);
	kfree(binding);
}

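/* Carve one PAGE_SIZE slice out of the binding's genpool and return the
 * net_iov covering it, with its page_pool state (pp, pp_magic, refcount)
 * reset so the memory provider can hand it out as a fresh netmem.
 */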
struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
{
	struct dmabuf_genpool_chunk_owner *owner;
	unsigned long dma_addr;
	struct net_iov *niov;
	ssize_t offset;
	ssize_t index;

	dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE,
					(void **)&owner);
	if (!dma_addr)
		return NULL;

	offset = dma_addr - owner->base_dma_addr;
	index = offset / PAGE_SIZE;
	niov = &owner->area.niovs[index];

	niov->desc.pp_magic = 0;
	niov->desc.pp = NULL;
	atomic_long_set(&niov->desc.pp_ref_count, 0);

	return niov;
}

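/* Return a net_iov's PAGE_SIZE slice to the binding's genpool. The address is
 * sanity-checked against the pool first; freeing an address the pool does not
 * own would corrupt the allocator.
 */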
void net_devmem_free_dmabuf(struct net_iov *niov)
{
	struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov);
	unsigned long dma_addr = net_devmem_get_dma_addr(niov);

	if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr,
				       PAGE_SIZE)))
		return;

	gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE);
}

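/* Tear down a binding from the control-plane side: remove it from the global
 * xarray so TX lookups can no longer find it, close every RX queue still
 * using it as a memory provider, and drop the caller's reference. The memory
 * itself is released later by __net_devmem_dmabuf_binding_free() once the
 * last reference goes away.
 */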
void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
{
	struct netdev_rx_queue *rxq;
	unsigned long xa_idx;
	unsigned int rxq_idx;

	xa_erase(&net_devmem_dmabuf_bindings, binding->id);

	/* Ensure no TX-side net_devmem_lookup_dmabuf() calls are in flight
	 * after the erase.
	 */
	synchronize_net();

	if (binding->list.next)
		list_del(&binding->list);

	xa_for_each(&binding->bound_rxqs, xa_idx, rxq) {
		const struct pp_memory_provider_params mp_params = {
			.mp_priv = binding,
			.mp_ops = &dmabuf_devmem_ops,
		};

		rxq_idx = get_netdev_rx_queue_index(rxq);

		__net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
	}

	net_devmem_dmabuf_binding_put(binding);
}

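/* Install the binding as the memory provider of one RX queue. Opening the
 * queue installs dmabuf_devmem_ops as its provider; on success the queue is
 * also tracked in binding->bound_rxqs so unbind can find and close it again.
 */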
int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
				    struct net_devmem_dmabuf_binding *binding,
				    struct netlink_ext_ack *extack)
{
	struct pp_memory_provider_params mp_params = {
		.mp_priv = binding,
		.mp_ops = &dmabuf_devmem_ops,
	};
	struct netdev_rx_queue *rxq;
	u32 xa_idx;
	int err;

	err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack);
	if (err)
		return err;

	rxq = __netif_get_rx_queue(dev, rxq_idx);
	err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b,
		       GFP_KERNEL);
	if (err)
		goto err_close_rxq;

	return 0;

err_close_rxq:
	__net_mp_close_rxq(dev, rxq_idx, &mp_params);
	return err;
}

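/* Create a binding for a dma-buf fd: attach and map it for DMA against
 * dma_dev, build a genpool over the resulting scatterlist (one chunk owner
 * per sg entry, carved into PAGE_SIZE net_iovs), optionally build the TX
 * lookup vector, and publish the binding with an ID in the global xarray.
 */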
struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
		       struct device *dma_dev,
		       enum dma_data_direction direction,
		       unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
		       struct netlink_ext_ack *extack)
{
	struct net_devmem_dmabuf_binding *binding;
	static u32 id_alloc_next;
	struct scatterlist *sg;
	struct dma_buf *dmabuf;
	unsigned int sg_idx, i;
	unsigned long virtual;
	int err;

	if (!dma_dev) {
		NL_SET_ERR_MSG(extack, "Device doesn't support DMA");
		return ERR_PTR(-EOPNOTSUPP);
	}

	dmabuf = dma_buf_get(dmabuf_fd);
	if (IS_ERR(dmabuf))
		return ERR_CAST(dmabuf);

	binding = kzalloc_node(sizeof(*binding), GFP_KERNEL,
			       dev_to_node(&dev->dev));
	if (!binding) {
		err = -ENOMEM;
		goto err_put_dmabuf;
	}

	binding->dev = dev;
	xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);

	refcount_set(&binding->ref, 1);

	mutex_init(&binding->lock);

	binding->dmabuf = dmabuf;
	binding->direction = direction;

	binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev);
	if (IS_ERR(binding->attachment)) {
		err = PTR_ERR(binding->attachment);
		NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
		goto err_free_binding;
	}

	binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
						       direction);
	if (IS_ERR(binding->sgt)) {
		err = PTR_ERR(binding->sgt);
		NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment");
		goto err_detach;
	}

	if (direction == DMA_TO_DEVICE) {
		binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
						 sizeof(struct net_iov *),
						 GFP_KERNEL);
		if (!binding->tx_vec) {
			err = -ENOMEM;
			goto err_unmap;
		}
	}

	/* For simplicity we expect to make PAGE_SIZE allocations, but the
	 * binding can be much more flexible than that. We may be able to
	 * allocate MTU sized chunks here. Leave that for future work...
	 */
	binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
					      dev_to_node(&dev->dev));
	if (!binding->chunk_pool) {
		err = -ENOMEM;
		goto err_tx_vec;
	}

	virtual = 0;
	for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) {
		dma_addr_t dma_addr = sg_dma_address(sg);
		struct dmabuf_genpool_chunk_owner *owner;
		size_t len = sg_dma_len(sg);
		struct net_iov *niov;

		owner = kzalloc_node(sizeof(*owner), GFP_KERNEL,
				     dev_to_node(&dev->dev));
		if (!owner) {
			err = -ENOMEM;
			goto err_free_chunks;
		}

		owner->area.base_virtual = virtual;
		owner->base_dma_addr = dma_addr;
		owner->area.num_niovs = len / PAGE_SIZE;
		owner->binding = binding;

		err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
					 dma_addr, len, dev_to_node(&dev->dev),
					 owner);
		if (err) {
			kfree(owner);
			err = -EINVAL;
			goto err_free_chunks;
		}

		owner->area.niovs = kvmalloc_array(owner->area.num_niovs,
						   sizeof(*owner->area.niovs),
						   GFP_KERNEL);
		if (!owner->area.niovs) {
			err = -ENOMEM;
			goto err_free_chunks;
		}

		for (i = 0; i < owner->area.num_niovs; i++) {
			niov = &owner->area.niovs[i];
			niov->type = NET_IOV_DMABUF;
			niov->owner = &owner->area;
			page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
						      net_devmem_get_dma_addr(niov));
			if (direction == DMA_TO_DEVICE)
				binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
		}

		virtual += len;
	}

	err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
			      binding, xa_limit_32b, &id_alloc_next,
			      GFP_KERNEL);
	if (err < 0)
		goto err_free_chunks;

	list_add(&binding->list, &priv->bindings);

	return binding;

err_free_chunks:
	gen_pool_for_each_chunk(binding->chunk_pool,
				net_devmem_dmabuf_free_chunk_owner, NULL);
	gen_pool_destroy(binding->chunk_pool);
err_tx_vec:
	kvfree(binding->tx_vec);
err_unmap:
	dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
					  direction);
err_detach:
	dma_buf_detach(dmabuf, binding->attachment);
err_free_binding:
	kfree(binding);
err_put_dmabuf:
	dma_buf_put(dmabuf);
	return ERR_PTR(err);
}

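/* Look up a binding by its netlink-visible ID under RCU and take a reference.
 * Returns NULL if the ID is unknown or a reference can no longer be taken
 * because the binding is already being torn down.
 */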
struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
{
	struct net_devmem_dmabuf_binding *binding;

	rcu_read_lock();
	binding = xa_load(&net_devmem_dmabuf_bindings, id);
	if (binding) {
		if (!net_devmem_dmabuf_binding_get(binding))
			binding = NULL;
	}
	rcu_read_unlock();

	return binding;
}

void net_devmem_get_net_iov(struct net_iov *niov)
{
	net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov));
}

void net_devmem_put_net_iov(struct net_iov *niov)
{
	net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov));
}

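/* TX path: resolve a user-supplied dmabuf ID into a referenced binding for
 * this socket. The binding must have a TX vector and must belong to the
 * net_device the socket's route points at, since its DMA addresses are only
 * valid for that device; the route is rebuilt first if it has expired.
 */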
struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
							 unsigned int dmabuf_id)
{
	struct net_devmem_dmabuf_binding *binding;
	struct net_device *dst_dev;
	struct dst_entry *dst;
	int err = 0;

	binding = net_devmem_lookup_dmabuf(dmabuf_id);
	if (!binding || !binding->tx_vec) {
		err = -EINVAL;
		goto out_err;
	}

	rcu_read_lock();
	dst = __sk_dst_get(sk);
	/* If dst is NULL (route expired), attempt to rebuild it. */
	if (unlikely(!dst)) {
		if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) {
			err = -EHOSTUNREACH;
			goto out_unlock;
		}
		dst = __sk_dst_get(sk);
		if (unlikely(!dst)) {
			err = -ENODEV;
			goto out_unlock;
		}
	}

	/* The dma-addrs in this binding are only reachable from the
	 * corresponding net_device.
	 */
	dst_dev = dst_dev_rcu(dst);
	if (unlikely(!dst_dev) || unlikely(dst_dev != binding->dev)) {
		err = -ENODEV;
		goto out_unlock;
	}

	rcu_read_unlock();
	return binding;

out_unlock:
	rcu_read_unlock();
out_err:
	if (binding)
		net_devmem_dmabuf_binding_put(binding);

	return ERR_PTR(err);
}

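/* Translate an offset into the dma-buf (a "virtual address" within the
 * binding) into the net_iov that backs it, plus the offset into that niov and
 * the number of bytes available before the next PAGE_SIZE boundary.
 *
 * A caller sending from dma-buf memory would typically walk the buffer in
 * page-sized steps, along these lines (illustrative sketch, not a real
 * caller):
 *
 *	while (len) {
 *		niov = net_devmem_get_niov_at(binding, virt, &off, &chunk);
 *		chunk = min(chunk, len);
 *		... append niov/off/chunk to the skb frags ...
 *		virt += chunk;
 *		len -= chunk;
 *	}
 */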
struct net_iov *
net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
		       size_t virt_addr, size_t *off, size_t *size)
{
	if (virt_addr >= binding->dmabuf->size)
		return NULL;

	*off = virt_addr % PAGE_SIZE;
	*size = PAGE_SIZE - *off;

	return binding->tx_vec[virt_addr / PAGE_SIZE];
}

/*** "Dmabuf devmem memory provider" ***/

int mp_dmabuf_devmem_init(struct page_pool *pool)
{
	struct net_devmem_dmabuf_binding *binding = pool->mp_priv;

	if (!binding)
		return -EINVAL;

	/* dma-buf dma addresses do not need and should not be used with
	 * dma_sync_for_cpu/device. Force disable dma_sync.
	 */
	pool->dma_sync = false;
	pool->dma_sync_for_cpu = false;

	if (pool->p.order != 0)
		return -E2BIG;

	net_devmem_dmabuf_binding_get(binding);
	return 0;
}

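/* Memory provider allocation hook: hand the page_pool a netmem backed by a
 * net_iov from the binding instead of a system page. Returns 0 (no netmem)
 * when the dma-buf genpool is exhausted.
 */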
netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp)
{
	struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
	struct net_iov *niov;
	netmem_ref netmem;

	niov = net_devmem_alloc_dmabuf(binding);
	if (!niov)
		return 0;

	netmem = net_iov_to_netmem(niov);

	page_pool_set_pp_info(pool, netmem);

	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt);
	return netmem;
}

void mp_dmabuf_devmem_destroy(struct page_pool *pool)
{
	struct net_devmem_dmabuf_binding *binding = pool->mp_priv;

	net_devmem_dmabuf_binding_put(binding);
}

bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem)
{
	long refcount = atomic_long_read(netmem_get_pp_ref_count_ref(netmem));

	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
		return false;

	if (WARN_ON_ONCE(refcount != 1))
		return false;

	page_pool_clear_pp_info(netmem);

	net_devmem_free_dmabuf(netmem_to_net_iov(netmem));

	/* We don't want the page pool put_page()ing our net_iovs. */
	return false;
}

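/* Report the binding's ID over netlink, either as a queue attribute (when a
 * queue is given) or as a page pool attribute.
 */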
static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp,
				    struct netdev_rx_queue *rxq)
{
	const struct net_devmem_dmabuf_binding *binding = mp_priv;
	int type = rxq ? NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF;

	return nla_put_u32(rsp, type, binding->id);
}

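/* Called when a queue stops using this memory provider. Drop the queue from
 * bound_rxqs; once no queue references the binding anymore, clear
 * binding->dev under the binding lock so later control operations see it as
 * detached.
 */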
static void mp_dmabuf_devmem_uninstall(void *mp_priv,
				       struct netdev_rx_queue *rxq)
{
	struct net_devmem_dmabuf_binding *binding = mp_priv;
	struct netdev_rx_queue *bound_rxq;
	unsigned long xa_idx;

	xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) {
		if (bound_rxq == rxq) {
			xa_erase(&binding->bound_rxqs, xa_idx);
			if (xa_empty(&binding->bound_rxqs)) {
				mutex_lock(&binding->lock);
				binding->dev = NULL;
				mutex_unlock(&binding->lock);
			}
			break;
		}
	}
}

static const struct memory_provider_ops dmabuf_devmem_ops = {
	.init = mp_dmabuf_devmem_init,
	.destroy = mp_dmabuf_devmem_destroy,
	.alloc_netmems = mp_dmabuf_devmem_alloc_netmems,
	.release_netmem = mp_dmabuf_devmem_release_page,
	.nl_fill = mp_dmabuf_devmem_nl_fill,
	.uninstall = mp_dmabuf_devmem_uninstall,
};