/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 * Copyright (c) 2020, Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/kref.h>
#include <linux/random.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <rdma/ib_umem_odp.h>
#include "dm.h"
#include "mlx5_ib.h"
#include "umr.h"
#include "data_direct.h"
#include "dmah.h"

enum {
	MAX_PENDING_REG_MR = 8,
};

#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
#define MLX5_UMR_ALIGN 2048

static void
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned long page_size, bool populate,
				     int access_mode, u16 st_index, u8 ph);
static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);

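/*
 * Fill the PD, access rights and start address fields of an mkey context,
 * enabling relaxed ordering only when the device actually supports it.
 */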
static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
					  struct ib_pd *pd)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);

	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, mkc, lr, 1);

	if (acc & IB_ACCESS_RELAXED_ORDERING) {
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);

		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
		    (MLX5_CAP_GEN(dev->mdev,
				  relaxed_ordering_read_pci_enabled) &&
		     pcie_relaxed_ordering_enabled(dev->mdev->pdev)))
			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
	}

	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET64(mkc, mkc, start_addr, start_addr);
}

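/*
 * Stamp the low byte of the mkey (mkey_7_0) with a rotating variant so a
 * recycled mkey index does not come back with the same full key value.
 */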
static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
{
	u8 key = atomic_inc_return(&dev->mkey_var);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, mkey_7_0, key);
	*mkey = key;
}

static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
			       struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
{
	int ret;

	assign_mkey_variant(dev, &mkey->key, in);
	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
	if (!ret)
		init_waitqueue_head(&mkey->wait);

	return ret;
}

static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
{
	struct mlx5_ib_dev *dev = async_create->ent->dev;
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);

	MLX5_SET(create_mkey_in, async_create->in, opcode,
		 MLX5_CMD_OP_CREATE_MKEY);
	assign_mkey_variant(dev, &async_create->mkey, async_create->in);
	return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
				async_create->out, outlen, create_mkey_callback,
				&async_create->cb_work);
}

static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);

static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));

	return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
}

static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
{
	if (status == -ENXIO) /* core driver is not available */
		return;

	mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
	if (status != -EREMOTEIO) /* driver specific failure */
		return;

	/* Failed in FW, print cmd out failure details */
	mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
}

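/*
 * Cached mkeys are kept in a per-entry queue built from pages of mkey
 * values; push/pop operate on the tail page under mkeys_queue.lock and
 * grow or shrink the page list as needed.
 */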
static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey)
{
	unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE;
	struct mlx5_mkeys_page *page;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	if (ent->mkeys_queue.ci >=
	    ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) {
		page = kzalloc(sizeof(*page), GFP_ATOMIC);
		if (!page)
			return -ENOMEM;
		ent->mkeys_queue.num_pages++;
		list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
	} else {
		page = list_last_entry(&ent->mkeys_queue.pages_list,
				       struct mlx5_mkeys_page, list);
	}

	page->mkeys[tmp] = mkey;
	ent->mkeys_queue.ci++;
	return 0;
}

static int pop_mkey_locked(struct mlx5_cache_ent *ent)
{
	unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE;
	struct mlx5_mkeys_page *last_page;
	u32 mkey;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	last_page = list_last_entry(&ent->mkeys_queue.pages_list,
				    struct mlx5_mkeys_page, list);
	mkey = last_page->mkeys[tmp];
	last_page->mkeys[tmp] = 0;
	ent->mkeys_queue.ci--;
	if (ent->mkeys_queue.num_pages > 1 && !tmp) {
		list_del(&last_page->list);
		ent->mkeys_queue.num_pages--;
		kfree(last_page);
	}
	return mkey;
}

static void create_mkey_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5r_async_create_mkey *mkey_out =
		container_of(context, struct mlx5r_async_create_mkey, cb_work);
	struct mlx5_cache_ent *ent = mkey_out->ent;
	struct mlx5_ib_dev *dev = ent->dev;
	unsigned long flags;

	if (status) {
		create_mkey_warn(dev, status, mkey_out->out);
		kfree(mkey_out);
		spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
		ent->pending--;
		WRITE_ONCE(dev->fill_delay, 1);
		spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
		mod_timer(&dev->delay_timer, jiffies + HZ);
		return;
	}

	mkey_out->mkey |= mlx5_idx_to_mkey(
		MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
	WRITE_ONCE(dev->cache.last_add, jiffies);

	spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
	push_mkey_locked(ent, mkey_out->mkey);
	ent->pending--;
	/* If we are doing fill_to_high_water then keep going. */
	queue_adjust_cache_locked(ent);
	spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
	kfree(mkey_out);
}

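/* Size of the mkey translation table, in units of 16-byte octowords. */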
static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
{
	int ret = 0;

	switch (access_mode) {
	case MLX5_MKC_ACCESS_MODE_MTT:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
					   sizeof(struct mlx5_mtt));
		break;
	case MLX5_MKC_ACCESS_MODE_KSM:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
					   sizeof(struct mlx5_klm));
		break;
	default:
		WARN_ON(1);
	}
	return ret;
}

static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
{
	set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0,
				      ent->dev->umrc.pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2,
		 (ent->rb_key.access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);

	MLX5_SET(mkc, mkc, translations_octword_size,
		 get_mkc_octo_size(ent->rb_key.access_mode,
				   ent->rb_key.ndescs));
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);

	if (ent->rb_key.ph != MLX5_IB_NO_PH) {
		MLX5_SET(mkc, mkc, pcie_tph_en, 1);
		MLX5_SET(mkc, mkc, pcie_tph_ph, ent->rb_key.ph);
		if (ent->rb_key.st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
			MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index,
				 ent->rb_key.st_index);
	}
}

/* Asynchronously schedule new MRs to be populated in the cache. */
static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
{
	struct mlx5r_async_create_mkey *async_create;
	void *mkc;
	int err = 0;
	int i;

	for (i = 0; i < num; i++) {
		async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
				       GFP_KERNEL);
		if (!async_create)
			return -ENOMEM;
		mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
				   memory_key_mkey_entry);
		set_cache_mkc(ent, mkc);
		async_create->ent = ent;

		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->pending >= MAX_PENDING_REG_MR) {
			err = -EAGAIN;
			goto free_async_create;
		}
		ent->pending++;
		spin_unlock_irq(&ent->mkeys_queue.lock);

		err = mlx5_ib_create_mkey_cb(async_create);
		if (err) {
			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
			goto err_create_mkey;
		}
	}

	return 0;

err_create_mkey:
	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->pending--;
free_async_create:
	spin_unlock_irq(&ent->mkeys_queue.lock);
	kfree(async_create);
	return err;
}

/* Synchronously create an MR in the cache */
static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
{
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	void *mkc;
	u32 *in;
	int err;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_cache_mkc(ent, mkc);

	err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
	if (err)
		goto free_in;

	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
free_in:
	kfree(in);
	return err;
}

static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
{
	u32 mkey;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	if (!ent->mkeys_queue.ci)
		return;
	mkey = pop_mkey_locked(ent);
	spin_unlock_irq(&ent->mkeys_queue.lock);
	mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
	spin_lock_irq(&ent->mkeys_queue.lock);
}

static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
				bool limit_fill)
	__acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock)
{
	int err;

	lockdep_assert_held(&ent->mkeys_queue.lock);

	while (true) {
		if (limit_fill)
			target = ent->limit * 2;
		if (target == ent->pending + ent->mkeys_queue.ci)
			return 0;
		if (target > ent->pending + ent->mkeys_queue.ci) {
			u32 todo = target - (ent->pending + ent->mkeys_queue.ci);

			spin_unlock_irq(&ent->mkeys_queue.lock);
			err = add_keys(ent, todo);
			if (err == -EAGAIN)
				usleep_range(3000, 5000);
			spin_lock_irq(&ent->mkeys_queue.lock);
			if (err) {
				if (err != -EAGAIN)
					return err;
			} else
				return 0;
		} else {
			remove_cache_mr_locked(ent);
		}
	}
}

static ssize_t size_write(struct file *filp, const char __user *buf,
			  size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 target;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &target);
	if (err)
		return err;

	/*
	 * Target is the new value of total_mrs the user requests, however we
	 * cannot free MRs that are in use. Compute the target value for stored
	 * mkeys.
	 */
	spin_lock_irq(&ent->mkeys_queue.lock);
	if (target < ent->in_use) {
		err = -EINVAL;
		goto err_unlock;
	}
	target = target - ent->in_use;
	if (target < ent->limit || target > ent->limit*2) {
		err = -EINVAL;
		goto err_unlock;
	}
	err = resize_available_mrs(ent, target, false);
	if (err)
		goto err_unlock;
	spin_unlock_irq(&ent->mkeys_queue.lock);

	return count;

err_unlock:
	spin_unlock_irq(&ent->mkeys_queue.lock);
	return err;
}

static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
			 loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%ld\n",
		       ent->mkeys_queue.ci + ent->in_use);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations size_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= size_write,
	.read	= size_read,
};

static ssize_t limit_write(struct file *filp, const char __user *buf,
			   size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 var;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &var);
	if (err)
		return err;

	/*
	 * Upon set we immediately fill the cache to high water mark implied by
	 * the limit.
	 */
	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->limit = var;
	err = resize_available_mrs(ent, 0, true);
	spin_unlock_irq(&ent->mkeys_queue.lock);
	if (err)
		return err;
	return count;
}

static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
			  loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations limit_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= limit_write,
	.read	= limit_read,
};

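/* Return true if any cache entry is currently below its low water mark. */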
static bool someone_adding(struct mlx5_mkey_cache *cache)
{
	struct mlx5_cache_ent *ent;
	struct rb_node *node;
	bool ret;

	mutex_lock(&cache->rb_lock);
	for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		ret = ent->mkeys_queue.ci < ent->limit;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		if (ret) {
			mutex_unlock(&cache->rb_lock);
			return true;
		}
	}
	mutex_unlock(&cache->rb_lock);
	return false;
}

/*
 * Check if the bucket is outside the high/low water mark and schedule an async
 * update. The cache refill has hysteresis, once the low water mark is hit it is
 * refilled up to the high mark.
 */
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
{
	lockdep_assert_held(&ent->mkeys_queue.lock);

	if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
		return;
	if (ent->mkeys_queue.ci < ent->limit) {
		ent->fill_to_high_water = true;
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->fill_to_high_water &&
		   ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) {
		/*
		 * Once we start populating due to hitting a low water mark
		 * continue until we pass the high water mark.
		 */
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->mkeys_queue.ci == 2 * ent->limit) {
		ent->fill_to_high_water = false;
	} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
		/* Queue deletion of excess entries */
		ent->fill_to_high_water = false;
		if (ent->pending)
			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
					   secs_to_jiffies(1));
		else
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	}
}

static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
{
	u32 mkey;

	spin_lock_irq(&ent->mkeys_queue.lock);
	while (ent->mkeys_queue.ci) {
		mkey = pop_mkey_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
		mlx5_core_destroy_mkey(dev->mdev, mkey);
		spin_lock_irq(&ent->mkeys_queue.lock);
	}
	ent->tmp_cleanup_scheduled = false;
	spin_unlock_irq(&ent->mkeys_queue.lock);
}

static void __cache_work_func(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_dev *dev = ent->dev;
	struct mlx5_mkey_cache *cache = &dev->cache;
	int err;

	spin_lock_irq(&ent->mkeys_queue.lock);
	if (ent->disabled)
		goto out;

	if (ent->fill_to_high_water &&
	    ent->mkeys_queue.ci + ent->pending < 2 * ent->limit &&
	    !READ_ONCE(dev->fill_delay)) {
		spin_unlock_irq(&ent->mkeys_queue.lock);
		err = add_keys(ent, 1);
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->disabled)
			goto out;
		if (err) {
			/*
			 * EAGAIN only happens if there are pending MRs, so we
			 * will be rescheduled when storing them. The only
			 * failure path here is ENOMEM.
			 */
			if (err != -EAGAIN) {
				mlx5_ib_warn(
					dev,
					"add keys command failed, err %d\n",
					err);
				queue_delayed_work(cache->wq, &ent->dwork,
						   secs_to_jiffies(1));
			}
		}
	} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
		bool need_delay;

		/*
		 * The remove_cache_mr() logic is performed as a garbage
		 * collection task. Such a task is intended to be run when no
		 * other active processes are running.
		 *
		 * The need_resched() will return TRUE if there are user tasks
		 * to be activated in the near future.
		 *
		 * In such a case, we don't execute remove_cache_mr() and
		 * postpone the garbage collection work to the next cycle, in
		 * order to free CPU resources to other tasks.
		 */
		spin_unlock_irq(&ent->mkeys_queue.lock);
		need_delay = need_resched() || someone_adding(cache) ||
			     !time_after(jiffies,
					 READ_ONCE(cache->last_add) + 300 * HZ);
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->disabled)
			goto out;
		if (need_delay) {
			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
			goto out;
		}
		remove_cache_mr_locked(ent);
		queue_adjust_cache_locked(ent);
	}
out:
	spin_unlock_irq(&ent->mkeys_queue.lock);
}

static void delayed_cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
	/* temp entries are never filled, only cleaned */
	if (ent->is_tmp)
		clean_keys(ent->dev, ent);
	else
		__cache_work_func(ent);
}

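/*
 * rb-tree ordering for cache entry keys; returns <0, 0 or >0 like a classic
 * three-way compare.
 */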
static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
			     struct mlx5r_cache_rb_key key2)
{
	int res;

	res = key1.ats - key2.ats;
	if (res)
		return res;

	res = key1.access_mode - key2.access_mode;
	if (res)
		return res;

	res = key1.access_flags - key2.access_flags;
	if (res)
		return res;

	res = key1.st_index - key2.st_index;
	if (res)
		return res;

	res = key1.ph - key2.ph;
	if (res)
		return res;

	/*
	 * keep ndescs the last in the compare table since the find function
	 * searches for an exact match on all properties and only closest
	 * match in size.
	 */
	return key1.ndescs - key2.ndescs;
}

static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
				 struct mlx5_cache_ent *ent)
{
	struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
	struct mlx5_cache_ent *cur;
	int cmp;

	/* Figure out where to put new node */
	while (*new) {
		cur = rb_entry(*new, struct mlx5_cache_ent, node);
		parent = *new;
		cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
		if (cmp > 0)
			new = &((*new)->rb_left);
		if (cmp < 0)
			new = &((*new)->rb_right);
		if (cmp == 0)
			return -EEXIST;
	}

	/* Add new node and rebalance tree. */
	rb_link_node(&ent->node, parent, new);
	rb_insert_color(&ent->node, &cache->rb_root);

	return 0;
}

static struct mlx5_cache_ent *
mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
			   struct mlx5r_cache_rb_key rb_key)
{
	struct rb_node *node = dev->cache.rb_root.rb_node;
	struct mlx5_cache_ent *cur, *smallest = NULL;
	u64 ndescs_limit;
	int cmp;

	/*
	 * Find the smallest ent with order >= requested_order.
	 */
	while (node) {
		cur = rb_entry(node, struct mlx5_cache_ent, node);
		cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
		if (cmp > 0) {
			smallest = cur;
			node = node->rb_left;
		}
		if (cmp < 0)
			node = node->rb_right;
		if (cmp == 0)
			return cur;
	}

	/*
	 * Limit the usage of mkeys larger than twice the required size while
	 * also allowing the usage of smallest cache entry for small MRs.
	 */
	ndescs_limit = max_t(u64, rb_key.ndescs * 2,
			     MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);

	return (smallest &&
		smallest->rb_key.access_mode == rb_key.access_mode &&
		smallest->rb_key.access_flags == rb_key.access_flags &&
		smallest->rb_key.ats == rb_key.ats &&
		smallest->rb_key.st_index == rb_key.st_index &&
		smallest->rb_key.ph == rb_key.ph &&
		smallest->rb_key.ndescs <= ndescs_limit) ?
		       smallest :
		       NULL;
}

static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
					       struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->in_use++;

	if (!ent->mkeys_queue.ci) {
		queue_adjust_cache_locked(ent);
		ent->miss++;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		err = create_cache_mkey(ent, &mr->mmkey.key);
		if (err) {
			spin_lock_irq(&ent->mkeys_queue.lock);
			ent->in_use--;
			spin_unlock_irq(&ent->mkeys_queue.lock);
			kfree(mr);
			return ERR_PTR(err);
		}
	} else {
		mr->mmkey.key = pop_mkey_locked(ent);
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}
	mr->mmkey.cache_ent = ent;
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.rb_key = ent->rb_key;
	mr->mmkey.cacheable = true;
	init_waitqueue_head(&mr->mmkey.wait);
	return mr;
}

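/*
 * Access flags that UMR cannot change on this device; cached mkeys must
 * match these bits exactly.
 */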
static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
					 int access_flags)
{
	int ret = 0;

	if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
	    MLX5_CAP_GEN(dev->mdev, atomic) &&
	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
		ret |= IB_ACCESS_REMOTE_ATOMIC;

	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
		ret |= IB_ACCESS_RELAXED_ORDERING;

	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
	    (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
	     MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
		ret |= IB_ACCESS_RELAXED_ORDERING;

	return ret;
}

struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
				       int access_flags, int access_mode,
				       int ndescs)
{
	struct mlx5r_cache_rb_key rb_key = {
		.ndescs = ndescs,
		.access_mode = access_mode,
		.access_flags = get_unchangeable_access_flags(dev, access_flags),
		.ph = MLX5_IB_NO_PH,
	};
	struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);

	if (!ent)
		return ERR_PTR(-EOPNOTSUPP);

	return _mlx5_mr_cache_alloc(dev, ent);
}

static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	debugfs_remove_recursive(dev->cache.fs_root);
	dev->cache.fs_root = NULL;
}

static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
					    struct mlx5_cache_ent *ent)
{
	int order = order_base_2(ent->rb_key.ndescs);
	struct dentry *dir;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
		order = MLX5_IMR_KSM_CACHE_ENTRY + 2;

	sprintf(ent->name, "%d", order);
	dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
	debugfs_create_file("size", 0600, dir, ent, &size_fops);
	debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
	debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci);
	debugfs_create_u32("miss", 0600, dir, &ent->miss);
}

static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
{
	struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
	struct mlx5_mkey_cache *cache = &dev->cache;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
}

static void delay_time_func(struct timer_list *t)
{
	struct mlx5_ib_dev *dev = timer_container_of(dev, t, delay_timer);

	WRITE_ONCE(dev->fill_delay, 0);
}

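/* Allocate the first page of an entry's mkey queue. */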
static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent)
{
	struct mlx5_mkeys_page *page;

	page = kzalloc(sizeof(*page), GFP_KERNEL);
	if (!page)
		return -ENOMEM;
	INIT_LIST_HEAD(&ent->mkeys_queue.pages_list);
	spin_lock_init(&ent->mkeys_queue.lock);
	list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
	ent->mkeys_queue.num_pages++;
	return 0;
}

static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent)
{
	struct mlx5_mkeys_page *page;

	WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1);
	page = list_last_entry(&ent->mkeys_queue.pages_list,
			       struct mlx5_mkeys_page, list);
	list_del(&page->list);
	kfree(page);
}

struct mlx5_cache_ent *
mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
			      struct mlx5r_cache_rb_key rb_key,
			      bool persistent_entry)
{
	struct mlx5_cache_ent *ent;
	int order;
	int ret;

	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
	if (!ent)
		return ERR_PTR(-ENOMEM);

	ret = mlx5r_mkeys_init(ent);
	if (ret)
		goto mkeys_err;
	ent->rb_key = rb_key;
	ent->dev = dev;
	ent->is_tmp = !persistent_entry;

	INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);

	ret = mlx5_cache_ent_insert(&dev->cache, ent);
	if (ret)
		goto ent_insert_err;

	if (persistent_entry) {
		if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
			order = MLX5_IMR_KSM_CACHE_ENTRY;
		else
			order = order_base_2(rb_key.ndescs) - 2;

		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
		    mlx5r_umr_can_load_pas(dev, 0))
			ent->limit = dev->mdev->profile.mr_cache[order].limit;
		else
			ent->limit = 0;

		mlx5_mkey_cache_debugfs_add_ent(dev, ent);
	}

	return ent;
ent_insert_err:
	mlx5r_mkeys_uninit(ent);
mkeys_err:
	kfree(ent);
	return ERR_PTR(ret);
}

static void mlx5r_destroy_cache_entries(struct mlx5_ib_dev *dev)
{
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5_cache_ent *ent;
	struct rb_node *node;

	mutex_lock(&dev->cache.rb_lock);
	node = rb_first(root);
	while (node) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		node = rb_next(node);
		clean_keys(dev, ent);
		rb_erase(&ent->node, root);
		mlx5r_mkeys_uninit(ent);
		kfree(ent);
	}
	mutex_unlock(&dev->cache.rb_lock);
}

int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5r_cache_rb_key rb_key = {
		.access_mode = MLX5_MKC_ACCESS_MODE_MTT,
		.ph = MLX5_IB_NO_PH,
	};
	struct mlx5_cache_ent *ent;
	struct rb_node *node;
	int ret;
	int i;

	mutex_init(&dev->slow_path_mutex);
	mutex_init(&dev->cache.rb_lock);
	dev->cache.rb_root = RB_ROOT;
	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
	if (!cache->wq) {
		mlx5_ib_warn(dev, "failed to create work queue\n");
		return -ENOMEM;
	}

	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
	timer_setup(&dev->delay_timer, delay_time_func, 0);
	mlx5_mkey_cache_debugfs_init(dev);
	mutex_lock(&cache->rb_lock);
	for (i = 0; i <= mkey_cache_max_order(dev); i++) {
		rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
		ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
		if (IS_ERR(ent)) {
			ret = PTR_ERR(ent);
			goto err;
		}
	}

	ret = mlx5_odp_init_mkey_cache(dev);
	if (ret)
		goto err;

	mutex_unlock(&cache->rb_lock);
	for (node = rb_first(root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}

	return 0;

err:
	mutex_unlock(&cache->rb_lock);
	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5r_destroy_cache_entries(dev);
	destroy_workqueue(cache->wq);
	mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
	return ret;
}

void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
{
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5_cache_ent *ent;
	struct rb_node *node;

	if (!dev->cache.wq)
		return;

	mutex_lock(&dev->cache.rb_lock);
	for (node = rb_first(root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		ent->disabled = true;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		cancel_delayed_work(&ent->dwork);
	}
	mutex_unlock(&dev->cache.rb_lock);

	/*
	 * After all entries are disabled and will not reschedule on WQ,
	 * flush it and all async commands.
	 */
	flush_workqueue(dev->cache.wq);

	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);

	/* At this point all entries are disabled and have no concurrent work. */
	mlx5r_destroy_cache_entries(dev);

	destroy_workqueue(dev->cache.wq);
	timer_delete_sync(&dev->delay_timer);
}

struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, length64, 1);
	set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
				      pd);
	MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

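/* Number of MTT octowords (two 8-byte translation entries each) covering len. */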
static int get_octo_len(u64 addr, u64 len, int page_shift)
{
	u64 page_size = 1ULL << page_shift;
	u64 offset;
	int npages;

	offset = addr & (page_size - 1);
	npages = ALIGN(len + offset, page_size) >> page_shift;
	return (npages + 1) / 2;
}

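/* Highest order of the persistent cache entries created at init time. */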
static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
{
	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		return MKEY_CACHE_LAST_STD_ENTRY;
	return MLX5_MAX_UMR_SHIFT;
}

static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			  u64 length, int access_flags, u64 iova)
{
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.length = length;
	mr->ibmr.device = &dev->ib_dev;
	mr->ibmr.iova = iova;
	mr->access_flags = access_flags;
}

static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
						  u64 iova)
{
	/*
	 * The alignment of iova has already been checked upon entering
	 * UVERBS_METHOD_REG_DMABUF_MR
	 */
	umem->iova = iova;
	return PAGE_SIZE;
}

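/*
 * Allocate an MR from the mkey cache when a matching entry exists, otherwise
 * fall back to creating an uncached mkey on the slow path.
 */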
static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
					     struct ib_umem *umem, u64 iova,
					     int access_flags, int access_mode,
					     u16 st_index, u8 ph)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5r_cache_rb_key rb_key = {};
	struct mlx5_cache_ent *ent;
	struct mlx5_ib_mr *mr;
	unsigned long page_size;

	if (umem->is_dmabuf)
		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
	else
		page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova,
							 access_mode);
	if (WARN_ON(!page_size))
		return ERR_PTR(-EINVAL);

	rb_key.access_mode = access_mode;
	rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
	rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
	rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
	rb_key.st_index = st_index;
	rb_key.ph = ph;
	ent = mkey_cache_ent_from_rb_key(dev, rb_key);
	/*
	 * If the MR can't come from the cache then synchronously create an uncached
	 * one.
	 */
	if (!ent) {
		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, false,
				access_mode, st_index, ph);
		mutex_unlock(&dev->slow_path_mutex);
		if (IS_ERR(mr))
			return mr;
		mr->mmkey.rb_key = rb_key;
		mr->mmkey.cacheable = true;
		return mr;
	}

	mr = _mlx5_mr_cache_alloc(dev, ent);
	if (IS_ERR(mr))
		return mr;

	mr->ibmr.pd = pd;
	mr->umem = umem;
	mr->page_shift = order_base_2(page_size);
	set_mr_fields(dev, mr, umem->length, access_flags, iova);

	return mr;
}

static struct ib_mr *
reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags,
			    u32 crossed_lkey)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
	struct mlx5_ib_mr *mr;
	void *mkc;
	int inlen;
	u32 *in;
	int err;

	if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
		return ERR_PTR(-EOPNOTSUPP);

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, crossing_target_vhca_id,
		 MLX5_CAP_GEN(dev->mdev, vhca_id));
	MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);

	/* for this crossing mkey IOVA should be 0 and len should be IOVA + len */
	set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
	MLX5_SET64(mkc, mkc, len, iova + length);

	MLX5_SET(mkc, mkc, free, 0);
	MLX5_SET(mkc, mkc, umr_en, 0);
	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_2;

	mr->mmkey.type = MLX5_MKEY_MR;
	set_mr_fields(dev, mr, length, access_flags, iova);
	mr->ibmr.pd = pd;
	kvfree(in);
	mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);

	return &mr->ibmr;
err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}

/*
 * reg_create() always allocates the mlx5_ib_mr itself and builds a new,
 * non-cache mkey for the umem on the slow path.
 */
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned long page_size, bool populate,
				     int access_mode, u16 st_index, u8 ph)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr;
	__be64 *pas;
	void *mkc;
	int inlen;
	u32 *in;
	int err;
	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
		(access_mode == MLX5_MKC_ACCESS_MODE_MTT) &&
		(ph == MLX5_IB_NO_PH);
	bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);

	if (!page_size)
		return ERR_PTR(-EINVAL);
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->access_flags = access_flags;
	mr->page_shift = order_base_2(page_size);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	if (populate)
		inlen += sizeof(*pas) *
			 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}
	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
	if (populate) {
		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
			err = -EINVAL;
			goto err_2;
		}
		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
	}

	/* The pg_access bit allows setting the access flags
	 * in the page list submitted with the command.
	 */
	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
				      populate ? pd : dev->umrc.pd);
	/* In case of a data direct flow, overwrite the pdn field with its internal kernel PD */
	if (umem->is_dmabuf && ksm_mode)
		MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);

	MLX5_SET(mkc, mkc, free, !populate);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
	MLX5_SET(mkc, mkc, umr_en, 1);

	MLX5_SET64(mkc, mkc, len, umem->length);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	if (ksm_mode)
		MLX5_SET(mkc, mkc, translations_octword_size,
			 get_octo_len(iova, umem->length, mr->page_shift) * 2);
	else
		MLX5_SET(mkc, mkc, translations_octword_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
	if (mlx5_umem_needs_ats(dev, umem, access_flags))
		MLX5_SET(mkc, mkc, ma_translation_mode, 1);
	if (populate) {
		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	}

	if (ph != MLX5_IB_NO_PH) {
		MLX5_SET(mkc, mkc, pcie_tph_en, 1);
		MLX5_SET(mkc, mkc, pcie_tph_ph, ph);
		if (st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
			MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index, st_index);
	}

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err) {
		mlx5_ib_warn(dev, "create mkey failed\n");
		goto err_2;
	}
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
	mr->umem = umem;
	set_mr_fields(dev, mr, umem->length, access_flags, iova);
	kvfree(in);

	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);

	return mr;

err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}

static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
					u64 length, int acc, int mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
	MLX5_SET64(mkc, mkc, len, length);
	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);

	set_mr_fields(dev, mr, length, acc, start_addr);

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

int mlx5_ib_advise_mr(struct ib_pd *pd,
		      enum ib_uverbs_advise_mr_advice advice,
		      u32 flags,
		      struct ib_sge *sg_list,
		      u32 num_sge,
		      struct uverbs_attr_bundle *attrs)
{
	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
		return -EOPNOTSUPP;

	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
					  sg_list, num_sge);
}

struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
				struct ib_dm_mr_attr *attr,
				struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dm *mdm = to_mdm(dm);
	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
	u64 start_addr = mdm->dev_addr + attr->offset;
	int mode;

	switch (mdm->type) {
	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
		start_addr -= pci_resource_start(dev->pdev, 0);
		break;
	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}

	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
				 attr->access_flags, mode);
}

static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
				    u64 iova, int access_flags,
				    struct ib_dmah *dmah)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	bool xlt_with_umr;
	u16 st_index = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX;
	u8 ph = MLX5_IB_NO_PH;
	int err;

	if (dmah) {
		struct mlx5_ib_dmah *mdmah = to_mdmah(dmah);

		ph = dmah->ph;
		if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
			st_index = mdmah->st_index;
	}

	xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
	if (xlt_with_umr) {
		mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
					MLX5_MKC_ACCESS_MODE_MTT,
					st_index, ph);
	} else {
		unsigned long page_size = mlx5_umem_mkc_find_best_pgsz(
			dev, umem, iova, MLX5_MKC_ACCESS_MODE_MTT);

		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size,
				true, MLX5_MKC_ACCESS_MODE_MTT,
				st_index, ph);
		mutex_unlock(&dev->slow_path_mutex);
	}
	if (IS_ERR(mr)) {
		ib_umem_release(umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

	if (xlt_with_umr) {
		/*
		 * If the MR was created with reg_create then it will be
		 * configured properly but left disabled. It is safe to go ahead
		 * and configure it again via UMR while enabling it.
		 */
		err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
		if (err) {
			mlx5_ib_dereg_mr(&mr->ibmr, NULL);
			return ERR_PTR(err);
		}
	}
	return &mr->ibmr;
}

static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
					u64 iova, int access_flags,
					struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
	if (err)
		return ERR_PTR(err);
	if (!start && length == U64_MAX) {
		if (iova != 0)
			return ERR_PTR(-EINVAL);
		if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
			return ERR_PTR(-EINVAL);

		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
		if (IS_ERR(mr))
			return ERR_CAST(mr);
		return &mr->ibmr;
	}

	/* ODP requires xlt update via umr to work. */
	if (!mlx5r_umr_can_load_pas(dev, length))
		return ERR_PTR(-EINVAL);

	odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
			      &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

	mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
				MLX5_MKC_ACCESS_MODE_MTT,
				MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX,
				MLX5_IB_NO_PH);
	if (IS_ERR(mr)) {
		ib_umem_release(&odp->umem);
		return ERR_CAST(mr);
	}
	xa_init(&mr->implicit_children);

	odp->private = mr;
	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
	if (err)
		goto err_dereg_mr;

	err = mlx5_ib_init_odp_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 iova, int access_flags,
				  struct ib_dmah *dmah,
				  struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem *umem;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
	    ((access_flags & IB_ACCESS_ON_DEMAND) && dmah))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    start, iova, length, access_flags);

	err = mlx5r_umr_resource_init(dev);
	if (err)
		return ERR_PTR(err);

	if (access_flags & IB_ACCESS_ON_DEMAND)
		return create_user_odp_mr(pd, start, length, iova, access_flags,
					  udata);
	umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
	if (IS_ERR(umem))
		return ERR_CAST(umem);
	return create_real_mr(pd, umem, iova, access_flags, dmah);
}

static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
{
	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
	struct mlx5_ib_mr *mr = umem_dmabuf->private;

	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);

	if (!umem_dmabuf->sgt || !mr)
		return;

	mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
}

static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
	.allow_peer2peer = 1,
	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
};

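/*
 * Register a dmabuf backed MR. In pinned mode (KSM / data-direct) the pages
 * are mapped against the given DMA device, otherwise a revocable attachment
 * with move_notify is used.
 */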
static struct ib_mr *
reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
		   u64 offset, u64 length, u64 virt_addr,
		   int fd, int access_flags, int access_mode,
		   struct ib_dmah *dmah)
{
	bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	struct ib_umem_dmabuf *umem_dmabuf;
	u16 st_index = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX;
	u8 ph = MLX5_IB_NO_PH;
	int err;

	err = mlx5r_umr_resource_init(dev);
	if (err)
		return ERR_PTR(err);

	if (!pinned_mode)
		umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev,
						 offset, length, fd,
						 access_flags,
						 &mlx5_ib_dmabuf_attach_ops);
	else
		umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
						 dma_device, offset, length,
						 fd, access_flags);

	if (IS_ERR(umem_dmabuf)) {
		mlx5_ib_dbg(dev, "umem_dmabuf get failed (%pe)\n", umem_dmabuf);
		return ERR_CAST(umem_dmabuf);
	}

	if (dmah) {
		struct mlx5_ib_dmah *mdmah = to_mdmah(dmah);

		ph = dmah->ph;
		if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
			st_index = mdmah->st_index;
	}

	mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
				access_flags, access_mode,
				st_index, ph);
	if (IS_ERR(mr)) {
		ib_umem_release(&umem_dmabuf->umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
	umem_dmabuf->private = mr;
	if (!pinned_mode) {
		err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
		if (err)
			goto err_dereg_mr;
	} else {
		mr->data_direct = true;
	}

	err = mlx5_ib_init_dmabuf_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	__mlx5_ib_dereg_mr(&mr->ibmr);
	return ERR_PTR(err);
}

static struct ib_mr *
reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
				  u64 length, u64 virt_addr,
				  int fd, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_data_direct_dev *data_direct_dev;
	struct ib_mr *crossing_mr;
	struct ib_mr *crossed_mr;
	int ret = 0;

	/* As of HW behaviour the IOVA must be page aligned in KSM mode */
	if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
		return ERR_PTR(-EOPNOTSUPP);

	mutex_lock(&dev->data_direct_lock);
	data_direct_dev = dev->data_direct_dev;
	if (!data_direct_dev) {
		ret = -EINVAL;
		goto end;
	}

	/* If no device 'data direct mkey' with RO flags exists,
	 * mask it out accordingly.
	 */
	if (!dev->ddr.mkey_ro_valid)
		access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
	crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
					offset, length, virt_addr, fd,
					access_flags, MLX5_MKC_ACCESS_MODE_KSM,
					NULL);
	if (IS_ERR(crossed_mr)) {
		ret = PTR_ERR(crossed_mr);
		goto end;
	}

	mutex_lock(&dev->slow_path_mutex);
	crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
						  crossed_mr->lkey);
	mutex_unlock(&dev->slow_path_mutex);
	if (IS_ERR(crossing_mr)) {
		__mlx5_ib_dereg_mr(crossed_mr);
		ret = PTR_ERR(crossing_mr);
		goto end;
	}

	list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
	to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
	to_mmr(crossing_mr)->data_direct = true;
end:
	mutex_unlock(&dev->data_direct_lock);
	return ret ? ERR_PTR(ret) : crossing_mr;
}

struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
					 u64 length, u64 virt_addr,
					 int fd, int access_flags,
					 struct ib_dmah *dmah,
					 struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int mlx5_access_flags = 0;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
		err = uverbs_get_flags32(&mlx5_access_flags, attrs,
					 MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
					 MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
		if (err)
			return ERR_PTR(err);
	}

	mlx5_ib_dbg(dev,
		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
		    offset, virt_addr, length, fd, access_flags, mlx5_access_flags);

	/* dmabuf requires xlt update via umr to work. */
	if (!mlx5r_umr_can_load_pas(dev, length))
		return ERR_PTR(-EINVAL);

	if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT)
		return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr,
							 fd, access_flags);

	return reg_user_mr_dmabuf(pd, pd->device->dma_device,
				  offset, length, virt_addr,
				  fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT,
				  dmah);
}

/*
 * True if the change in access flags can be done via UMR, only some access
 * flags can be updated.
 */
static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
				     unsigned int current_access_flags,
				     unsigned int target_access_flags)
{
	unsigned int diffs = current_access_flags ^ target_access_flags;

	if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
		      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING |
		      IB_ACCESS_REMOTE_ATOMIC))
		return false;
	return mlx5r_umr_can_reconfig(dev, current_access_flags,
				      target_access_flags);
}

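/*
 * True if the new umem can be loaded into the existing cached mkey via UMR,
 * i.e. the cache entry holds enough translation entries at the chosen page
 * size.
 */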
static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
				  struct ib_umem *new_umem,
				  int new_access_flags, u64 iova,
				  unsigned long *page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);

	/* We only track the allocated sizes of MRs from the cache */
	if (!mr->mmkey.cache_ent)
		return false;
	if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
		return false;

	*page_size = mlx5_umem_mkc_find_best_pgsz(
		dev, new_umem, iova, mr->mmkey.cache_ent->rb_key.access_mode);
	if (WARN_ON(!*page_size))
		return false;
	return (mr->mmkey.cache_ent->rb_key.ndescs) >=
	       ib_umem_num_dma_blocks(new_umem, *page_size);
}

static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
			 int access_flags, int flags, struct ib_umem *new_umem,
			 u64 iova, unsigned long page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
	struct ib_umem *old_umem = mr->umem;
	int err;

	/*
	 * To keep everything simple the MR is revoked before we start to mess
	 * with it. This ensures the change is atomic relative to any use of the
	 * MR.
	 */
	err = mlx5r_umr_revoke_mr(mr);
	if (err)
		return err;

	if (flags & IB_MR_REREG_PD) {
		mr->ibmr.pd = pd;
		upd_flags |= MLX5_IB_UPD_XLT_PD;
	}
	if (flags & IB_MR_REREG_ACCESS) {
		mr->access_flags = access_flags;
		upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
	}

	mr->ibmr.iova = iova;
	mr->ibmr.length = new_umem->length;
	mr->page_shift = order_base_2(page_size);
	mr->umem = new_umem;
	err = mlx5r_umr_update_mr_pas(mr, upd_flags);
	if (err) {
		/*
		 * The MR is revoked at this point so there is no issue to free
		 * new_umem.
		 */
		mr->umem = old_umem;
		return err;
	}

	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
	ib_umem_release(old_umem);
	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
	return 0;
}

1877struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1878 u64 length, u64 iova, int new_access_flags,
1879 struct ib_pd *new_pd,
1880 struct ib_udata *udata)
1881{
1882 struct mlx5_ib_dev *dev = to_mdev(ibdev: ib_mr->device);
1883 struct mlx5_ib_mr *mr = to_mmr(ibmr: ib_mr);
1884 int err;
1885
1886 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct ||
1887 mr->mmkey.rb_key.ph != MLX5_IB_NO_PH)
1888 return ERR_PTR(error: -EOPNOTSUPP);
1889
1890 mlx5_ib_dbg(
1891 dev,
1892 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1893 start, iova, length, new_access_flags);
1894
1895 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
1896 return ERR_PTR(error: -EOPNOTSUPP);
1897
1898 if (!(flags & IB_MR_REREG_ACCESS))
1899 new_access_flags = mr->access_flags;
1900 if (!(flags & IB_MR_REREG_PD))
1901 new_pd = ib_mr->pd;
1902
1903 if (!(flags & IB_MR_REREG_TRANS)) {
1904 struct ib_umem *umem;
1905
1906 /* Fast path for PD/access change */
1907 if (can_use_umr_rereg_access(dev, current_access_flags: mr->access_flags,
1908 target_access_flags: new_access_flags)) {
1909 err = mlx5r_umr_rereg_pd_access(mr, pd: new_pd,
1910 access_flags: new_access_flags);
1911 if (err)
1912 return ERR_PTR(error: err);
1913 return NULL;
1914 }
1915 /* DM or ODP MR's don't have a normal umem so we can't re-use it */
1916 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1917 goto recreate;
1918
1919 /*
1920 * Only one active MR can refer to a umem at one time, revoke
1921 * the old MR before assigning the umem to the new one.
1922 */
1923 err = mlx5r_umr_revoke_mr(mr);
1924 if (err)
1925 return ERR_PTR(error: err);
1926 umem = mr->umem;
1927 mr->umem = NULL;
1928 atomic_sub(i: ib_umem_num_pages(umem), v: &dev->mdev->priv.reg_pages);
1929
1930 return create_real_mr(pd: new_pd, umem, iova: mr->ibmr.iova,
1931 access_flags: new_access_flags, NULL);
1932 }
1933
1934 /*
1935 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does
1936 * but the logic around releasing the umem is different
1937 */
1938 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1939 goto recreate;
1940
1941 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
1942 can_use_umr_rereg_access(dev, current_access_flags: mr->access_flags, target_access_flags: new_access_flags)) {
1943 struct ib_umem *new_umem;
1944 unsigned long page_size;
1945
1946 new_umem = ib_umem_get(device: &dev->ib_dev, addr: start, size: length,
1947 access: new_access_flags);
1948 if (IS_ERR(ptr: new_umem))
1949 return ERR_CAST(ptr: new_umem);
1950
1951 /* Fast path for PAS change */
1952 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
1953 page_size: &page_size)) {
1954 err = umr_rereg_pas(mr, pd: new_pd, access_flags: new_access_flags, flags,
1955 new_umem, iova, page_size);
1956 if (err) {
1957 ib_umem_release(umem: new_umem);
1958 return ERR_PTR(error: err);
1959 }
1960 return NULL;
1961 }
1962 return create_real_mr(pd: new_pd, umem: new_umem, iova, access_flags: new_access_flags, NULL);
1963 }
1964
1965 /*
1966 * Everything else has no state we can preserve, just create a new MR
1967 * from scratch
1968 */
1969recreate:
1970 return mlx5_ib_reg_user_mr(pd: new_pd, start, length, iova,
1971 access_flags: new_access_flags, NULL, udata);
1972}
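
/*
 * Summary of the rereg paths above: a pure PD/access change goes through a
 * single UMR WQE, a translation change that still fits the cached mkey goes
 * through umr_rereg_pas(), and anything else (ODP, dmabuf, DM, or an MR that
 * outgrew its cache entry) is simply recreated from scratch.
 */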

static int
mlx5_alloc_priv_descs(struct ib_device *device,
		      struct mlx5_ib_mr *mr,
		      int ndescs,
		      int desc_size)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	struct device *ddev = &dev->mdev->pdev->dev;
	int size = ndescs * desc_size;
	int add_size;
	int ret;

	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
	if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) {
		int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size));

		add_size = min_t(int, end - size, add_size);
	}
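	/*
	 * Worked example (assuming ARCH_KMALLOC_MINALIGN == 8, an
	 * illustrative value only): with ndescs * desc_size == 1024, add_size
	 * starts at 2040 and is clamped to min(2048 - 1024, 2040) == 1024, so
	 * 2048 bytes are allocated and PTR_ALIGN() below can move descs up to
	 * the next MLX5_UMR_ALIGN boundary while staying inside the buffer.
	 */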

	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
	if (!mr->descs_alloc)
		return -ENOMEM;

	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);

	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, mr->desc_map)) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	kfree(mr->descs_alloc);

	return ret;
}

static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
	if (!mr->umem && !mr->data_direct &&
	    mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) {
		struct ib_device *device = mr->ibmr.device;
		int size = mr->max_descs * mr->desc_size;
		struct mlx5_ib_dev *dev = to_mdev(device);

		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
				 DMA_TO_DEVICE);
		kfree(mr->descs_alloc);
		mr->descs = NULL;
	}
}

static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
				    struct mlx5_ib_mr *mr)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int ret;

	if (mr->mmkey.cache_ent) {
		spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
		goto end;
	}

	mutex_lock(&cache->rb_lock);
	ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
	if (ent) {
		if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
			if (ent->disabled) {
				mutex_unlock(&cache->rb_lock);
				return -EOPNOTSUPP;
			}
			mr->mmkey.cache_ent = ent;
			spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
			mutex_unlock(&cache->rb_lock);
			goto end;
		}
	}

	ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
	mutex_unlock(&cache->rb_lock);
	if (IS_ERR(ent))
		return PTR_ERR(ent);

	mr->mmkey.cache_ent = ent;
	spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);

end:
	ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key);
	spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
	return ret;
}
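
/*
 * Note: a cacheable mkey that is being torn down is parked back into the
 * rb-tree entry matching its rb_key; if no entry with the exact ndescs
 * exists yet, a temporary (non-persistent) entry is created so the mkey can
 * be reused by a later registration of the same shape.
 */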

static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
	int err;

	lockdep_assert_held(&dev->data_direct_lock);
	mr->revoked = true;
	err = mlx5r_umr_revoke_mr(mr);
	if (WARN_ON(err))
		return err;

	ib_umem_dmabuf_revoke(umem_dmabuf);
	return 0;
}

void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
{
	struct mlx5_ib_mr *mr, *next;

	lockdep_assert_held(&dev->data_direct_lock);

	list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
		list_del(&mr->dd_node);
		mlx5_ib_revoke_data_direct_mr(mr);
	}
}

static int mlx5_umr_revoke_mr_with_lock(struct mlx5_ib_mr *mr)
{
	bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
			      !to_ib_umem_dmabuf(mr->umem)->pinned;
	bool is_odp = is_odp_mr(mr);
	int ret;

	if (is_odp)
		mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);

	if (is_odp_dma_buf)
		dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv,
			      NULL);

	ret = mlx5r_umr_revoke_mr(mr);

	if (is_odp) {
		if (!ret)
			to_ib_umem_odp(mr->umem)->private = NULL;
		mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
	}

	if (is_odp_dma_buf) {
		if (!ret)
			to_ib_umem_dmabuf(mr->umem)->private = NULL;
		dma_resv_unlock(
			to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
	}

	return ret;
}

static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
{
	bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
			      !to_ib_umem_dmabuf(mr->umem)->pinned;
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
	bool is_odp = is_odp_mr(mr);
	bool from_cache = !!ent;
	int ret;

	if (mr->mmkey.cacheable && !mlx5_umr_revoke_mr_with_lock(mr) &&
	    !cache_ent_find_and_store(dev, mr)) {
		ent = mr->mmkey.cache_ent;
		/* upon storing to a clean temp entry - schedule its cleanup */
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (from_cache)
			ent->in_use--;
		if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
					 secs_to_jiffies(30));
			ent->tmp_cleanup_scheduled = true;
		}
		spin_unlock_irq(&ent->mkeys_queue.lock);
		return 0;
	}

	if (ent) {
		spin_lock_irq(&ent->mkeys_queue.lock);
		ent->in_use--;
		mr->mmkey.cache_ent = NULL;
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}

	if (is_odp)
		mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);

	if (is_odp_dma_buf)
		dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv,
			      NULL);
	ret = destroy_mkey(dev, mr);
	if (is_odp) {
		if (!ret)
			to_ib_umem_odp(mr->umem)->private = NULL;
		mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
	}

	if (is_odp_dma_buf) {
		if (!ret)
			to_ib_umem_dmabuf(mr->umem)->private = NULL;
		dma_resv_unlock(
			to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
	}
	return ret;
}

static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	int rc;

	/*
	 * Any async use of the MR must hold the refcount; once the refcount
	 * goes to zero no other thread, such as ODP page faults, prefetch, any
	 * UMR activity, etc. can touch the mkey. Thus it is safe to destroy it.
	 */
	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    refcount_read(&mr->mmkey.usecount) != 0 &&
	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
		mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			   mr->sig, NULL, GFP_KERNEL);

		if (mr->mtt_mr) {
			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->mtt_mr = NULL;
		}
		if (mr->klm_mr) {
			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->klm_mr = NULL;
		}

		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
		kfree(mr->sig);
		mr->sig = NULL;
	}

	/* Stop DMA */
	rc = mlx5r_handle_mkey_cleanup(mr);
	if (rc)
		return rc;

	if (mr->umem) {
		bool is_odp = is_odp_mr(mr);

		if (!is_odp)
			atomic_sub(ib_umem_num_pages(mr->umem),
				   &dev->mdev->priv.reg_pages);
		ib_umem_release(mr->umem);
		if (is_odp)
			mlx5_ib_free_odp_mr(mr);
	}

	if (!mr->mmkey.cache_ent)
		mlx5_free_priv_descs(mr);

	kfree(mr);
	return 0;
}

static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
					 struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
	int ret;

	ret = __mlx5_ib_dereg_mr(&mr->ibmr);
	if (ret)
		return ret;

	mutex_lock(&dev->data_direct_lock);
	if (!dd_crossed_mr->revoked)
		list_del(&dd_crossed_mr->dd_node);

	ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
	mutex_unlock(&dev->data_direct_lock);
	return ret;
}

int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);

	if (mr->data_direct)
		return dereg_crossing_data_direct_mr(dev, mr);

	return __mlx5_ib_dereg_mr(ibmr);
}

static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
				   int access_mode, int page_shift)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	/* This is only used from the kernel, so setting the PD is OK. */
	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, log_page_size, page_shift);
	if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
	    access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
}

static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				  int ndescs, int desc_size, int page_shift,
				  int access_mode, u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int err;

	mr->access_mode = access_mode;
	mr->desc_size = desc_size;
	mr->max_descs = ndescs;

	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
	if (err)
		return err;

	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_free_descs;

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	return 0;

err_free_descs:
	mlx5_free_priv_descs(mr);
	return err;
}

static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
					      u32 max_num_sg, u32 max_num_meta_sg,
					      int desc_size, int access_mode)
{
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
	int page_shift = 0;
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->ibmr.device = pd->device;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		page_shift = PAGE_SHIFT;

	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
				     access_mode, in, inlen);
	if (err)
		goto err_free_in;

	mr->umem = NULL;
	kfree(in);

	return mr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
				      inlen);
}

static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
}

static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				      int max_num_sg, int max_num_meta_sg,
				      u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 psv_index[2];
	void *mkc;
	int err;

	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
	if (!mr->sig)
		return -ENOMEM;

	/* create mem & wire PSVs */
	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
	if (err)
		goto err_free_sig;

	mr->sig->psv_memory.psv_idx = psv_index[0];
	mr->sig->psv_wire.psv_idx = psv_index[1];

	mr->sig->sig_status_checked = true;
	mr->sig->sig_err_exists = false;
	/* Next UMR, Arm SIGERR */
	++mr->sig->sigerr_count;
	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_klm),
					 MLX5_MKC_ACCESS_MODE_KLMS);
	if (IS_ERR(mr->klm_mr)) {
		err = PTR_ERR(mr->klm_mr);
		goto err_destroy_psv;
	}
	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_mtt),
					 MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr->mtt_mr)) {
		err = PTR_ERR(mr->mtt_mr);
		goto err_free_klm_mr;
	}

	/* Set bsf descriptors for mkey */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, bsf_en, 1);
	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);

	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
	if (err)
		goto err_free_mtt_mr;

	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			      mr->sig, GFP_KERNEL));
	if (err)
		goto err_free_descs;
	return 0;

err_free_descs:
	destroy_mkey(dev, mr);
	mlx5_free_priv_descs(mr);
err_free_mtt_mr:
	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
	mr->mtt_mr = NULL;
err_free_klm_mr:
	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
	mr->klm_mr = NULL;
err_destroy_psv:
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);
err_free_sig:
	kfree(mr->sig);

	return err;
}
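
/*
 * An integrity MR is thus a composite object: two PSVs (memory and wire), an
 * internal KLM PI MR, an internal MTT PI MR and the top-level KLM mkey with
 * BSF enabled. __mlx5_ib_dereg_mr() tears these pieces down again when the
 * IB_MR_TYPE_INTEGRITY MR is destroyed.
 */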

static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
					enum ib_mr_type mr_type, u32 max_num_sg,
					u32 max_num_meta_sg)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg, 4);
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mr->ibmr.device = pd->device;
	mr->umem = NULL;

	switch (mr_type) {
	case IB_MR_TYPE_MEM_REG:
		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_SG_GAPS:
		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_INTEGRITY:
		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
						 max_num_meta_sg, in, inlen);
		break;
	default:
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
		err = -EINVAL;
	}

	if (err)
		goto err_free_in;

	kfree(in);

	return &mr->ibmr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
}

struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
					 u32 max_num_sg, u32 max_num_meta_sg)
{
	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
				  max_num_meta_sg);
}

int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mw *mw = to_mmw(ibmw);
	unsigned int ndescs;
	u32 *in = NULL;
	void *mkc;
	int err;
	struct mlx5_ib_alloc_mw req = {};
	struct {
		__u32 comp_mask;
		__u32 response_length;
	} resp = {};

	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
	if (err)
		return err;

	if (req.comp_mask || req.reserved1 || req.reserved2)
		return -EOPNOTSUPP;

	if (udata->inlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 udata->inlen - sizeof(req)))
		return -EOPNOTSUPP;

	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
	if (err)
		goto free;

	mw->mmkey.type = MLX5_MKEY_MW;
	ibmw->rkey = mw->mmkey.key;
	mw->mmkey.ndescs = ndescs;

	resp.response_length =
		min(offsetofend(typeof(resp), response_length), udata->outlen);
	if (resp.response_length) {
		err = ib_copy_to_udata(udata, &resp, resp.response_length);
		if (err)
			goto free_mkey;
	}

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
		if (err)
			goto free_mkey;
	}

	kfree(in);
	return 0;

free_mkey:
	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
free:
	kfree(in);
	return err;
}

int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_dev *dev = to_mdev(mw->device);
	struct mlx5_ib_mw *mmw = to_mmw(mw);

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
		/*
		 * pagefault_single_data_segment() may be accessing mmw
		 * if the user bound an ODP MR to this MW.
		 */
		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);

	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
}

int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}

static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			int data_sg_nents, unsigned int *data_sg_offset,
			struct scatterlist *meta_sg, int meta_sg_nents,
			unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	unsigned int sg_offset = 0;
	int n = 0;

	mr->meta_length = 0;
	if (data_sg_nents == 1) {
		n++;
		mr->mmkey.ndescs = 1;
		if (data_sg_offset)
			sg_offset = *data_sg_offset;
		mr->data_length = sg_dma_len(data_sg) - sg_offset;
		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
		if (meta_sg_nents == 1) {
			n++;
			mr->meta_ndescs = 1;
			if (meta_sg_offset)
				sg_offset = *meta_sg_offset;
			else
				sg_offset = 0;
			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
		}
		ibmr->length = mr->data_length + mr->meta_length;
	}

	return n;
}
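
/*
 * For example, a single DMA-contiguous data SG entry plus a single metadata
 * SG entry returns n == 2 here, so the caller can skip the MTT/KLM mapping
 * entirely and address both buffers directly.
 */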

static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->mmkey.ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}

static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * PI address for the HW is the offset of the metadata address
		 * relative to the first data page address.
		 * It equals the first data page address + the size of the data
		 * pages + the metadata offset within the first metadata page.
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->mmkey.ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
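		/*
		 * Worked example (illustrative numbers): with a 4 KiB
		 * page_size, data_iova == 0x10000 covering two data pages and
		 * metadata starting 0x200 bytes into its first page, pi_iova
		 * becomes 0x10000 + 2 * 4096 + 0x200 == 0x12200.
		 */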
		/*
		 * In order to use one MTT MR for data and metadata, we register
		 * also the gaps between the end of the data and the start of
		 * the metadata (the sig MR will verify that the HW accesses the
		 * right addresses). This mapping is safe because we use an
		 * internal mkey for the registration.
		 */
		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
		pi_mr->ibmr.iova = iova;
		ibmr->length += pi_mr->meta_length;
	}

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}

static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
			       meta_sg, meta_sg_nents, meta_sg_offset);

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	/* This is zero-based memory region */
	pi_mr->data_iova = 0;
	pi_mr->ibmr.iova = 0;
	pi_mr->pi_iova = pi_mr->data_length;
	ibmr->length = pi_mr->ibmr.length;

	return n;
}

int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = NULL;
	int n;

	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);

	mr->mmkey.ndescs = 0;
	mr->data_length = 0;
	mr->data_iova = 0;
	mr->meta_ndescs = 0;
	mr->pi_iova = 0;
	/*
	 * As a performance optimization, if possible, there is no need to
	 * perform a UMR operation to register the data/metadata buffers.
	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
	 * Fall back to UMR only in case of a failure.
	 */
	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				    data_sg_offset, meta_sg, meta_sg_nents,
				    meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;
	/*
	 * As a performance optimization, if possible, there is no need to map
	 * the sg lists to KLM descriptors. First try to map the sg lists to MTT
	 * descriptors and fall back to KLM only in case of a failure.
	 * It's more efficient for the HW to work with MTT descriptors
	 * (especially in high load).
	 * Use KLM (indirect access) only if it's mandatory.
	 */
	pi_mr = mr->mtt_mr;
	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;

	pi_mr = mr->klm_mr;
	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (unlikely(n != data_sg_nents + meta_sg_nents))
		return -ENOMEM;

out:
	/* This is zero-based memory region */
	ibmr->iova = 0;
	mr->pi_mr = pi_mr;
	if (pi_mr)
		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
	else
		ibmr->sig_attrs->meta_length = mr->meta_length;

	return 0;
}
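
/*
 * Rough caller-side flow (a sketch, not taken from this file): allocate the
 * MR with ib_alloc_mr_integrity(), map the data and protection SG lists with
 * ib_map_mr_sg_pi(), and then post an IB_WR_REG_MR_INTEGRITY work request so
 * the HW signature context is armed before the rkey is used.
 */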

int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int n;

	mr->mmkey.ndescs = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,
				   DMA_TO_DEVICE);

	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
				       NULL);
	else
		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
				   mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}

/* source: drivers/infiniband/hw/mlx5/mr.c */