// SPDX-License-Identifier: GPL-2.0+
// Copyright 2017 IBM Corp.
#include <linux/sched/mm.h>
#include <linux/mutex.h>
#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/mmu_context.h>
#include <linux/mmu_notifier.h>
#include <linux/irqdomain.h>
#include <asm/copro.h>
#include <asm/pnv-ocxl.h>
#include <asm/xive.h>
#include <misc/ocxl.h>
#include "ocxl_internal.h"
#include "trace.h"


#define SPA_PASID_BITS		15
#define SPA_PASID_MAX		((1 << SPA_PASID_BITS) - 1)
#define SPA_PE_MASK		SPA_PASID_MAX
#define SPA_SPA_SIZE_LOG	22 /* Each SPA is 4 MB */
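/*
 * Sizing note (derived from this file): a process element is 128 bytes
 * (see the BUILD_BUG_ON() in ocxl_link_add_pe()), so a SPA covering
 * 2^SPA_PASID_BITS = 32768 PASIDs needs 32768 * 128 bytes = 4 MB,
 * i.e. 2^SPA_SPA_SIZE_LOG bytes.
 */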

#define SPA_CFG_SF		(1ull << (63-0))
#define SPA_CFG_TA		(1ull << (63-1))
#define SPA_CFG_HV		(1ull << (63-3))
#define SPA_CFG_UV		(1ull << (63-4))
#define SPA_CFG_XLAT_hpt	(0ull << (63-6)) /* Hashed page table (HPT) mode */
#define SPA_CFG_XLAT_roh	(2ull << (63-6)) /* Radix on HPT mode */
#define SPA_CFG_XLAT_ror	(3ull << (63-6)) /* Radix on Radix mode */
#define SPA_CFG_PR		(1ull << (63-49))
#define SPA_CFG_TC		(1ull << (63-54))
#define SPA_CFG_DR		(1ull << (63-59))

#define SPA_XSL_TF		(1ull << (63-3))  /* Translation fault */
#define SPA_XSL_S		(1ull << (63-38)) /* Store operation */

#define SPA_PE_VALID		0x80000000

struct ocxl_link;

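/*
 * Per-PASID context tracked while a process element is attached to a
 * link. Instances are published in spa->pe_tree and looked up from the
 * translation fault interrupt handler under RCU, hence the rcu head
 * used to defer freeing.
 */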
struct pe_data {
	struct mm_struct *mm;
	/* callback to trigger when a translation fault occurs */
	void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr);
	/* opaque pointer to be passed to the above callback */
	void *xsl_err_data;
	struct rcu_head rcu;
	struct ocxl_link *link;
	struct mmu_notifier mmu_notifier;
};

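/*
 * Per-link state for the Shared Process Area: the SPA memory itself,
 * the radix tree mapping PE handles to their pe_data, and the
 * registers and interrupt used to service translation faults.
 */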
struct spa {
	struct ocxl_process_element *spa_mem;
	int spa_order;
	struct mutex spa_lock;
	struct radix_tree_root pe_tree; /* Maps PE handles to pe_data */
	char *irq_name;
	int virq;
	void __iomem *reg_dsisr;
	void __iomem *reg_dar;
	void __iomem *reg_tfc;
	void __iomem *reg_pe_handle;
	/*
	 * The following fields are used by the memory fault
	 * interrupt handler. We can only have one interrupt at a
	 * time. The NPU won't raise another interrupt until the
	 * previous one has been ack'd by writing to the TFC register
	 */
	struct xsl_fault {
		struct work_struct fault_work;
		u64 pe;
		u64 dsisr;
		u64 dar;
		struct pe_data pe_data;
	} xsl_fault;
};

/*
 * An OpenCAPI link can be used by several PCI functions. We have
 * one link per device slot.
 *
 * A linked list of OpenCAPI links should suffice, as there's a
 * limited number of OpenCAPI slots on a system and lookup is only
 * done when the device is probed
 */
struct ocxl_link {
	struct list_head list;
	struct kref ref;
	int domain;
	int bus;
	int dev;
	void __iomem *arva;	/* ATSD register virtual address */
	spinlock_t atsd_lock;	/* to serialize shootdowns */
	atomic_t irq_available;
	struct spa *spa;
	void *platform_data;
};
static LIST_HEAD(links_list);
static DEFINE_MUTEX(links_list_lock);

enum xsl_response {
	CONTINUE,
	ADDRESS_ERROR,
	RESTART,
};


static void read_irq(struct spa *spa, u64 *dsisr, u64 *dar, u64 *pe)
{
	u64 reg;

	*dsisr = in_be64(spa->reg_dsisr);
	*dar = in_be64(spa->reg_dar);
	reg = in_be64(spa->reg_pe_handle);
	*pe = reg & SPA_PE_MASK;
}

static void ack_irq(struct spa *spa, enum xsl_response r)
{
	u64 reg = 0;

	/* continue is not supported */
	if (r == RESTART)
		reg = PPC_BIT(31);
	else if (r == ADDRESS_ERROR)
		reg = PPC_BIT(30);
	else
		WARN(1, "Invalid irq response %d\n", r);

	if (reg) {
		trace_ocxl_fault_ack(spa->spa_mem, spa->xsl_fault.pe,
				     spa->xsl_fault.dsisr, spa->xsl_fault.dar,
				     reg);
		out_be64(spa->reg_tfc, reg);
	}
}

static void xsl_fault_handler_bh(struct work_struct *fault_work)
{
	vm_fault_t flt = 0;
	unsigned long access, flags, inv_flags = 0;
	enum xsl_response r;
	struct xsl_fault *fault = container_of(fault_work, struct xsl_fault,
					       fault_work);
	struct spa *spa = container_of(fault, struct spa, xsl_fault);
	int rc;

	/*
	 * We must release a reference on mm_users whenever exiting this
	 * function (taken in the memory fault interrupt handler)
	 */
	rc = copro_handle_mm_fault(fault->pe_data.mm, fault->dar, fault->dsisr,
				   &flt);
	if (rc) {
		pr_debug("copro_handle_mm_fault failed: %d\n", rc);
		if (fault->pe_data.xsl_err_cb) {
			fault->pe_data.xsl_err_cb(
				fault->pe_data.xsl_err_data,
				fault->dar, fault->dsisr);
		}
		r = ADDRESS_ERROR;
		goto ack;
	}

	if (!radix_enabled()) {
		/*
		 * update_mmu_cache() will not have loaded the hash
		 * since current->trap is not a 0x400 or 0x300, so
		 * just call hash_page_mm() here.
		 */
		access = _PAGE_PRESENT | _PAGE_READ;
		if (fault->dsisr & SPA_XSL_S)
			access |= _PAGE_WRITE;

		if (get_region_id(fault->dar) != USER_REGION_ID)
			access |= _PAGE_PRIVILEGED;

		local_irq_save(flags);
		hash_page_mm(fault->pe_data.mm, fault->dar, access, 0x300,
			     inv_flags);
		local_irq_restore(flags);
	}
	r = RESTART;
ack:
	mmput(fault->pe_data.mm);
	ack_irq(spa, r);
}

static irqreturn_t xsl_fault_handler(int irq, void *data)
{
	struct ocxl_link *link = data;
	struct spa *spa = link->spa;
	u64 dsisr, dar, pe_handle;
	struct pe_data *pe_data;
	struct ocxl_process_element *pe;
	int pid;
	bool schedule = false;

	read_irq(spa, &dsisr, &dar, &pe_handle);
	trace_ocxl_fault(spa->spa_mem, pe_handle, dsisr, dar, -1);

	WARN_ON(pe_handle > SPA_PE_MASK);
	pe = spa->spa_mem + pe_handle;
	pid = be32_to_cpu(pe->pid);
	/* We could be reading all null values here if the PE is being
	 * removed while an interrupt kicks in. It's not supposed to
	 * happen if the driver notified the AFU to terminate the
	 * PASID, and the AFU waited for pending operations before
	 * acknowledging. But even if it happens, we won't find a
	 * memory context below and fail silently, so it should be ok.
	 */
	if (!(dsisr & SPA_XSL_TF)) {
		WARN(1, "Invalid xsl interrupt fault register %#llx\n", dsisr);
		ack_irq(spa, ADDRESS_ERROR);
		return IRQ_HANDLED;
	}

	rcu_read_lock();
	pe_data = radix_tree_lookup(&spa->pe_tree, pe_handle);
	if (!pe_data) {
		/*
		 * Could only happen if the driver didn't notify the
		 * AFU about PASID termination before removing the PE,
		 * or the AFU didn't wait for all memory access to
		 * have completed.
		 *
		 * Either way, we fail early, but we shouldn't log an
		 * error message, as it is a valid (if unexpected)
		 * scenario
		 */
		rcu_read_unlock();
		pr_debug("Unknown mm context for xsl interrupt\n");
		ack_irq(spa, ADDRESS_ERROR);
		return IRQ_HANDLED;
	}

	if (!pe_data->mm) {
		/*
		 * translation fault from a kernel context - an OpenCAPI
		 * device tried to access a bad kernel address
		 */
		rcu_read_unlock();
		pr_warn("Unresolved OpenCAPI xsl fault in kernel context\n");
		ack_irq(spa, ADDRESS_ERROR);
		return IRQ_HANDLED;
	}
	WARN_ON(pe_data->mm->context.id != pid);

	if (mmget_not_zero(pe_data->mm)) {
		spa->xsl_fault.pe = pe_handle;
		spa->xsl_fault.dar = dar;
		spa->xsl_fault.dsisr = dsisr;
		spa->xsl_fault.pe_data = *pe_data;
		schedule = true;
		/* mm_users count released by bottom half */
	}
	rcu_read_unlock();
	if (schedule)
		schedule_work(&spa->xsl_fault.fault_work);
	else
		ack_irq(spa, ADDRESS_ERROR);
	return IRQ_HANDLED;
}

static void unmap_irq_registers(struct spa *spa)
{
	pnv_ocxl_unmap_xsl_regs(spa->reg_dsisr, spa->reg_dar, spa->reg_tfc,
				spa->reg_pe_handle);
}

static int map_irq_registers(struct pci_dev *dev, struct spa *spa)
{
	return pnv_ocxl_map_xsl_regs(dev, &spa->reg_dsisr, &spa->reg_dar,
				     &spa->reg_tfc, &spa->reg_pe_handle);
}

static int setup_xsl_irq(struct pci_dev *dev, struct ocxl_link *link)
{
	struct spa *spa = link->spa;
	int rc;
	int hwirq;

	rc = pnv_ocxl_get_xsl_irq(dev, &hwirq);
	if (rc)
		return rc;

	rc = map_irq_registers(dev, spa);
	if (rc)
		return rc;

	spa->irq_name = kasprintf(GFP_KERNEL, "ocxl-xsl-%x-%x-%x",
				  link->domain, link->bus, link->dev);
	if (!spa->irq_name) {
		dev_err(&dev->dev, "Can't allocate name for xsl interrupt\n");
		rc = -ENOMEM;
		goto err_xsl;
	}
	/*
	 * At some point, we'll need to look into allowing a higher
	 * number of interrupts. Could we have an IRQ domain per link?
	 */
	spa->virq = irq_create_mapping(NULL, hwirq);
	if (!spa->virq) {
		dev_err(&dev->dev,
			"irq_create_mapping failed for translation interrupt\n");
		rc = -EINVAL;
		goto err_name;
	}

	dev_dbg(&dev->dev, "hwirq %d mapped to virq %d\n", hwirq, spa->virq);

	rc = request_irq(spa->virq, xsl_fault_handler, 0, spa->irq_name,
			 link);
	if (rc) {
		dev_err(&dev->dev,
			"request_irq failed for translation interrupt: %d\n",
			rc);
		rc = -EINVAL;
		goto err_mapping;
	}
	return 0;

err_mapping:
	irq_dispose_mapping(spa->virq);
err_name:
	kfree(spa->irq_name);
err_xsl:
	unmap_irq_registers(spa);
	return rc;
}

static void release_xsl_irq(struct ocxl_link *link)
{
	struct spa *spa = link->spa;

	if (spa->virq) {
		free_irq(spa->virq, link);
		irq_dispose_mapping(spa->virq);
	}
	kfree(spa->irq_name);
	unmap_irq_registers(spa);
}

static int alloc_spa(struct pci_dev *dev, struct ocxl_link *link)
{
	struct spa *spa;

	spa = kzalloc(sizeof(struct spa), GFP_KERNEL);
	if (!spa)
		return -ENOMEM;

	mutex_init(&spa->spa_lock);
	INIT_RADIX_TREE(&spa->pe_tree, GFP_KERNEL);
	INIT_WORK(&spa->xsl_fault.fault_work, xsl_fault_handler_bh);

	spa->spa_order = SPA_SPA_SIZE_LOG - PAGE_SHIFT;
	spa->spa_mem = (struct ocxl_process_element *)
		__get_free_pages(GFP_KERNEL | __GFP_ZERO, spa->spa_order);
	if (!spa->spa_mem) {
		dev_err(&dev->dev, "Can't allocate Shared Process Area\n");
		kfree(spa);
		return -ENOMEM;
	}
	pr_debug("Allocated SPA for %x:%x:%x at %p\n", link->domain, link->bus,
		 link->dev, spa->spa_mem);

	link->spa = spa;
	return 0;
}

static void free_spa(struct ocxl_link *link)
{
	struct spa *spa = link->spa;

	pr_debug("Freeing SPA for %x:%x:%x\n", link->domain, link->bus,
		 link->dev);

	if (spa && spa->spa_mem) {
		free_pages((unsigned long) spa->spa_mem, spa->spa_order);
		kfree(spa);
		link->spa = NULL;
	}
}

static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_link)
{
	struct ocxl_link *link;
	int rc;

	link = kzalloc(sizeof(struct ocxl_link), GFP_KERNEL);
	if (!link)
		return -ENOMEM;

	kref_init(&link->ref);
	link->domain = pci_domain_nr(dev->bus);
	link->bus = dev->bus->number;
	link->dev = PCI_SLOT(dev->devfn);
	atomic_set(&link->irq_available, MAX_IRQ_PER_LINK);
	spin_lock_init(&link->atsd_lock);

	rc = alloc_spa(dev, link);
	if (rc)
		goto err_free;

	rc = setup_xsl_irq(dev, link);
	if (rc)
		goto err_spa;

	/* platform specific hook */
	rc = pnv_ocxl_spa_setup(dev, link->spa->spa_mem, PE_mask,
				&link->platform_data);
	if (rc)
		goto err_xsl_irq;

	/* if link->arva is not defined, MMIO registers are not used to
	 * generate TLB invalidate. PowerBus snooping is enabled.
	 * Otherwise, PowerBus snooping is disabled. TLB Invalidates are
	 * initiated using MMIO registers.
	 */
	pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0, &link->arva);

	*out_link = link;
	return 0;

err_xsl_irq:
	release_xsl_irq(link);
err_spa:
	free_spa(link);
err_free:
	kfree(link);
	return rc;
}

static void free_link(struct ocxl_link *link)
{
	release_xsl_irq(link);
	free_spa(link);
	kfree(link);
}

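/*
 * Get a reference on the link used by a PCI device, creating the link
 * (SPA allocation, XSL interrupt setup, platform hooks) on first use.
 * All functions of a device share the same link, keyed by PCI
 * domain/bus/slot.
 */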
int ocxl_link_setup(struct pci_dev *dev, int PE_mask, void **link_handle)
{
	int rc = 0;
	struct ocxl_link *link;

	mutex_lock(&links_list_lock);
	list_for_each_entry(link, &links_list, list) {
		/* The functions of a device all share the same link */
		if (link->domain == pci_domain_nr(dev->bus) &&
		    link->bus == dev->bus->number &&
		    link->dev == PCI_SLOT(dev->devfn)) {
			kref_get(&link->ref);
			*link_handle = link;
			goto unlock;
		}
	}
	rc = alloc_link(dev, PE_mask, &link);
	if (rc)
		goto unlock;

	list_add(&link->list, &links_list);
	*link_handle = link;
unlock:
	mutex_unlock(&links_list_lock);
	return rc;
}
EXPORT_SYMBOL_GPL(ocxl_link_setup);
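
/*
 * Illustrative life cycle for a caller of this API (a sketch, not
 * lifted from an in-tree driver): acquire a link handle with
 * ocxl_link_setup() at probe time, attach a context with
 * ocxl_link_add_pe() when a PASID is assigned, optionally allocate AFU
 * interrupts with ocxl_link_irq_alloc(), then tear everything down
 * with ocxl_link_remove_pe(), ocxl_link_free_irq() and finally
 * ocxl_link_release().
 */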

static void release_xsl(struct kref *ref)
{
	struct ocxl_link *link = container_of(ref, struct ocxl_link, ref);

	if (link->arva) {
		pnv_ocxl_unmap_lpar(link->arva);
		link->arva = NULL;
	}

	list_del(&link->list);
	/* call platform code before releasing data */
	pnv_ocxl_spa_release(link->platform_data);
	free_link(link);
}

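/*
 * Drop the reference taken by ocxl_link_setup(). When the last user
 * goes away, release_xsl() unmaps the ATSD registers, calls the
 * platform release hook and frees the link.
 */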
void ocxl_link_release(struct pci_dev *dev, void *link_handle)
{
	struct ocxl_link *link = link_handle;

	mutex_lock(&links_list_lock);
	kref_put(&link->ref, release_xsl);
	mutex_unlock(&links_list_lock);
}
EXPORT_SYMBOL_GPL(ocxl_link_release);

static void arch_invalidate_secondary_tlbs(struct mmu_notifier *mn,
					   struct mm_struct *mm,
					   unsigned long start, unsigned long end)
{
	struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier);
	struct ocxl_link *link = pe_data->link;
	unsigned long addr, pid, page_size = PAGE_SIZE;

	pid = mm->context.id;
	trace_ocxl_mmu_notifier_range(start, end, pid);

	spin_lock(&link->atsd_lock);
	for (addr = start; addr < end; addr += page_size)
		pnv_ocxl_tlb_invalidate(link->arva, pid, addr, page_size);
	spin_unlock(&link->atsd_lock);
}

static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = {
	.arch_invalidate_secondary_tlbs = arch_invalidate_secondary_tlbs,
};

static u64 calculate_cfg_state(bool kernel)
{
	u64 state;

	state = SPA_CFG_DR;
	if (mfspr(SPRN_LPCR) & LPCR_TC)
		state |= SPA_CFG_TC;
	if (radix_enabled())
		state |= SPA_CFG_XLAT_ror;
	else
		state |= SPA_CFG_XLAT_hpt;
	state |= SPA_CFG_HV;
	if (kernel) {
		if (mfmsr() & MSR_SF)
			state |= SPA_CFG_SF;
	} else {
		state |= SPA_CFG_PR;
		if (!test_tsk_thread_flag(current, TIF_32BIT))
			state |= SPA_CFG_SF;
	}
	return state;
}

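/*
 * Add a process element for @pasid to the SPA: fill in the translation
 * context (pidr, tidr, amr, bdf), register a copro and an MMU notifier
 * for user contexts, and publish the pe_data in the radix tree so the
 * fault handler can find it. xsl_err_cb, if provided, is invoked on
 * unresolved translation faults.
 */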
int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
		     u64 amr, u16 bdf, struct mm_struct *mm,
		     void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr),
		     void *xsl_err_data)
{
	struct ocxl_link *link = link_handle;
	struct spa *spa = link->spa;
	struct ocxl_process_element *pe;
	int pe_handle, rc = 0;
	struct pe_data *pe_data;

	BUILD_BUG_ON(sizeof(struct ocxl_process_element) != 128);
	if (pasid > SPA_PASID_MAX)
		return -EINVAL;

	mutex_lock(&spa->spa_lock);
	pe_handle = pasid & SPA_PE_MASK;
	pe = spa->spa_mem + pe_handle;

	if (pe->software_state) {
		rc = -EBUSY;
		goto unlock;
	}

	pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL);
	if (!pe_data) {
		rc = -ENOMEM;
		goto unlock;
	}

	pe_data->mm = mm;
	pe_data->xsl_err_cb = xsl_err_cb;
	pe_data->xsl_err_data = xsl_err_data;
	pe_data->link = link;
	pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops;

	memset(pe, 0, sizeof(struct ocxl_process_element));
	pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
	pe->pasid = cpu_to_be32(pasid << (31 - 19));
	pe->bdf = cpu_to_be16(bdf);
	pe->lpid = cpu_to_be32(mfspr(SPRN_LPID));
	pe->pid = cpu_to_be32(pidr);
	pe->tid = cpu_to_be32(tidr);
	pe->amr = cpu_to_be64(amr);
	pe->software_state = cpu_to_be32(SPA_PE_VALID);

	/*
	 * For user contexts, register a copro so that TLBIs are seen
	 * by the nest MMU. If we have a kernel context, TLBIs are
	 * already global.
	 */
	if (mm) {
		mm_context_add_copro(mm);
		if (link->arva) {
			/* Use MMIO registers for the TLB Invalidate
			 * operations.
			 */
			trace_ocxl_init_mmu_notifier(pasid, mm->context.id);
			mmu_notifier_register(&pe_data->mmu_notifier, mm);
		}
	}

	/*
	 * Barrier is to make sure PE is visible in the SPA before it
	 * is used by the device. It also helps with the global TLBI
	 * invalidation
	 */
	mb();
	radix_tree_insert(&spa->pe_tree, pe_handle, pe_data);

	/*
	 * The mm must stay valid for as long as the device uses it. We
	 * lower the count when the context is removed from the SPA.
	 *
	 * We grab mm_count (and not mm_users), as we don't want to
	 * end up in a circular dependency if a process mmaps its
	 * mmio, therefore incrementing the file ref count when
	 * calling mmap(), and forgets to unmap before exiting. In
	 * that scenario, when the kernel handles the death of the
	 * process, the file is not cleaned because unmap was not
	 * called, and the mm wouldn't be freed because we would still
	 * have a reference on mm_users. Incrementing mm_count solves
	 * the problem.
	 */
	if (mm)
		mmgrab(mm);
	trace_ocxl_context_add(current->pid, spa->spa_mem, pasid, pidr, tidr);
unlock:
	mutex_unlock(&spa->spa_lock);
	return rc;
}
EXPORT_SYMBOL_GPL(ocxl_link_add_pe);

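/*
 * Update the thread ID of an existing process element, then flush the
 * NPU context cache so the stale entry cannot be reused.
 */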
int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid)
{
	struct ocxl_link *link = link_handle;
	struct spa *spa = link->spa;
	struct ocxl_process_element *pe;
	int pe_handle, rc;

	if (pasid > SPA_PASID_MAX)
		return -EINVAL;

	pe_handle = pasid & SPA_PE_MASK;
	pe = spa->spa_mem + pe_handle;

	mutex_lock(&spa->spa_lock);

	pe->tid = cpu_to_be32(tid);

	/*
	 * The barrier makes sure the PE is updated
	 * before we clear the NPU context cache below, so that the
	 * old PE cannot be reloaded erroneously.
	 */
	mb();

	/*
	 * hook to platform code
	 * On powerpc, the entry needs to be cleared from the context
	 * cache of the NPU.
	 */
	rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle);
	WARN_ON(rc);

	mutex_unlock(&spa->spa_lock);
	return rc;
}

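/*
 * Remove the process element for @pasid: clear the PE, flush it from
 * the NPU context cache, unregister the copro/MMU notifier for user
 * contexts and drop the mm_count reference taken in ocxl_link_add_pe().
 */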
int ocxl_link_remove_pe(void *link_handle, int pasid)
{
	struct ocxl_link *link = link_handle;
	struct spa *spa = link->spa;
	struct ocxl_process_element *pe;
	struct pe_data *pe_data;
	int pe_handle, rc;

	if (pasid > SPA_PASID_MAX)
		return -EINVAL;

	/*
	 * About synchronization with our memory fault handler:
	 *
	 * Before removing the PE, the driver is supposed to have
	 * notified the AFU, which should have cleaned up and made
	 * sure the PASID is no longer in use, including pending
	 * interrupts. However, there's no way to be sure...
	 *
	 * We clear the PE and remove the context from our radix
	 * tree. From that point on, any new interrupt for that
	 * context will fail silently, which is ok. As mentioned
	 * above, that's not expected, but it could happen if the
	 * driver or AFU didn't do the right thing.
	 *
	 * There could still be a bottom half running, but we don't
	 * need to wait/flush, as it is managing a reference count on
	 * the mm it reads from the radix tree.
	 */
	pe_handle = pasid & SPA_PE_MASK;
	pe = spa->spa_mem + pe_handle;

	mutex_lock(&spa->spa_lock);

	if (!(be32_to_cpu(pe->software_state) & SPA_PE_VALID)) {
		rc = -EINVAL;
		goto unlock;
	}

	trace_ocxl_context_remove(current->pid, spa->spa_mem, pasid,
				  be32_to_cpu(pe->pid), be32_to_cpu(pe->tid));

	memset(pe, 0, sizeof(struct ocxl_process_element));
	/*
	 * The barrier makes sure the PE is removed from the SPA
	 * before we clear the NPU context cache below, so that the
	 * old PE cannot be reloaded erroneously.
	 */
	mb();

	/*
	 * hook to platform code
	 * On powerpc, the entry needs to be cleared from the context
	 * cache of the NPU.
	 */
	rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle);
	WARN_ON(rc);

	pe_data = radix_tree_delete(&spa->pe_tree, pe_handle);
	if (!pe_data) {
		WARN(1, "Couldn't find pe data when removing PE\n");
	} else {
		if (pe_data->mm) {
			if (link->arva) {
				trace_ocxl_release_mmu_notifier(pasid,
								pe_data->mm->context.id);
				mmu_notifier_unregister(&pe_data->mmu_notifier,
							pe_data->mm);
				spin_lock(&link->atsd_lock);
				pnv_ocxl_tlb_invalidate(link->arva,
							pe_data->mm->context.id,
							0ull,
							PAGE_SIZE);
				spin_unlock(&link->atsd_lock);
			}
			mm_context_remove_copro(pe_data->mm);
			mmdrop(pe_data->mm);
		}
		kfree_rcu(pe_data, rcu);
	}
unlock:
	mutex_unlock(&spa->spa_lock);
	return rc;
}
EXPORT_SYMBOL_GPL(ocxl_link_remove_pe);

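/*
 * Allocate a hardware (XIVE) interrupt for an AFU, within the per-link
 * budget of MAX_IRQ_PER_LINK.
 */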
int ocxl_link_irq_alloc(void *link_handle, int *hw_irq)
{
	struct ocxl_link *link = link_handle;
	int irq;

	if (atomic_dec_if_positive(&link->irq_available) < 0)
		return -ENOSPC;

	irq = xive_native_alloc_irq();
	if (!irq) {
		atomic_inc(&link->irq_available);
		return -ENXIO;
	}

	*hw_irq = irq;
	return 0;
}
EXPORT_SYMBOL_GPL(ocxl_link_irq_alloc);

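/* Return a hardware interrupt allocated with ocxl_link_irq_alloc(). */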
void ocxl_link_free_irq(void *link_handle, int hw_irq)
{
	struct ocxl_link *link = link_handle;

	xive_native_free_irq(hw_irq);
	atomic_inc(&link->irq_available);
}
EXPORT_SYMBOL_GPL(ocxl_link_free_irq);