1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation |
4 | * |
5 | * Rewrite, cleanup, new allocation schemes, virtual merging: |
6 | * Copyright (C) 2004 Olof Johansson, IBM Corporation |
7 | * and Ben. Herrenschmidt, IBM Corporation |
8 | * |
9 | * Dynamic DMA mapping support, bus-independent parts. |
10 | */ |
11 | |
12 | |
13 | #include <linux/init.h> |
14 | #include <linux/types.h> |
15 | #include <linux/slab.h> |
16 | #include <linux/mm.h> |
17 | #include <linux/spinlock.h> |
18 | #include <linux/string.h> |
19 | #include <linux/dma-mapping.h> |
20 | #include <linux/bitmap.h> |
21 | #include <linux/iommu-helper.h> |
22 | #include <linux/crash_dump.h> |
23 | #include <linux/hash.h> |
24 | #include <linux/fault-inject.h> |
25 | #include <linux/pci.h> |
26 | #include <linux/iommu.h> |
27 | #include <linux/sched.h> |
28 | #include <linux/debugfs.h> |
29 | #include <asm/io.h> |
30 | #include <asm/iommu.h> |
31 | #include <asm/pci-bridge.h> |
32 | #include <asm/machdep.h> |
33 | #include <asm/kdump.h> |
34 | #include <asm/fadump.h> |
35 | #include <asm/vio.h> |
36 | #include <asm/tce.h> |
37 | #include <asm/mmu_context.h> |
38 | #include <asm/ppc-pci.h> |
39 | |
40 | #define DBG(...) |
41 | |
42 | #ifdef CONFIG_IOMMU_DEBUGFS |
43 | static int iommu_debugfs_weight_get(void *data, u64 *val) |
44 | { |
45 | struct iommu_table *tbl = data; |
46 | *val = bitmap_weight(tbl->it_map, tbl->it_size); |
47 | return 0; |
48 | } |
49 | DEFINE_DEBUGFS_ATTRIBUTE(iommu_debugfs_fops_weight, iommu_debugfs_weight_get, NULL, "%llu\n"); |
50 | |
51 | static void iommu_debugfs_add(struct iommu_table *tbl) |
52 | { |
53 | char name[10]; |
54 | struct dentry *liobn_entry; |
55 | |
56 | sprintf(name, "%08lx", tbl->it_index); |
57 | liobn_entry = debugfs_create_dir(name, iommu_debugfs_dir); |
58 | |
59 | debugfs_create_file_unsafe("weight", 0400, liobn_entry, tbl, &iommu_debugfs_fops_weight); |
60 | debugfs_create_ulong("it_size", 0400, liobn_entry, &tbl->it_size); |
61 | debugfs_create_ulong("it_page_shift", 0400, liobn_entry, &tbl->it_page_shift); |
62 | debugfs_create_ulong("it_reserved_start", 0400, liobn_entry, &tbl->it_reserved_start); |
63 | debugfs_create_ulong("it_reserved_end", 0400, liobn_entry, &tbl->it_reserved_end); |
64 | debugfs_create_ulong("it_indirect_levels", 0400, liobn_entry, &tbl->it_indirect_levels); |
65 | debugfs_create_ulong("it_level_size", 0400, liobn_entry, &tbl->it_level_size); |
66 | } |
67 | |
68 | static void iommu_debugfs_del(struct iommu_table *tbl) |
69 | { |
70 | char name[10]; |
71 | |
72 | sprintf(name, "%08lx", tbl->it_index); |
73 | debugfs_lookup_and_remove(name, iommu_debugfs_dir); |
74 | } |
75 | #else |
76 | static void iommu_debugfs_add(struct iommu_table *tbl){} |
77 | static void iommu_debugfs_del(struct iommu_table *tbl){} |
78 | #endif |
79 | |
80 | static int novmerge; |
81 | |
82 | static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int); |
83 | |
84 | static int __init setup_iommu(char *str) |
85 | { |
86 | if (!strcmp(str, "novmerge" )) |
87 | novmerge = 1; |
88 | else if (!strcmp(str, "vmerge" )) |
89 | novmerge = 0; |
90 | return 1; |
91 | } |
92 | |
93 | __setup("iommu=" , setup_iommu); |
94 | |
95 | static DEFINE_PER_CPU(unsigned int, iommu_pool_hash); |
96 | |
97 | /* |
98 | * We precalculate the hash to avoid doing it on every allocation. |
99 | * |
100 | * The hash is important to spread CPUs across all the pools. For example, |
101 | * on a POWER7 with 4-way SMT we want interrupts on the primary threads, and |
102 | * with 4 pools all primary threads would otherwise map to the same pool. |
103 | */ |
104 | static int __init setup_iommu_pool_hash(void) |
105 | { |
106 | unsigned int i; |
107 | |
108 | for_each_possible_cpu(i) |
109 | per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS); |
110 | |
111 | return 0; |
112 | } |
113 | subsys_initcall(setup_iommu_pool_hash); |
114 | |
115 | #ifdef CONFIG_FAIL_IOMMU |
116 | |
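    | /* |
    |  * Optional fault injection: with CONFIG_FAIL_IOMMU enabled, IOMMU range |
    |  * allocations can be forced to fail for devices that opted in through the |
    |  * per-device "fail_iommu" sysfs attribute, driven by the generic |
    |  * fault-injection framework (the "fail_iommu=" boot parameter and the |
    |  * debugfs entries created below). |
    |  */ |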
117 | static DECLARE_FAULT_ATTR(fail_iommu); |
118 | |
119 | static int __init setup_fail_iommu(char *str) |
120 | { |
121 | return setup_fault_attr(&fail_iommu, str); |
122 | } |
123 | __setup("fail_iommu=" , setup_fail_iommu); |
124 | |
125 | static bool should_fail_iommu(struct device *dev) |
126 | { |
127 | return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1); |
128 | } |
129 | |
130 | static int __init fail_iommu_debugfs(void) |
131 | { |
132 | struct dentry *dir = fault_create_debugfs_attr("fail_iommu", |
133 | NULL, &fail_iommu); |
134 | |
135 | return PTR_ERR_OR_ZERO(dir); |
136 | } |
137 | late_initcall(fail_iommu_debugfs); |
138 | |
139 | static ssize_t fail_iommu_show(struct device *dev, |
140 | struct device_attribute *attr, char *buf) |
141 | { |
142 | return sprintf(buf, "%d\n" , dev->archdata.fail_iommu); |
143 | } |
144 | |
145 | static ssize_t fail_iommu_store(struct device *dev, |
146 | struct device_attribute *attr, const char *buf, |
147 | size_t count) |
148 | { |
149 | int i; |
150 | |
151 | if (count > 0 && sscanf(buf, "%d", &i) > 0) |
152 | dev->archdata.fail_iommu = (i == 0) ? 0 : 1; |
153 | |
154 | return count; |
155 | } |
156 | |
157 | static DEVICE_ATTR_RW(fail_iommu); |
158 | |
159 | static int fail_iommu_bus_notify(struct notifier_block *nb, |
160 | unsigned long action, void *data) |
161 | { |
162 | struct device *dev = data; |
163 | |
164 | if (action == BUS_NOTIFY_ADD_DEVICE) { |
165 | if (device_create_file(dev, &dev_attr_fail_iommu)) |
166 | pr_warn("Unable to create IOMMU fault injection sysfs " |
167 | "entries\n" ); |
168 | } else if (action == BUS_NOTIFY_DEL_DEVICE) { |
169 | device_remove_file(dev, &dev_attr_fail_iommu); |
170 | } |
171 | |
172 | return 0; |
173 | } |
174 | |
175 | /* |
176 | * PCI and VIO buses need separate notifier_block structs, since they're linked |
177 | * list nodes. Sharing a notifier_block would mean that any notifiers later |
178 | * registered for PCI buses would also get called by VIO buses and vice versa. |
179 | */ |
180 | static struct notifier_block fail_iommu_pci_bus_notifier = { |
181 | .notifier_call = fail_iommu_bus_notify |
182 | }; |
183 | |
184 | #ifdef CONFIG_IBMVIO |
185 | static struct notifier_block fail_iommu_vio_bus_notifier = { |
186 | .notifier_call = fail_iommu_bus_notify |
187 | }; |
188 | #endif |
189 | |
190 | static int __init fail_iommu_setup(void) |
191 | { |
192 | #ifdef CONFIG_PCI |
193 | bus_register_notifier(&pci_bus_type, &fail_iommu_pci_bus_notifier); |
194 | #endif |
195 | #ifdef CONFIG_IBMVIO |
196 | bus_register_notifier(&vio_bus_type, &fail_iommu_vio_bus_notifier); |
197 | #endif |
198 | |
199 | return 0; |
200 | } |
201 | /* |
202 | * Must execute after the PCI and VIO subsystems have initialised but before |
203 | * devices are probed. |
204 | */ |
205 | arch_initcall(fail_iommu_setup); |
206 | #else |
207 | static inline bool should_fail_iommu(struct device *dev) |
208 | { |
209 | return false; |
210 | } |
211 | #endif |
212 | |
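    | /* |
    |  * Allocate a run of @npages contiguous entries from the table's bitmap. |
    |  * A per-CPU hash picks the starting pool for small allocations, while |
    |  * allocations of more than 15 pages go straight to the large pool; on |
    |  * failure the remaining pools are scanned before giving up. Returns the |
    |  * entry index relative to the start of the table (the caller adds |
    |  * it_offset), or DMA_MAPPING_ERROR. |
    |  */ |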
213 | static unsigned long iommu_range_alloc(struct device *dev, |
214 | struct iommu_table *tbl, |
215 | unsigned long npages, |
216 | unsigned long *handle, |
217 | unsigned long mask, |
218 | unsigned int align_order) |
219 | { |
220 | unsigned long n, end, start; |
221 | unsigned long limit; |
222 | int largealloc = npages > 15; |
223 | int pass = 0; |
224 | unsigned long align_mask; |
225 | unsigned long flags; |
226 | unsigned int pool_nr; |
227 | struct iommu_pool *pool; |
228 | |
229 | align_mask = (1ull << align_order) - 1; |
230 | |
231 | /* This allocator was derived from x86_64's bit string search */ |
232 | |
233 | /* Sanity check */ |
234 | if (unlikely(npages == 0)) { |
235 | if (printk_ratelimit()) |
236 | WARN_ON(1); |
237 | return DMA_MAPPING_ERROR; |
238 | } |
239 | |
240 | if (should_fail_iommu(dev)) |
241 | return DMA_MAPPING_ERROR; |
242 | |
243 | /* |
244 | * We don't need to disable preemption here because any CPU can |
245 | * safely use any IOMMU pool. |
246 | */ |
247 | pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1); |
248 | |
249 | if (largealloc) |
250 | pool = &(tbl->large_pool); |
251 | else |
252 | pool = &(tbl->pools[pool_nr]); |
253 | |
254 | spin_lock_irqsave(&(pool->lock), flags); |
255 | |
256 | again: |
257 | if ((pass == 0) && handle && *handle && |
258 | (*handle >= pool->start) && (*handle < pool->end)) |
259 | start = *handle; |
260 | else |
261 | start = pool->hint; |
262 | |
263 | limit = pool->end; |
264 | |
265 | /* The case below can happen if we have a small segment appended |
266 | * to a large one, or when the previous alloc was at the very end of |
267 | * the available space. If so, go back to the initial start. |
268 | */ |
269 | if (start >= limit) |
270 | start = pool->start; |
271 | |
272 | if (limit + tbl->it_offset > mask) { |
273 | limit = mask - tbl->it_offset + 1; |
274 | /* If we're constrained on address range, first try |
275 | * at the masked hint to avoid O(n) search complexity, |
276 | * but on second pass, start at 0 in pool 0. |
277 | */ |
278 | if ((start & mask) >= limit || pass > 0) { |
279 | spin_unlock(&(pool->lock)); |
280 | pool = &(tbl->pools[0]); |
281 | spin_lock(&(pool->lock)); |
282 | start = pool->start; |
283 | } else { |
284 | start &= mask; |
285 | } |
286 | } |
287 | |
288 | n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset, |
289 | dma_get_seg_boundary_nr_pages(dev, tbl->it_page_shift), |
290 | align_mask); |
291 | if (n == -1) { |
292 | if (likely(pass == 0)) { |
293 | /* First try the pool from the start */ |
294 | pool->hint = pool->start; |
295 | pass++; |
296 | goto again; |
297 | |
298 | } else if (pass <= tbl->nr_pools) { |
299 | /* Now try scanning all the other pools */ |
300 | spin_unlock(&(pool->lock)); |
301 | pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1); |
302 | pool = &tbl->pools[pool_nr]; |
303 | spin_lock(&(pool->lock)); |
304 | pool->hint = pool->start; |
305 | pass++; |
306 | goto again; |
307 | |
308 | } else if (pass == tbl->nr_pools + 1) { |
309 | /* Last resort: try largepool */ |
310 | spin_unlock(&pool->lock); |
311 | pool = &tbl->large_pool; |
312 | spin_lock(&pool->lock); |
313 | pool->hint = pool->start; |
314 | pass++; |
315 | goto again; |
316 | |
317 | } else { |
318 | /* Give up */ |
319 | spin_unlock_irqrestore(&(pool->lock), flags); |
320 | return DMA_MAPPING_ERROR; |
321 | } |
322 | } |
323 | |
324 | end = n + npages; |
325 | |
326 | /* Bump the hint to a new block for small allocs. */ |
327 | if (largealloc) { |
328 | /* Don't bump to new block to avoid fragmentation */ |
329 | pool->hint = end; |
330 | } else { |
331 | /* Overflow will be taken care of at the next allocation */ |
332 | pool->hint = (end + tbl->it_blocksize - 1) & |
333 | ~(tbl->it_blocksize - 1); |
334 | } |
335 | |
336 | /* Update handle for SG allocations */ |
337 | if (handle) |
338 | *handle = end; |
339 | |
340 | spin_unlock_irqrestore(&(pool->lock), flags); |
341 | |
342 | return n; |
343 | } |
344 | |
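    | /* |
    |  * Reserve @npages IOMMU pages for the kernel buffer at @page (passed as a |
    |  * kernel virtual address despite the void * type), program the TCEs via |
    |  * it_ops->set() and return the resulting DMA address, or DMA_MAPPING_ERROR |
    |  * if either the range allocation or the TCE update fails. |
    |  */ |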
345 | static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, |
346 | void *page, unsigned int npages, |
347 | enum dma_data_direction direction, |
348 | unsigned long mask, unsigned int align_order, |
349 | unsigned long attrs) |
350 | { |
351 | unsigned long entry; |
352 | dma_addr_t ret = DMA_MAPPING_ERROR; |
353 | int build_fail; |
354 | |
355 | entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order); |
356 | |
357 | if (unlikely(entry == DMA_MAPPING_ERROR)) |
358 | return DMA_MAPPING_ERROR; |
359 | |
360 | entry += tbl->it_offset; /* Offset into real TCE table */ |
361 | ret = entry << tbl->it_page_shift; /* Set the return dma address */ |
362 | |
363 | /* Put the TCEs in the HW table */ |
364 | build_fail = tbl->it_ops->set(tbl, entry, npages, |
365 | (unsigned long)page & |
366 | IOMMU_PAGE_MASK(tbl), direction, attrs); |
367 | |
368 | /* tbl->it_ops->set() only returns non-zero for transient errors. |
369 | * Clean up the table bitmap in this case and return |
370 | * DMA_MAPPING_ERROR. For all other errors the functionality is |
371 | * not altered. |
372 | */ |
373 | if (unlikely(build_fail)) { |
374 | __iommu_free(tbl, ret, npages); |
375 | return DMA_MAPPING_ERROR; |
376 | } |
377 | |
378 | /* Flush/invalidate TLB caches if necessary */ |
379 | if (tbl->it_ops->flush) |
380 | tbl->it_ops->flush(tbl); |
381 | |
382 | /* Make sure updates are seen by hardware */ |
383 | mb(); |
384 | |
385 | return ret; |
386 | } |
387 | |
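    | /* |
    |  * Validate that the DMA address/length pair being freed lies inside the |
    |  * table; on corruption, log the details (ratelimited) and return false so |
    |  * the caller leaves the bitmap untouched. |
    |  */ |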
388 | static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr, |
389 | unsigned int npages) |
390 | { |
391 | unsigned long entry, free_entry; |
392 | |
393 | entry = dma_addr >> tbl->it_page_shift; |
394 | free_entry = entry - tbl->it_offset; |
395 | |
396 | if (((free_entry + npages) > tbl->it_size) || |
397 | (entry < tbl->it_offset)) { |
398 | if (printk_ratelimit()) { |
399 | printk(KERN_INFO "iommu_free: invalid entry\n" ); |
400 | printk(KERN_INFO "\tentry = 0x%lx\n" , entry); |
401 | printk(KERN_INFO "\tdma_addr = 0x%llx\n" , (u64)dma_addr); |
402 | printk(KERN_INFO "\tTable = 0x%llx\n" , (u64)tbl); |
403 | printk(KERN_INFO "\tbus# = 0x%llx\n" , (u64)tbl->it_busno); |
404 | printk(KERN_INFO "\tsize = 0x%llx\n" , (u64)tbl->it_size); |
405 | printk(KERN_INFO "\tstartOff = 0x%llx\n" , (u64)tbl->it_offset); |
406 | printk(KERN_INFO "\tindex = 0x%llx\n" , (u64)tbl->it_index); |
407 | WARN_ON(1); |
408 | } |
409 | |
410 | return false; |
411 | } |
412 | |
413 | return true; |
414 | } |
415 | |
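    | /* |
    |  * Map a table-relative entry index back to the pool it was allocated from: |
    |  * one of the small pools, or the large pool at the top of the table. |
    |  */ |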
416 | static struct iommu_pool *get_pool(struct iommu_table *tbl, |
417 | unsigned long entry) |
418 | { |
419 | struct iommu_pool *p; |
420 | unsigned long largepool_start = tbl->large_pool.start; |
421 | |
422 | /* The large pool is the last pool at the top of the table */ |
423 | if (entry >= largepool_start) { |
424 | p = &tbl->large_pool; |
425 | } else { |
426 | unsigned int pool_nr = entry / tbl->poolsize; |
427 | |
428 | BUG_ON(pool_nr > tbl->nr_pools); |
429 | p = &tbl->pools[pool_nr]; |
430 | } |
431 | |
432 | return p; |
433 | } |
434 | |
435 | static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, |
436 | unsigned int npages) |
437 | { |
438 | unsigned long entry, free_entry; |
439 | unsigned long flags; |
440 | struct iommu_pool *pool; |
441 | |
442 | entry = dma_addr >> tbl->it_page_shift; |
443 | free_entry = entry - tbl->it_offset; |
444 | |
445 | pool = get_pool(tbl, free_entry); |
446 | |
447 | if (!iommu_free_check(tbl, dma_addr, npages)) |
448 | return; |
449 | |
450 | tbl->it_ops->clear(tbl, entry, npages); |
451 | |
452 | spin_lock_irqsave(&(pool->lock), flags); |
453 | bitmap_clear(tbl->it_map, free_entry, npages); |
454 | spin_unlock_irqrestore(&(pool->lock), flags); |
455 | } |
456 | |
457 | static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, |
458 | unsigned int npages) |
459 | { |
460 | __iommu_free(tbl, dma_addr, npages); |
461 | |
462 | /* Make sure TLB cache is flushed if the HW needs it. We do |
463 | * not do an mb() here on purpose, it is not needed on any of |
464 | * the current platforms. |
465 | */ |
466 | if (tbl->it_ops->flush) |
467 | tbl->it_ops->flush(tbl); |
468 | } |
469 | |
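    | /* |
    |  * Map a scatterlist through the IOMMU. Adjacent entries whose DMA ranges |
    |  * turn out to be contiguous are merged into a single segment (up to the |
    |  * device's maximum segment size) unless booted with "iommu=novmerge". |
    |  * Returns the number of DMA segments produced, or a negative errno; on |
    |  * failure any already-programmed TCEs are torn down. |
    |  */ |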
470 | int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, |
471 | struct scatterlist *sglist, int nelems, |
472 | unsigned long mask, enum dma_data_direction direction, |
473 | unsigned long attrs) |
474 | { |
475 | dma_addr_t dma_next = 0, dma_addr; |
476 | struct scatterlist *s, *outs, *segstart; |
477 | int outcount, incount, i, build_fail = 0; |
478 | unsigned int align; |
479 | unsigned long handle; |
480 | unsigned int max_seg_size; |
481 | |
482 | BUG_ON(direction == DMA_NONE); |
483 | |
484 | if ((nelems == 0) || !tbl) |
485 | return -EINVAL; |
486 | |
487 | outs = s = segstart = &sglist[0]; |
488 | outcount = 1; |
489 | incount = nelems; |
490 | handle = 0; |
491 | |
492 | /* Init first segment length for backout at failure */ |
493 | outs->dma_length = 0; |
494 | |
495 | DBG("sg mapping %d elements:\n" , nelems); |
496 | |
497 | max_seg_size = dma_get_max_seg_size(dev); |
498 | for_each_sg(sglist, s, nelems, i) { |
499 | unsigned long vaddr, npages, entry, slen; |
500 | |
501 | slen = s->length; |
502 | /* Sanity check */ |
503 | if (slen == 0) { |
504 | dma_next = 0; |
505 | continue; |
506 | } |
507 | /* Allocate iommu entries for that segment */ |
508 | vaddr = (unsigned long) sg_virt(s); |
509 | npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl)); |
510 | align = 0; |
511 | if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE && |
512 | (vaddr & ~PAGE_MASK) == 0) |
513 | align = PAGE_SHIFT - tbl->it_page_shift; |
514 | entry = iommu_range_alloc(dev, tbl, npages, &handle, |
515 | mask >> tbl->it_page_shift, align); |
516 | |
517 | DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen); |
518 | |
519 | /* Handle failure */ |
520 | if (unlikely(entry == DMA_MAPPING_ERROR)) { |
521 | if (!(attrs & DMA_ATTR_NO_WARN) && |
522 | printk_ratelimit()) |
523 | dev_info(dev, "iommu_alloc failed, tbl %p " |
524 | "vaddr %lx npages %lu\n" , tbl, vaddr, |
525 | npages); |
526 | goto failure; |
527 | } |
528 | |
529 | /* Convert entry to a dma_addr_t */ |
530 | entry += tbl->it_offset; |
531 | dma_addr = entry << tbl->it_page_shift; |
532 | dma_addr |= (vaddr & ~IOMMU_PAGE_MASK(tbl)); |
533 | |
534 | DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n" , |
535 | npages, entry, dma_addr); |
536 | |
537 | /* Insert into HW table */ |
538 | build_fail = tbl->it_ops->set(tbl, entry, npages, |
539 | vaddr & IOMMU_PAGE_MASK(tbl), |
540 | direction, attrs); |
541 | if (unlikely(build_fail)) |
542 | goto failure; |
543 | |
544 | /* If we are in an open segment, try merging */ |
545 | if (segstart != s) { |
546 | DBG(" - trying merge...\n" ); |
547 | /* We cannot merge if: |
548 | * - allocated dma_addr isn't contiguous to previous allocation |
549 | */ |
550 | if (novmerge || (dma_addr != dma_next) || |
551 | (outs->dma_length + s->length > max_seg_size)) { |
552 | /* Can't merge: create a new segment */ |
553 | segstart = s; |
554 | outcount++; |
555 | outs = sg_next(outs); |
556 | DBG(" can't merge, new segment.\n" ); |
557 | } else { |
558 | outs->dma_length += s->length; |
559 | DBG(" merged, new len: %ux\n" , outs->dma_length); |
560 | } |
561 | } |
562 | |
563 | if (segstart == s) { |
564 | /* This is a new segment, fill entries */ |
565 | DBG(" - filling new segment.\n" ); |
566 | outs->dma_address = dma_addr; |
567 | outs->dma_length = slen; |
568 | } |
569 | |
570 | /* Calculate next page pointer for contiguous check */ |
571 | dma_next = dma_addr + slen; |
572 | |
573 | DBG(" - dma next is: %lx\n" , dma_next); |
574 | } |
575 | |
576 | /* Flush/invalidate TLB caches if necessary */ |
577 | if (tbl->it_ops->flush) |
578 | tbl->it_ops->flush(tbl); |
579 | |
580 | DBG("mapped %d elements:\n" , outcount); |
581 | |
582 | /* For the sake of ppc_iommu_unmap_sg, we clear out the length in the |
583 | * next entry of the sglist if we didn't fill the list completely |
584 | */ |
585 | if (outcount < incount) { |
586 | outs = sg_next(outs); |
587 | outs->dma_length = 0; |
588 | } |
589 | |
590 | /* Make sure updates are seen by hardware */ |
591 | mb(); |
592 | |
593 | return outcount; |
594 | |
595 | failure: |
596 | for_each_sg(sglist, s, nelems, i) { |
597 | if (s->dma_length != 0) { |
598 | unsigned long vaddr, npages; |
599 | |
600 | vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl); |
601 | npages = iommu_num_pages(s->dma_address, s->dma_length, |
602 | IOMMU_PAGE_SIZE(tbl)); |
603 | __iommu_free(tbl, vaddr, npages); |
604 | s->dma_length = 0; |
605 | } |
606 | if (s == outs) |
607 | break; |
608 | } |
609 | return -EIO; |
610 | } |
611 | |
612 | |
613 | void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, |
614 | int nelems, enum dma_data_direction direction, |
615 | unsigned long attrs) |
616 | { |
617 | struct scatterlist *sg; |
618 | |
619 | BUG_ON(direction == DMA_NONE); |
620 | |
621 | if (!tbl) |
622 | return; |
623 | |
624 | sg = sglist; |
625 | while (nelems--) { |
626 | unsigned int npages; |
627 | dma_addr_t dma_handle = sg->dma_address; |
628 | |
629 | if (sg->dma_length == 0) |
630 | break; |
631 | npages = iommu_num_pages(dma_handle, sg->dma_length, |
632 | IOMMU_PAGE_SIZE(tbl)); |
633 | __iommu_free(tbl, dma_handle, npages); |
634 | sg = sg_next(sg); |
635 | } |
636 | |
637 | /* Flush/invalidate TLBs if necessary. As for iommu_free(), we |
638 | * do not do an mb() here, the affected platforms do not need it |
639 | * when freeing. |
640 | */ |
641 | if (tbl->it_ops->flush) |
642 | tbl->it_ops->flush(tbl); |
643 | } |
644 | |
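    | /* |
    |  * Bring the table's TCEs to a known state: on a normal boot simply clear |
    |  * them, but in a kdump kernel keep (and mark as in use) the mappings left |
    |  * by the first kernel, freeing a minimal window if the table is full. |
    |  */ |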
645 | static void iommu_table_clear(struct iommu_table *tbl) |
646 | { |
647 | /* |
648 | * In case of firmware assisted dump system goes through clean |
649 | * reboot process at the time of system crash. Hence it's safe to |
650 | * clear the TCE entries if firmware assisted dump is active. |
651 | */ |
652 | if (!is_kdump_kernel() || is_fadump_active()) { |
653 | /* Clear the table in case firmware left allocations in it */ |
654 | tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size); |
655 | return; |
656 | } |
657 | |
658 | #ifdef CONFIG_CRASH_DUMP |
659 | if (tbl->it_ops->get) { |
660 | unsigned long index, tceval, tcecount = 0; |
661 | |
662 | /* Reserve the existing mappings left by the first kernel. */ |
663 | for (index = 0; index < tbl->it_size; index++) { |
664 | tceval = tbl->it_ops->get(tbl, index + tbl->it_offset); |
665 | /* |
666 | * Freed TCE entry contains 0x7fffffffffffffff on JS20 |
667 | */ |
668 | if (tceval && (tceval != 0x7fffffffffffffffUL)) { |
669 | __set_bit(index, tbl->it_map); |
670 | tcecount++; |
671 | } |
672 | } |
673 | |
674 | if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) { |
675 | printk(KERN_WARNING "TCE table is full; freeing " ); |
676 | printk(KERN_WARNING "%d entries for the kdump boot\n" , |
677 | KDUMP_MIN_TCE_ENTRIES); |
678 | for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES; |
679 | index < tbl->it_size; index++) |
680 | __clear_bit(index, tbl->it_map); |
681 | } |
682 | } |
683 | #endif |
684 | } |
685 | |
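    | /* |
    |  * Mark page 0 and the requested reserved window (clamped to the table's |
    |  * offset/size) as allocated in the bitmap so they are never handed out, |
    |  * and record the final range in it_reserved_start/it_reserved_end. |
    |  */ |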
686 | static void iommu_table_reserve_pages(struct iommu_table *tbl, |
687 | unsigned long res_start, unsigned long res_end) |
688 | { |
689 | int i; |
690 | |
691 | WARN_ON_ONCE(res_end < res_start); |
692 | /* |
693 | * Reserve page 0 so it will not be used for any mappings. |
694 | * This prevents buggy drivers that consider page 0 to be invalid |
695 | * from crashing the machine or even losing data. |
696 | */ |
697 | if (tbl->it_offset == 0) |
698 | set_bit(0, tbl->it_map); |
699 | |
700 | if (res_start < tbl->it_offset) |
701 | res_start = tbl->it_offset; |
702 | |
703 | if (res_end > (tbl->it_offset + tbl->it_size)) |
704 | res_end = tbl->it_offset + tbl->it_size; |
705 | |
706 | /* Check if res_start..res_end is a valid range in the table */ |
707 | if (res_start >= res_end) { |
708 | tbl->it_reserved_start = tbl->it_offset; |
709 | tbl->it_reserved_end = tbl->it_offset; |
710 | return; |
711 | } |
712 | |
713 | tbl->it_reserved_start = res_start; |
714 | tbl->it_reserved_end = res_end; |
715 | |
716 | for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) |
717 | set_bit(i - tbl->it_offset, tbl->it_map); |
718 | } |
719 | |
720 | /* |
721 | * Build an iommu_table structure. This contains a bitmap which |
722 | * is used to manage allocation of the TCE space. |
723 | */ |
724 | struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, |
725 | unsigned long res_start, unsigned long res_end) |
726 | { |
727 | unsigned long sz; |
728 | static int welcomed = 0; |
729 | unsigned int i; |
730 | struct iommu_pool *p; |
731 | |
732 | BUG_ON(!tbl->it_ops); |
733 | |
734 | /* number of bytes needed for the bitmap */ |
735 | sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); |
736 | |
737 | tbl->it_map = vzalloc_node(sz, nid); |
738 | if (!tbl->it_map) { |
739 | pr_err("%s: Can't allocate %ld bytes\n", __func__, sz); |
740 | return NULL; |
741 | } |
742 | |
743 | iommu_table_reserve_pages(tbl, res_start, res_end); |
744 | |
745 | /* We only split the IOMMU table if we have 1GB or more of space */ |
746 | if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024)) |
747 | tbl->nr_pools = IOMMU_NR_POOLS; |
748 | else |
749 | tbl->nr_pools = 1; |
750 | |
751 | /* We reserve the top 1/4 of the table for large allocations */ |
752 | tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools; |
753 | |
754 | for (i = 0; i < tbl->nr_pools; i++) { |
755 | p = &tbl->pools[i]; |
756 | spin_lock_init(&(p->lock)); |
757 | p->start = tbl->poolsize * i; |
758 | p->hint = p->start; |
759 | p->end = p->start + tbl->poolsize; |
760 | } |
761 | |
762 | p = &tbl->large_pool; |
763 | spin_lock_init(&(p->lock)); |
764 | p->start = tbl->poolsize * i; |
765 | p->hint = p->start; |
766 | p->end = tbl->it_size; |
767 | |
768 | iommu_table_clear(tbl); |
769 | |
770 | if (!welcomed) { |
771 | printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n" , |
772 | novmerge ? "disabled" : "enabled" ); |
773 | welcomed = 1; |
774 | } |
775 | |
776 | iommu_debugfs_add(tbl); |
777 | |
778 | return tbl; |
779 | } |
780 | |
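    | /* |
    |  * Return true if any TCE other than the implicitly reserved page 0 and the |
    |  * reserved MMIO32 window is currently set in the allocation bitmap. |
    |  */ |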
781 | bool iommu_table_in_use(struct iommu_table *tbl) |
782 | { |
783 | unsigned long start = 0, end; |
784 | |
785 | /* ignore reserved bit0 */ |
786 | if (tbl->it_offset == 0) |
787 | start = 1; |
788 | |
789 | /* Simple case with no reserved MMIO32 region */ |
790 | if (!tbl->it_reserved_start && !tbl->it_reserved_end) |
791 | return find_next_bit(tbl->it_map, tbl->it_size, start) != tbl->it_size; |
792 | |
793 | end = tbl->it_reserved_start - tbl->it_offset; |
794 | if (find_next_bit(tbl->it_map, end, start) != end) |
795 | return true; |
796 | |
797 | start = tbl->it_reserved_end - tbl->it_offset; |
798 | end = tbl->it_size; |
799 | return find_next_bit(tbl->it_map, end, start) != end; |
800 | } |
801 | |
802 | static void iommu_table_free(struct kref *kref) |
803 | { |
804 | struct iommu_table *tbl; |
805 | |
806 | tbl = container_of(kref, struct iommu_table, it_kref); |
807 | |
808 | if (tbl->it_ops->free) |
809 | tbl->it_ops->free(tbl); |
810 | |
811 | if (!tbl->it_map) { |
812 | kfree(tbl); |
813 | return; |
814 | } |
815 | |
816 | iommu_debugfs_del(tbl); |
817 | |
818 | /* verify that table contains no entries */ |
819 | if (iommu_table_in_use(tbl)) |
820 | pr_warn("%s: Unexpected TCEs\n" , __func__); |
821 | |
822 | /* free bitmap */ |
823 | vfree(addr: tbl->it_map); |
824 | |
825 | /* free table */ |
826 | kfree(objp: tbl); |
827 | } |
828 | |
829 | struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl) |
830 | { |
831 | if (kref_get_unless_zero(&tbl->it_kref)) |
832 | return tbl; |
833 | |
834 | return NULL; |
835 | } |
836 | EXPORT_SYMBOL_GPL(iommu_tce_table_get); |
837 | |
838 | int iommu_tce_table_put(struct iommu_table *tbl) |
839 | { |
840 | if (WARN_ON(!tbl)) |
841 | return 0; |
842 | |
843 | return kref_put(&tbl->it_kref, iommu_table_free); |
844 | } |
845 | EXPORT_SYMBOL_GPL(iommu_tce_table_put); |
846 | |
847 | /* Creates TCEs for a user provided buffer. The user buffer must be |
848 | * contiguous real kernel storage (not vmalloc). The address passed here |
849 | * comprises a page address and offset into that page. The dma_addr_t |
850 | * returned will point to the same byte within the page as was passed in. |
851 | */ |
852 | dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, |
853 | struct page *page, unsigned long offset, size_t size, |
854 | unsigned long mask, enum dma_data_direction direction, |
855 | unsigned long attrs) |
856 | { |
857 | dma_addr_t dma_handle = DMA_MAPPING_ERROR; |
858 | void *vaddr; |
859 | unsigned long uaddr; |
860 | unsigned int npages, align; |
861 | |
862 | BUG_ON(direction == DMA_NONE); |
863 | |
864 | vaddr = page_address(page) + offset; |
865 | uaddr = (unsigned long)vaddr; |
866 | |
867 | if (tbl) { |
868 | npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl)); |
869 | align = 0; |
870 | if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE && |
871 | ((unsigned long)vaddr & ~PAGE_MASK) == 0) |
872 | align = PAGE_SHIFT - tbl->it_page_shift; |
873 | |
874 | dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction, |
875 | mask >> tbl->it_page_shift, align, |
876 | attrs); |
877 | if (dma_handle == DMA_MAPPING_ERROR) { |
878 | if (!(attrs & DMA_ATTR_NO_WARN) && |
879 | printk_ratelimit()) { |
880 | dev_info(dev, "iommu_alloc failed, tbl %p " |
881 | "vaddr %p npages %d\n" , tbl, vaddr, |
882 | npages); |
883 | } |
884 | } else |
885 | dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl)); |
886 | } |
887 | |
888 | return dma_handle; |
889 | } |
890 | |
891 | void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle, |
892 | size_t size, enum dma_data_direction direction, |
893 | unsigned long attrs) |
894 | { |
895 | unsigned int npages; |
896 | |
897 | BUG_ON(direction == DMA_NONE); |
898 | |
899 | if (tbl) { |
900 | npages = iommu_num_pages(dma_handle, size, |
901 | IOMMU_PAGE_SIZE(tbl)); |
902 | iommu_free(tbl, dma_handle, npages); |
903 | } |
904 | } |
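    | /* |
    |  * Illustrative sketch only (not part of this file): callers such as the |
    |  * arch DMA operations are expected to pair the two helpers above roughly |
    |  * like this, with @tbl typically taken from the device: |
    |  * |
    |  *	dma_addr_t dma = iommu_map_page(dev, tbl, page, offset, size, |
    |  *					dma_get_mask(dev), dir, attrs); |
    |  *	if (dma == DMA_MAPPING_ERROR) |
    |  *		return DMA_MAPPING_ERROR; |
    |  *	... |
    |  *	iommu_unmap_page(tbl, dma, size, dir, attrs); |
    |  */ |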
905 | |
906 | /* Allocates a contiguous real buffer and creates mappings over it. |
907 | * Returns the virtual address of the buffer and sets dma_handle |
908 | * to the dma address (mapping) of the first page. |
909 | */ |
910 | void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, |
911 | size_t size, dma_addr_t *dma_handle, |
912 | unsigned long mask, gfp_t flag, int node) |
913 | { |
914 | void *ret = NULL; |
915 | dma_addr_t mapping; |
916 | unsigned int order; |
917 | unsigned int nio_pages, io_order; |
918 | struct page *page; |
919 | int tcesize = (1 << tbl->it_page_shift); |
920 | |
921 | size = PAGE_ALIGN(size); |
922 | order = get_order(size); |
923 | |
924 | /* |
925 | * Client asked for way too much space. This is checked later |
926 | * anyway. It is easier to debug here for the drivers than in |
927 | * the tce tables. |
928 | */ |
929 | if (order >= IOMAP_MAX_ORDER) { |
930 | dev_info(dev, "iommu_alloc_consistent size too large: 0x%lx\n" , |
931 | size); |
932 | return NULL; |
933 | } |
934 | |
935 | if (!tbl) |
936 | return NULL; |
937 | |
938 | /* Alloc enough pages (and possibly more) */ |
939 | page = alloc_pages_node(node, flag, order); |
940 | if (!page) |
941 | return NULL; |
942 | ret = page_address(page); |
943 | memset(ret, 0, size); |
944 | |
945 | /* Set up tces to cover the allocated range */ |
946 | nio_pages = IOMMU_PAGE_ALIGN(size, tbl) >> tbl->it_page_shift; |
947 | |
948 | io_order = get_iommu_order(size, tbl); |
949 | mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL, |
950 | mask >> tbl->it_page_shift, io_order, 0); |
951 | if (mapping == DMA_MAPPING_ERROR) { |
952 | free_pages((unsigned long)ret, order); |
953 | return NULL; |
954 | } |
955 | |
956 | *dma_handle = mapping | ((u64)ret & (tcesize - 1)); |
957 | return ret; |
958 | } |
959 | |
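    | /* |
    |  * Tear down a mapping created by iommu_alloc_coherent(): free the TCEs |
    |  * covering the buffer and then release the underlying pages. |
    |  */ |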
960 | void iommu_free_coherent(struct iommu_table *tbl, size_t size, |
961 | void *vaddr, dma_addr_t dma_handle) |
962 | { |
963 | if (tbl) { |
964 | unsigned int nio_pages; |
965 | |
966 | size = PAGE_ALIGN(size); |
967 | nio_pages = IOMMU_PAGE_ALIGN(size, tbl) >> tbl->it_page_shift; |
968 | iommu_free(tbl, dma_handle, nio_pages); |
969 | size = PAGE_ALIGN(size); |
970 | free_pages((unsigned long)vaddr, get_order(size)); |
971 | } |
972 | } |
973 | |
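    | /* |
    |  * Translate a DMA API direction into the TCE read/write permission bits |
    |  * that are programmed into the hardware table. |
    |  */ |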
974 | unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir) |
975 | { |
976 | switch (dir) { |
977 | case DMA_BIDIRECTIONAL: |
978 | return TCE_PCI_READ | TCE_PCI_WRITE; |
979 | case DMA_FROM_DEVICE: |
980 | return TCE_PCI_WRITE; |
981 | case DMA_TO_DEVICE: |
982 | return TCE_PCI_READ; |
983 | default: |
984 | return 0; |
985 | } |
986 | } |
987 | EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm); |
988 | |
989 | #ifdef CONFIG_IOMMU_API |
990 | /* |
991 | * SPAPR TCE API |
992 | */ |
993 | static void group_release(void *iommu_data) |
994 | { |
995 | struct iommu_table_group *table_group = iommu_data; |
996 | |
997 | table_group->group = NULL; |
998 | } |
999 | |
1000 | void iommu_register_group(struct iommu_table_group *table_group, |
1001 | int pci_domain_number, unsigned long pe_num) |
1002 | { |
1003 | struct iommu_group *grp; |
1004 | char *name; |
1005 | |
1006 | grp = iommu_group_alloc(); |
1007 | if (IS_ERR(grp)) { |
1008 | pr_warn("powerpc iommu api: cannot create new group, err=%ld\n", |
1009 | PTR_ERR(grp)); |
1010 | return; |
1011 | } |
1012 | table_group->group = grp; |
1013 | iommu_group_set_iommudata(grp, table_group, group_release); |
1014 | name = kasprintf(GFP_KERNEL, "domain%d-pe%lx", |
1015 | pci_domain_number, pe_num); |
1016 | if (!name) |
1017 | return; |
1018 | iommu_group_set_name(grp, name); |
1019 | kfree(name); |
1020 | } |
1021 | |
1022 | enum dma_data_direction iommu_tce_direction(unsigned long tce) |
1023 | { |
1024 | if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE)) |
1025 | return DMA_BIDIRECTIONAL; |
1026 | else if (tce & TCE_PCI_READ) |
1027 | return DMA_TO_DEVICE; |
1028 | else if (tce & TCE_PCI_WRITE) |
1029 | return DMA_FROM_DEVICE; |
1030 | else |
1031 | return DMA_NONE; |
1032 | } |
1033 | EXPORT_SYMBOL_GPL(iommu_tce_direction); |
1034 | |
1035 | void iommu_flush_tce(struct iommu_table *tbl) |
1036 | { |
1037 | /* Flush/invalidate TLB caches if necessary */ |
1038 | if (tbl->it_ops->flush) |
1039 | tbl->it_ops->flush(tbl); |
1040 | |
1041 | /* Make sure updates are seen by hardware */ |
1042 | mb(); |
1043 | } |
1044 | EXPORT_SYMBOL_GPL(iommu_flush_tce); |
1045 | |
1046 | int iommu_tce_check_ioba(unsigned long page_shift, |
1047 | unsigned long offset, unsigned long size, |
1048 | unsigned long ioba, unsigned long npages) |
1049 | { |
1050 | unsigned long mask = (1UL << page_shift) - 1; |
1051 | |
1052 | if (ioba & mask) |
1053 | return -EINVAL; |
1054 | |
1055 | ioba >>= page_shift; |
1056 | if (ioba < offset) |
1057 | return -EINVAL; |
1058 | |
1059 | if ((ioba + 1) > (offset + size)) |
1060 | return -EINVAL; |
1061 | |
1062 | return 0; |
1063 | } |
1064 | EXPORT_SYMBOL_GPL(iommu_tce_check_ioba); |
1065 | |
1066 | int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) |
1067 | { |
1068 | unsigned long mask = (1UL << page_shift) - 1; |
1069 | |
1070 | if (gpa & mask) |
1071 | return -EINVAL; |
1072 | |
1073 | return 0; |
1074 | } |
1075 | EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); |
1076 | |
1077 | long iommu_tce_xchg_no_kill(struct mm_struct *mm, |
1078 | struct iommu_table *tbl, |
1079 | unsigned long entry, unsigned long *hpa, |
1080 | enum dma_data_direction *direction) |
1081 | { |
1082 | long ret; |
1083 | unsigned long size = 0; |
1084 | |
1085 | ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction); |
1086 | if (!ret && ((*direction == DMA_FROM_DEVICE) || |
1087 | (*direction == DMA_BIDIRECTIONAL)) && |
1088 | !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift, |
1089 | &size)) |
1090 | SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); |
1091 | |
1092 | return ret; |
1093 | } |
1094 | EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill); |
1095 | |
1096 | void iommu_tce_kill(struct iommu_table *tbl, |
1097 | unsigned long entry, unsigned long pages) |
1098 | { |
1099 | if (tbl->it_ops->tce_kill) |
1100 | tbl->it_ops->tce_kill(tbl, entry, pages); |
1101 | } |
1102 | EXPORT_SYMBOL_GPL(iommu_tce_kill); |
1103 | |
1104 | #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) |
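    | /* |
    |  * Ownership transfer for VFIO/KVM: iommu_take_ownership() fails if the |
    |  * table still carries kernel mappings, otherwise it fills the whole bitmap |
    |  * so the kernel DMA API can no longer allocate from it; |
    |  * iommu_release_ownership() clears the bitmap again and re-applies the |
    |  * reserved regions. |
    |  */ |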
1105 | static int iommu_take_ownership(struct iommu_table *tbl) |
1106 | { |
1107 | unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; |
1108 | int ret = 0; |
1109 | |
1110 | /* |
1111 | * VFIO does not control TCE entries allocation and the guest |
1112 | * can write new TCEs on top of existing ones, so iommu_tce_build() |
1113 | * must be able to release old pages. This functionality |
1114 | * requires the exchange() callback to be defined, so if it is not |
1115 | * implemented, we disallow taking ownership of the table. |
1116 | */ |
1117 | if (!tbl->it_ops->xchg_no_kill) |
1118 | return -EINVAL; |
1119 | |
1120 | spin_lock_irqsave(&tbl->large_pool.lock, flags); |
1121 | for (i = 0; i < tbl->nr_pools; i++) |
1122 | spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock); |
1123 | |
1124 | if (iommu_table_in_use(tbl)) { |
1125 | pr_err("iommu_tce: it_map is not empty" ); |
1126 | ret = -EBUSY; |
1127 | } else { |
1128 | memset(tbl->it_map, 0xff, sz); |
1129 | } |
1130 | |
1131 | for (i = 0; i < tbl->nr_pools; i++) |
1132 | spin_unlock(&tbl->pools[i].lock); |
1133 | spin_unlock_irqrestore(&tbl->large_pool.lock, flags); |
1134 | |
1135 | return ret; |
1136 | } |
1137 | |
1138 | static void iommu_release_ownership(struct iommu_table *tbl) |
1139 | { |
1140 | unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; |
1141 | |
1142 | spin_lock_irqsave(&tbl->large_pool.lock, flags); |
1143 | for (i = 0; i < tbl->nr_pools; i++) |
1144 | spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock); |
1145 | |
1146 | memset(tbl->it_map, 0, sz); |
1147 | |
1148 | iommu_table_reserve_pages(tbl, tbl->it_reserved_start, |
1149 | tbl->it_reserved_end); |
1150 | |
1151 | for (i = 0; i < tbl->nr_pools; i++) |
1152 | spin_unlock(&tbl->pools[i].lock); |
1153 | spin_unlock_irqrestore(&tbl->large_pool.lock, flags); |
1154 | } |
1155 | #endif |
1156 | |
1157 | int iommu_add_device(struct iommu_table_group *table_group, struct device *dev) |
1158 | { |
1159 | /* |
1160 | * The sysfs entries should be populated before |
1161 | * binding the IOMMU group. If the sysfs entries |
1162 | * aren't ready, we simply bail. |
1163 | */ |
1164 | if (!device_is_registered(dev)) |
1165 | return -ENOENT; |
1166 | |
1167 | if (device_iommu_mapped(dev)) { |
1168 | pr_debug("%s: Skipping device %s with iommu group %d\n" , |
1169 | __func__, dev_name(dev), |
1170 | iommu_group_id(dev->iommu_group)); |
1171 | return -EBUSY; |
1172 | } |
1173 | |
1174 | pr_debug("%s: Adding %s to iommu group %d\n" , |
1175 | __func__, dev_name(dev), iommu_group_id(table_group->group)); |
1176 | /* |
1177 | * This is still not adding devices via the IOMMU bus notifier because |
1178 | * of pcibios_init() from arch/powerpc/kernel/pci_64.c which calls |
1179 | * pcibios_scan_phb() first (and this guy adds devices and triggers |
1180 | * the notifier) and only then it calls pci_bus_add_devices() which |
1181 | * configures DMA for buses which also creates PEs and IOMMU groups. |
1182 | */ |
1183 | return iommu_probe_device(dev); |
1184 | } |
1185 | EXPORT_SYMBOL_GPL(iommu_add_device); |
1186 | |
1187 | #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) |
1188 | /* |
1189 | * A simple iommu_table_group_ops which only allows reusing the existing |
1190 | * iommu_table. This handles VFIO for POWER7 or the nested KVM. |
1191 | * These ops do not allow creating windows and only allow reusing the existing |
1192 | * one if it matches table_group->tce32_start/tce32_size/page_shift. |
1193 | */ |
1194 | static unsigned long spapr_tce_get_table_size(__u32 page_shift, |
1195 | __u64 window_size, __u32 levels) |
1196 | { |
1197 | unsigned long size; |
1198 | |
1199 | if (levels > 1) |
1200 | return ~0U; |
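    | /* One 8-byte TCE is needed per IOMMU page in the window. */ |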
1201 | size = window_size >> (page_shift - 3); |
1202 | return size; |
1203 | } |
1204 | |
1205 | static long spapr_tce_create_table(struct iommu_table_group *table_group, int num, |
1206 | __u32 page_shift, __u64 window_size, __u32 levels, |
1207 | struct iommu_table **ptbl) |
1208 | { |
1209 | struct iommu_table *tbl = table_group->tables[0]; |
1210 | |
1211 | if (num > 0) |
1212 | return -EPERM; |
1213 | |
1214 | if (tbl->it_page_shift != page_shift || |
1215 | tbl->it_size != (window_size >> page_shift) || |
1216 | tbl->it_indirect_levels != levels - 1) |
1217 | return -EINVAL; |
1218 | |
1219 | *ptbl = iommu_tce_table_get(tbl); |
1220 | return 0; |
1221 | } |
1222 | |
1223 | static long spapr_tce_set_window(struct iommu_table_group *table_group, |
1224 | int num, struct iommu_table *tbl) |
1225 | { |
1226 | return tbl == table_group->tables[num] ? 0 : -EPERM; |
1227 | } |
1228 | |
1229 | static long spapr_tce_unset_window(struct iommu_table_group *table_group, int num) |
1230 | { |
1231 | return 0; |
1232 | } |
1233 | |
1234 | static long spapr_tce_take_ownership(struct iommu_table_group *table_group) |
1235 | { |
1236 | int i, j, rc = 0; |
1237 | |
1238 | for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { |
1239 | struct iommu_table *tbl = table_group->tables[i]; |
1240 | |
1241 | if (!tbl || !tbl->it_map) |
1242 | continue; |
1243 | |
1244 | rc = iommu_take_ownership(tbl); |
1245 | if (!rc) |
1246 | continue; |
1247 | |
1248 | for (j = 0; j < i; ++j) |
1249 | iommu_release_ownership(table_group->tables[j]); |
1250 | return rc; |
1251 | } |
1252 | return 0; |
1253 | } |
1254 | |
1255 | static void spapr_tce_release_ownership(struct iommu_table_group *table_group) |
1256 | { |
1257 | int i; |
1258 | |
1259 | for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { |
1260 | struct iommu_table *tbl = table_group->tables[i]; |
1261 | |
1262 | if (!tbl) |
1263 | continue; |
1264 | |
1265 | iommu_table_clear(tbl); |
1266 | if (tbl->it_map) |
1267 | iommu_release_ownership(tbl); |
1268 | } |
1269 | } |
1270 | |
1271 | struct iommu_table_group_ops spapr_tce_table_group_ops = { |
1272 | .get_table_size = spapr_tce_get_table_size, |
1273 | .create_table = spapr_tce_create_table, |
1274 | .set_window = spapr_tce_set_window, |
1275 | .unset_window = spapr_tce_unset_window, |
1276 | .take_ownership = spapr_tce_take_ownership, |
1277 | .release_ownership = spapr_tce_release_ownership, |
1278 | }; |
1279 | |
1280 | /* |
1281 | * A simple iommu_ops to allow less cruft in generic VFIO code. |
1282 | */ |
1283 | static int |
1284 | spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain, |
1285 | struct device *dev) |
1286 | { |
1287 | struct iommu_domain *domain = iommu_get_domain_for_dev(dev); |
1288 | struct iommu_table_group *table_group; |
1289 | struct iommu_group *grp; |
1290 | |
1291 | /* At first attach the ownership is already set */ |
1292 | if (!domain) |
1293 | return 0; |
1294 | |
1295 | grp = iommu_group_get(dev); |
1296 | table_group = iommu_group_get_iommudata(grp); |
1297 | /* |
1298 | * The domain is being switched to PLATFORM from the earlier |
1299 | * BLOCKED one, so the table_group ownership has to be released. |
1300 | */ |
1301 | table_group->ops->release_ownership(table_group); |
1302 | iommu_group_put(grp); |
1303 | |
1304 | return 0; |
1305 | } |
1306 | |
1307 | static const struct iommu_domain_ops spapr_tce_platform_domain_ops = { |
1308 | .attach_dev = spapr_tce_platform_iommu_attach_dev, |
1309 | }; |
1310 | |
1311 | static struct iommu_domain spapr_tce_platform_domain = { |
1312 | .type = IOMMU_DOMAIN_PLATFORM, |
1313 | .ops = &spapr_tce_platform_domain_ops, |
1314 | }; |
1315 | |
1316 | static int |
1317 | spapr_tce_blocked_iommu_attach_dev(struct iommu_domain *platform_domain, |
1318 | struct device *dev) |
1319 | { |
1320 | struct iommu_group *grp = iommu_group_get(dev); |
1321 | struct iommu_table_group *table_group; |
1322 | int ret = -EINVAL; |
1323 | |
1324 | /* |
1325 | * FIXME: SPAPR mixes blocked and platform behaviors, the blocked domain |
1326 | * also sets the dma_api ops |
1327 | */ |
1328 | table_group = iommu_group_get_iommudata(grp); |
1329 | ret = table_group->ops->take_ownership(table_group); |
1330 | iommu_group_put(grp); |
1331 | |
1332 | return ret; |
1333 | } |
1334 | |
1335 | static const struct iommu_domain_ops spapr_tce_blocked_domain_ops = { |
1336 | .attach_dev = spapr_tce_blocked_iommu_attach_dev, |
1337 | }; |
1338 | |
1339 | static struct iommu_domain spapr_tce_blocked_domain = { |
1340 | .type = IOMMU_DOMAIN_BLOCKED, |
1341 | .ops = &spapr_tce_blocked_domain_ops, |
1342 | }; |
1343 | |
1344 | static bool spapr_tce_iommu_capable(struct device *dev, enum iommu_cap cap) |
1345 | { |
1346 | switch (cap) { |
1347 | case IOMMU_CAP_CACHE_COHERENCY: |
1348 | return true; |
1349 | default: |
1350 | break; |
1351 | } |
1352 | |
1353 | return false; |
1354 | } |
1355 | |
1356 | static struct iommu_device *spapr_tce_iommu_probe_device(struct device *dev) |
1357 | { |
1358 | struct pci_dev *pdev; |
1359 | struct pci_controller *hose; |
1360 | |
1361 | if (!dev_is_pci(dev)) |
1362 | return ERR_PTR(-ENODEV); |
1363 | |
1364 | pdev = to_pci_dev(dev); |
1365 | hose = pdev->bus->sysdata; |
1366 | |
1367 | return &hose->iommu; |
1368 | } |
1369 | |
1370 | static void spapr_tce_iommu_release_device(struct device *dev) |
1371 | { |
1372 | } |
1373 | |
1374 | static struct iommu_group *spapr_tce_iommu_device_group(struct device *dev) |
1375 | { |
1376 | struct pci_controller *hose; |
1377 | struct pci_dev *pdev; |
1378 | |
1379 | pdev = to_pci_dev(dev); |
1380 | hose = pdev->bus->sysdata; |
1381 | |
1382 | if (!hose->controller_ops.device_group) |
1383 | return ERR_PTR(-ENOENT); |
1384 | |
1385 | return hose->controller_ops.device_group(hose, pdev); |
1386 | } |
1387 | |
1388 | static const struct iommu_ops spapr_tce_iommu_ops = { |
1389 | .default_domain = &spapr_tce_platform_domain, |
1390 | .blocked_domain = &spapr_tce_blocked_domain, |
1391 | .capable = spapr_tce_iommu_capable, |
1392 | .probe_device = spapr_tce_iommu_probe_device, |
1393 | .release_device = spapr_tce_iommu_release_device, |
1394 | .device_group = spapr_tce_iommu_device_group, |
1395 | }; |
1396 | |
1397 | static struct attribute *spapr_tce_iommu_attrs[] = { |
1398 | NULL, |
1399 | }; |
1400 | |
1401 | static struct attribute_group spapr_tce_iommu_group = { |
1402 | .name = "spapr-tce-iommu" , |
1403 | .attrs = spapr_tce_iommu_attrs, |
1404 | }; |
1405 | |
1406 | static const struct attribute_group *spapr_tce_iommu_groups[] = { |
1407 | &spapr_tce_iommu_group, |
1408 | NULL, |
1409 | }; |
1410 | |
1411 | void ppc_iommu_register_device(struct pci_controller *phb) |
1412 | { |
1413 | iommu_device_sysfs_add(&phb->iommu, phb->parent, |
1414 | spapr_tce_iommu_groups, "iommu-phb%04x" , |
1415 | phb->global_number); |
1416 | iommu_device_register(&phb->iommu, &spapr_tce_iommu_ops, |
1417 | phb->parent); |
1418 | } |
1419 | |
1420 | void ppc_iommu_unregister_device(struct pci_controller *phb) |
1421 | { |
1422 | iommu_device_unregister(&phb->iommu); |
1423 | iommu_device_sysfs_remove(&phb->iommu); |
1424 | } |
1425 | |
1426 | /* |
1427 | * This registers IOMMU devices of PHBs. This needs to happen |
1428 | * after core_initcall(iommu_init) + postcore_initcall(pci_driver_init) and |
1429 | * before subsys_initcall(iommu_subsys_init). |
1430 | */ |
1431 | static int __init spapr_tce_setup_phb_iommus_initcall(void) |
1432 | { |
1433 | struct pci_controller *hose; |
1434 | |
1435 | list_for_each_entry(hose, &hose_list, list_node) { |
1436 | ppc_iommu_register_device(hose); |
1437 | } |
1438 | return 0; |
1439 | } |
1440 | postcore_initcall_sync(spapr_tce_setup_phb_iommus_initcall); |
1441 | #endif |
1442 | |
1443 | #endif /* CONFIG_IOMMU_API */ |
1444 | |