1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt) "DMAR: " fmt
14#define dev_fmt(fmt) pr_fmt(fmt)
15
16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
18#include <linux/dmi.h>
19#include <linux/memory.h>
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24#include <linux/tboot.h>
25#include <uapi/linux/iommufd.h>
26
27#include "iommu.h"
28#include "../dma-iommu.h"
29#include "../irq_remapping.h"
30#include "pasid.h"
31#include "cap_audit.h"
32#include "perfmon.h"
33
34#define ROOT_SIZE VTD_PAGE_SIZE
35#define CONTEXT_SIZE VTD_PAGE_SIZE
36
37#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
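/*
 * Decode example (illustrative): a device with pdev->class == 0x030000
 * (VGA controller) satisfies IS_GFX_DEVICE() because class >> 16 equals
 * PCI_BASE_CLASS_DISPLAY (0x03); IS_USB_DEVICE() and IS_ISA_DEVICE()
 * compare the full two-byte base/sub-class value instead.
 */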
41
42#define IOAPIC_RANGE_START (0xfee00000)
43#define IOAPIC_RANGE_END (0xfeefffff)
44#define IOVA_START_ADDR (0x1000)
45
46#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50
/*
 * We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
 * to match. That way, we can use 'unsigned long' for PFNs with impunity.
 */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
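/*
 * Worked example (illustrative): with gaw = 48 and VTD_PAGE_SHIFT = 12,
 * __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1 and __DOMAIN_MAX_ADDR(48) =
 * (1ULL << 48) - 1. The min_t() clamp in DOMAIN_MAX_PFN() only matters
 * when the PFN would not fit in an unsigned long, e.g. on 32-bit builds,
 * where it caps the value at (unsigned long)-1.
 */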
56
57/* IO virtual address start page frame number */
58#define IOVA_START_PFN (1)
59
60#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
61
62static void __init check_tylersburg_isoch(void);
63static int rwbf_quirk;
64
/*
 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
 * (used when the kernel is launched with TXT).
 */
static int force_on = 0;
static int intel_iommu_tboot_noforce;
static int no_platform_optin;
72
73#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
74
75/*
76 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
77 * if marked present.
78 */
79static phys_addr_t root_entry_lctp(struct root_entry *re)
80{
81 if (!(re->lo & 1))
82 return 0;
83
84 return re->lo & VTD_PAGE_MASK;
85}
86
87/*
88 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
89 * if marked present.
90 */
91static phys_addr_t root_entry_uctp(struct root_entry *re)
92{
93 if (!(re->hi & 1))
94 return 0;
95
96 return re->hi & VTD_PAGE_MASK;
97}
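/*
 * Layout note (illustrative): bit 0 of each half of the root entry is the
 * present bit and VTD_PAGE_MASK keeps the 4KiB-aligned table address, so a
 * value such as re->lo == 0x12345001 decodes to "present, lower context
 * table at physical address 0x12345000".
 */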
98
99static int device_rid_cmp_key(const void *key, const struct rb_node *node)
100{
101 struct device_domain_info *info =
102 rb_entry(node, struct device_domain_info, node);
103 const u16 *rid_lhs = key;
104
105 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
106 return -1;
107
108 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
109 return 1;
110
111 return 0;
112}
113
static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
{
	struct device_domain_info *info =
		rb_entry(lhs, struct device_domain_info, node);
	u16 key = PCI_DEVID(info->bus, info->devfn);

	return device_rid_cmp_key(&key, rhs);
}
122
/*
 * Looks up an IOMMU-probed device using its source ID.
 *
 * Returns the pointer to the device if there is a match. Otherwise,
 * returns NULL.
 *
 * Note that this helper doesn't guarantee that the device won't be
 * released by the iommu subsystem after being returned. If that is a
 * possibility, the caller must use its own synchronization mechanism
 * to prevent the device from being released while it is in use.
 */
struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
{
	struct device_domain_info *info = NULL;
	struct rb_node *node;
	unsigned long flags;

	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
	if (node)
		info = rb_entry(node, struct device_domain_info, node);
	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);

	return info ? info->dev : NULL;
}
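/*
 * Usage sketch (hypothetical caller): given a fault record carrying a
 * source ID, a caller could resolve it to a struct device with
 *
 *	struct device *dev = device_rbtree_find(iommu, source_id);
 *	if (dev)
 *		dev_warn(dev, "fault reported for this device\n");
 *
 * subject to the synchronization caveat documented above.
 */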
148
static int device_rbtree_insert(struct intel_iommu *iommu,
				struct device_domain_info *info)
{
	struct rb_node *curr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
	if (WARN_ON(curr))
		return -EEXIST;

	return 0;
}
163
static void device_rbtree_remove(struct device_domain_info *info)
{
	struct intel_iommu *iommu = info->iommu;
	unsigned long flags;

	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
	rb_erase(&info->node, &iommu->device_rbtree);
	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
}
173
/*
 * This domain is a statically identity mapping domain.
 * 1. This domain creates a static 1:1 mapping to all usable memory.
 * 2. It maps to each iommu if successful.
 * 3. Each iommu maps to this domain if successful.
 */
180static struct dmar_domain *si_domain;
181static int hw_pass_through = 1;
182
183struct dmar_rmrr_unit {
184 struct list_head list; /* list of rmrr units */
185 struct acpi_dmar_header *hdr; /* ACPI header */
186 u64 base_address; /* reserved base address*/
187 u64 end_address; /* reserved end address */
188 struct dmar_dev_scope *devices; /* target devices */
189 int devices_cnt; /* target device count */
190};
191
192struct dmar_atsr_unit {
193 struct list_head list; /* list of ATSR units */
194 struct acpi_dmar_header *hdr; /* ACPI header */
195 struct dmar_dev_scope *devices; /* target devices */
196 int devices_cnt; /* target device count */
197 u8 include_all:1; /* include all ports */
198};
199
200struct dmar_satc_unit {
201 struct list_head list; /* list of SATC units */
202 struct acpi_dmar_header *hdr; /* ACPI header */
203 struct dmar_dev_scope *devices; /* target devices */
204 struct intel_iommu *iommu; /* the corresponding iommu */
205 int devices_cnt; /* target device count */
206 u8 atc_required:1; /* ATS is required */
207};
208
209static LIST_HEAD(dmar_atsr_units);
210static LIST_HEAD(dmar_rmrr_units);
211static LIST_HEAD(dmar_satc_units);
212
213#define for_each_rmrr_units(rmrr) \
214 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
215
216static void intel_iommu_domain_free(struct iommu_domain *domain);
217
218int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
219int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
220
221int intel_iommu_enabled = 0;
222EXPORT_SYMBOL_GPL(intel_iommu_enabled);
223
224static int dmar_map_gfx = 1;
225static int intel_iommu_superpage = 1;
226static int iommu_identity_mapping;
227static int iommu_skip_te_disable;
228
229#define IDENTMAP_GFX 2
230#define IDENTMAP_AZALIA 4
231
232const struct iommu_ops intel_iommu_ops;
233static const struct iommu_dirty_ops intel_dirty_ops;
234
235static bool translation_pre_enabled(struct intel_iommu *iommu)
236{
237 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
238}
239
240static void clear_translation_pre_enabled(struct intel_iommu *iommu)
241{
242 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
243}
244
static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}
253
254static int __init intel_iommu_setup(char *str)
255{
256 if (!str)
257 return -EINVAL;
258
259 while (*str) {
260 if (!strncmp(str, "on", 2)) {
261 dmar_disabled = 0;
262 pr_info("IOMMU enabled\n");
263 } else if (!strncmp(str, "off", 3)) {
264 dmar_disabled = 1;
265 no_platform_optin = 1;
266 pr_info("IOMMU disabled\n");
267 } else if (!strncmp(str, "igfx_off", 8)) {
268 dmar_map_gfx = 0;
269 pr_info("Disable GFX device mapping\n");
270 } else if (!strncmp(str, "forcedac", 8)) {
271 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
272 iommu_dma_forcedac = true;
273 } else if (!strncmp(str, "strict", 6)) {
274 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
275 iommu_set_dma_strict();
276 } else if (!strncmp(str, "sp_off", 6)) {
277 pr_info("Disable supported super page\n");
278 intel_iommu_superpage = 0;
279 } else if (!strncmp(str, "sm_on", 5)) {
280 pr_info("Enable scalable mode if hardware supports\n");
281 intel_iommu_sm = 1;
282 } else if (!strncmp(str, "sm_off", 6)) {
283 pr_info("Scalable mode is disallowed\n");
284 intel_iommu_sm = 0;
285 } else if (!strncmp(str, "tboot_noforce", 13)) {
286 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
287 intel_iommu_tboot_noforce = 1;
288 } else {
289 pr_notice("Unknown option - '%s'\n", str);
290 }
291
292 str += strcspn(str, ",");
293 while (*str == ',')
294 str++;
295 }
296
297 return 1;
298}
299__setup("intel_iommu=", intel_iommu_setup);
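/*
 * Example (illustrative): the options above are comma-separated, so a
 * kernel command line of
 *
 *	intel_iommu=on,sm_on,igfx_off
 *
 * enables the IOMMU, asks for scalable mode where the hardware supports
 * it, and skips mapping of the integrated graphics device.
 */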
300
void *alloc_pgtable_page(int node, gfp_t gfp)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
316
static int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
}

static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}
328
329/*
330 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
331 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
332 * the returned SAGAW.
333 */
334static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
335{
336 unsigned long fl_sagaw, sl_sagaw;
337
338 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
339 sl_sagaw = cap_sagaw(iommu->cap);
340
341 /* Second level only. */
342 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
343 return sl_sagaw;
344
345 /* First level only. */
346 if (!ecap_slts(iommu->ecap))
347 return fl_sagaw;
348
349 return fl_sagaw & sl_sagaw;
350}
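/*
 * Worked example (illustrative): BIT(2) encodes 4-level (48-bit) and
 * BIT(3) encodes 5-level (57-bit) paging per the SAGAW definition above.
 * On an IOMMU with both translation types, no 5-level first-level support
 * and cap_sagaw() == BIT(2), the intersection fl_sagaw & sl_sagaw is
 * BIT(2), i.e. only 4-level tables may be used.
 */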
351
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw;

	sagaw = __iommu_calculate_sagaw(iommu);
	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}
365
366/*
367 * Calculate max SAGAW for each iommu.
368 */
369int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
370{
371 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
372}
373
/*
 * Calculate the AGAW for each IOMMU.
 * "SAGAW" may differ across IOMMUs, so start from a default AGAW and
 * fall back to a smaller supported AGAW for IOMMUs that don't support
 * the default.
 */
379int iommu_calculate_agaw(struct intel_iommu *iommu)
380{
381 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
382}
383
384static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
385{
386 return sm_supported(iommu) ?
387 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
388}
389
390static void domain_update_iommu_coherency(struct dmar_domain *domain)
391{
392 struct iommu_domain_info *info;
393 struct dmar_drhd_unit *drhd;
394 struct intel_iommu *iommu;
395 bool found = false;
396 unsigned long i;
397
398 domain->iommu_coherency = true;
	xa_for_each(&domain->iommu_array, i, info) {
		found = true;
		if (!iommu_paging_structure_coherency(info->iommu)) {
			domain->iommu_coherency = false;
			break;
		}
	}
406 if (found)
407 return;
408
409 /* No hardware attached; use lowest common denominator */
410 rcu_read_lock();
411 for_each_active_iommu(iommu, drhd) {
412 if (!iommu_paging_structure_coherency(iommu)) {
413 domain->iommu_coherency = false;
414 break;
415 }
416 }
417 rcu_read_unlock();
418}
419
420static int domain_update_iommu_superpage(struct dmar_domain *domain,
421 struct intel_iommu *skip)
422{
423 struct dmar_drhd_unit *drhd;
424 struct intel_iommu *iommu;
425 int mask = 0x3;
426
427 if (!intel_iommu_superpage)
428 return 0;
429
430 /* set iommu_superpage to the smallest common denominator */
431 rcu_read_lock();
432 for_each_active_iommu(iommu, drhd) {
433 if (iommu != skip) {
434 if (domain && domain->use_first_level) {
435 if (!cap_fl1gp_support(iommu->cap))
436 mask = 0x1;
437 } else {
438 mask &= cap_super_page_val(iommu->cap);
439 }
440
441 if (!mask)
442 break;
443 }
444 }
445 rcu_read_unlock();
446
447 return fls(x: mask);
448}
449
450static int domain_update_device_node(struct dmar_domain *domain)
451{
452 struct device_domain_info *info;
453 int nid = NUMA_NO_NODE;
454 unsigned long flags;
455
456 spin_lock_irqsave(&domain->lock, flags);
457 list_for_each_entry(info, &domain->devices, link) {
		/*
		 * There could be multiple device NUMA nodes, as devices
		 * within the same domain may sit behind different IOMMUs.
		 * There is no perfect answer in such a situation, so pick
		 * the first device with a known node (first come, first
		 * served).
		 */
464 nid = dev_to_node(dev: info->dev);
465 if (nid != NUMA_NO_NODE)
466 break;
467 }
468 spin_unlock_irqrestore(lock: &domain->lock, flags);
469
470 return nid;
471}
472
473/* Return the super pagesize bitmap if supported. */
474static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
475{
476 unsigned long bitmap = 0;
477
478 /*
479 * 1-level super page supports page size of 2MiB, 2-level super page
480 * supports page size of both 2MiB and 1GiB.
481 */
482 if (domain->iommu_superpage == 1)
483 bitmap |= SZ_2M;
484 else if (domain->iommu_superpage == 2)
485 bitmap |= SZ_2M | SZ_1G;
486
487 return bitmap;
488}
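/*
 * Illustrative values: iommu_superpage == 1 yields a bitmap of SZ_2M,
 * iommu_superpage == 2 yields SZ_2M | SZ_1G, and 0 leaves the bitmap
 * empty so no super-page sizes are advertised.
 */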
489
490/* Some capabilities may be different across iommus */
491void domain_update_iommu_cap(struct dmar_domain *domain)
492{
493 domain_update_iommu_coherency(domain);
494 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
495
	/*
	 * If RHSA is missing, default to the device NUMA domain as a
	 * fallback.
	 */
500 if (domain->nid == NUMA_NO_NODE)
501 domain->nid = domain_update_device_node(domain);
502
503 /*
504 * First-level translation restricts the input-address to a
505 * canonical address (i.e., address bits 63:N have the same
506 * value as address bit [N-1], where N is 48-bits with 4-level
507 * paging and 57-bits with 5-level paging). Hence, skip bit
508 * [N-1].
509 */
510 if (domain->use_first_level)
511 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
512 else
513 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
514
515 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
516 domain_update_iotlb(domain);
517}
518
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	/*
	 * Unless the caller requested to allocate a new entry, returning
	 * a copied context entry makes no sense.
	 */
	if (!alloc && context_copied(iommu, bus, devfn))
		return NULL;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;

		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}
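/*
 * Index example (illustrative): in scalable mode each half of the root
 * entry covers 128 devfns with 256-bit context entries. For devfn 0x83
 * the code above selects root->hi, rebases devfn to 0x03 and doubles it,
 * so the returned pointer is &context[6] within the upper context table.
 */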
559
560/**
561 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
562 * sub-hierarchy of a candidate PCI-PCI bridge
563 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
564 * @bridge: the candidate PCI-PCI bridge
565 *
566 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
567 */
568static bool
569is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
570{
571 struct pci_dev *pdev, *pbridge;
572
573 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
574 return false;
575
576 pdev = to_pci_dev(dev);
577 pbridge = to_pci_dev(bridge);
578
579 if (pbridge->subordinate &&
580 pbridge->subordinate->number <= pdev->bus->number &&
581 pbridge->subordinate->busn_res.end >= pdev->bus->number)
582 return true;
583
584 return false;
585}
586
587static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
588{
589 struct dmar_drhd_unit *drhd;
590 u32 vtbar;
591 int rc;
592
593 /* We know that this device on this chipset has its own IOMMU.
594 * If we find it under a different IOMMU, then the BIOS is lying
595 * to us. Hope that the IOMMU for this device is actually
596 * disabled, and it needs no translation...
597 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return false;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
		return true;
	}
613
614 return false;
615}
616
617static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
618{
619 if (!iommu || iommu->drhd->ignored)
620 return true;
621
622 if (dev_is_pci(dev)) {
623 struct pci_dev *pdev = to_pci_dev(dev);
624
625 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
626 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
627 quirk_ioat_snb_local_iommu(pdev))
628 return true;
629 }
630
631 return false;
632}
633
634static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
635{
636 struct dmar_drhd_unit *drhd = NULL;
637 struct pci_dev *pdev = NULL;
638 struct intel_iommu *iommu;
639 struct device *tmp;
640 u16 segment = 0;
641 int i;
642
643 if (!dev)
644 return NULL;
645
646 if (dev_is_pci(dev)) {
647 struct pci_dev *pf_pdev;
648
649 pdev = pci_real_dma_dev(to_pci_dev(dev));
650
651 /* VFs aren't listed in scope tables; we need to look up
652 * the PF instead to find the IOMMU. */
653 pf_pdev = pci_physfn(dev: pdev);
654 dev = &pf_pdev->dev;
655 segment = pci_domain_nr(bus: pdev->bus);
656 } else if (has_acpi_companion(dev))
657 dev = &ACPI_COMPANION(dev)->dev;
658
659 rcu_read_lock();
660 for_each_iommu(iommu, drhd) {
661 if (pdev && segment != drhd->segment)
662 continue;
663
664 for_each_active_dev_scope(drhd->devices,
665 drhd->devices_cnt, i, tmp) {
666 if (tmp == dev) {
667 /* For a VF use its original BDF# not that of the PF
668 * which we used for the IOMMU lookup. Strictly speaking
669 * we could do this for all PCI devices; we only need to
670 * get the BDF# from the scope table for ACPI matches. */
671 if (pdev && pdev->is_virtfn)
672 goto got_pdev;
673
674 if (bus && devfn) {
675 *bus = drhd->devices[i].bus;
676 *devfn = drhd->devices[i].devfn;
677 }
678 goto out;
679 }
680
681 if (is_downstream_to_pci_bridge(dev, bridge: tmp))
682 goto got_pdev;
683 }
684
685 if (pdev && drhd->include_all) {
686got_pdev:
687 if (bus && devfn) {
688 *bus = pdev->bus->number;
689 *devfn = pdev->devfn;
690 }
691 goto out;
692 }
693 }
694 iommu = NULL;
695out:
696 if (iommu_is_dummy(iommu, dev))
697 iommu = NULL;
698
699 rcu_read_unlock();
700
701 return iommu;
702}
703
704static void domain_flush_cache(struct dmar_domain *domain,
705 void *addr, int size)
706{
707 if (!domain->iommu_coherency)
708 clflush_cache_range(addr, size);
709}
710
711static void free_context_table(struct intel_iommu *iommu)
712{
713 struct context_entry *context;
714 int i;
715
716 if (!iommu->root_entry)
717 return;
718
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);
	}

	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
734}
735
736#ifdef CONFIG_DMAR_DEBUG
737static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
738 u8 bus, u8 devfn, struct dma_pte *parent, int level)
739{
740 struct dma_pte *pte;
741 int offset;
742
743 while (1) {
744 offset = pfn_level_offset(pfn, level);
745 pte = &parent[offset];
746 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
747 pr_info("PTE not present at level %d\n", level);
748 break;
749 }
750
751 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
752
753 if (level == 1)
754 break;
755
756 parent = phys_to_virt(address: dma_pte_addr(pte));
757 level--;
758 }
759}
760
761void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
762 unsigned long long addr, u32 pasid)
763{
764 struct pasid_dir_entry *dir, *pde;
765 struct pasid_entry *entries, *pte;
766 struct context_entry *ctx_entry;
767 struct root_entry *rt_entry;
768 int i, dir_index, index, level;
769 u8 devfn = source_id & 0xff;
770 u8 bus = source_id >> 8;
771 struct dma_pte *pgtable;
772
773 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
774
775 /* root entry dump */
776 rt_entry = &iommu->root_entry[bus];
777 if (!rt_entry) {
778 pr_info("root table entry is not present\n");
779 return;
780 }
781
782 if (sm_supported(iommu))
783 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
784 rt_entry->hi, rt_entry->lo);
785 else
786 pr_info("root entry: 0x%016llx", rt_entry->lo);
787
788 /* context entry dump */
789 ctx_entry = iommu_context_addr(iommu, bus, devfn, alloc: 0);
790 if (!ctx_entry) {
791 pr_info("context table entry is not present\n");
792 return;
793 }
794
795 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
796 ctx_entry->hi, ctx_entry->lo);
797
798 /* legacy mode does not require PASID entries */
799 if (!sm_supported(iommu)) {
800 level = agaw_to_level(agaw: ctx_entry->hi & 7);
801 pgtable = phys_to_virt(address: ctx_entry->lo & VTD_PAGE_MASK);
802 goto pgtable_walk;
803 }
804
805 /* get the pointer to pasid directory entry */
806 dir = phys_to_virt(address: ctx_entry->lo & VTD_PAGE_MASK);
807 if (!dir) {
808 pr_info("pasid directory entry is not present\n");
809 return;
810 }
811 /* For request-without-pasid, get the pasid from context entry */
812 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
813 pasid = IOMMU_NO_PASID;
814
815 dir_index = pasid >> PASID_PDE_SHIFT;
816 pde = &dir[dir_index];
817 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
818
819 /* get the pointer to the pasid table entry */
820 entries = get_pasid_table_from_pde(pde);
821 if (!entries) {
822 pr_info("pasid table entry is not present\n");
823 return;
824 }
825 index = pasid & PASID_PTE_MASK;
826 pte = &entries[index];
827 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
828 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
829
830 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
831 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
832 pgtable = phys_to_virt(address: pte->val[2] & VTD_PAGE_MASK);
833 } else {
834 level = agaw_to_level(agaw: (pte->val[0] >> 2) & 0x7);
835 pgtable = phys_to_virt(address: pte->val[0] & VTD_PAGE_MASK);
836 }
837
838pgtable_walk:
839 pgtable_walk(iommu, pfn: addr >> VTD_PAGE_SHIFT, bus, devfn, parent: pgtable, level);
840}
841#endif
842
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level,
				      gfp_t gfp)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid, gfp);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain->use_first_level)
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;

			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
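/*
 * Walk example (illustrative): with a 4-level page table (48-bit AGAW)
 * and pfn 0x12345, pfn_level_offset() extracts 9 bits per level, giving
 * table indexes 0, 0, 0x91 and 0x145 from the top level down to level 1.
 */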
898
899/* return address's pte at specific level */
900static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
901 unsigned long pfn,
902 int level, int *large_page)
903{
904 struct dma_pte *parent, *pte;
905 int total = agaw_to_level(agaw: domain->agaw);
906 int offset;
907
908 parent = domain->pgd;
909 while (level <= total) {
910 offset = pfn_level_offset(pfn, level: total);
911 pte = &parent[offset];
912 if (level == total)
913 return pte;
914
915 if (!dma_pte_present(pte)) {
916 *large_page = total;
917 break;
918 }
919
920 if (dma_pte_superpage(pte)) {
921 *large_page = total;
922 return pte;
923 }
924
925 parent = phys_to_virt(address: dma_pte_addr(pte));
926 total--;
927 }
928 return NULL;
929}
930
931/* clear last level pte, a tlb flush should be followed */
932static void dma_pte_clear_range(struct dmar_domain *domain,
933 unsigned long start_pfn,
934 unsigned long last_pfn)
935{
936 unsigned int large_page;
937 struct dma_pte *first_pte, *pte;
938
939 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
940 WARN_ON(start_pfn > last_pfn))
941 return;
942
943 /* we don't need lock here; nobody else touches the iova range */
944 do {
945 large_page = 1;
946 first_pte = pte = dma_pfn_level_pte(domain, pfn: start_pfn, level: 1, large_page: &large_page);
947 if (!pte) {
948 start_pfn = align_to_level(pfn: start_pfn + 1, level: large_page + 1);
949 continue;
950 }
951 do {
952 dma_clear_pte(pte);
953 start_pfn += lvl_to_nr_pages(lvl: large_page);
954 pte++;
955 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
956
957 domain_flush_cache(domain, addr: first_pte,
958 size: (void *)pte - (void *)first_pte);
959
960 } while (start_pfn && start_pfn <= last_pfn);
961}
962
963static void dma_pte_free_level(struct dmar_domain *domain, int level,
964 int retain_level, struct dma_pte *pte,
965 unsigned long pfn, unsigned long start_pfn,
966 unsigned long last_pfn)
967{
968 pfn = max(start_pfn, pfn);
969 pte = &pte[pfn_level_offset(pfn, level)];
970
971 do {
972 unsigned long level_pfn;
973 struct dma_pte *level_pte;
974
975 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
976 goto next;
977
978 level_pfn = pfn & level_mask(level);
979 level_pte = phys_to_virt(address: dma_pte_addr(pte));
980
981 if (level > 2) {
982 dma_pte_free_level(domain, level: level - 1, retain_level,
983 pte: level_pte, pfn: level_pfn, start_pfn,
984 last_pfn);
985 }
986
987 /*
988 * Free the page table if we're below the level we want to
989 * retain and the range covers the entire table.
990 */
991 if (level < retain_level && !(start_pfn > level_pfn ||
992 last_pfn < level_pfn + level_size(level) - 1)) {
993 dma_clear_pte(pte);
994 domain_flush_cache(domain, addr: pte, size: sizeof(*pte));
995 free_pgtable_page(vaddr: level_pte);
996 }
997next:
998 pfn += level_size(level);
999 } while (!first_pte_in_page(pte: ++pte) && pfn <= last_pfn);
1000}
1001
1002/*
1003 * clear last level (leaf) ptes and free page table pages below the
1004 * level we wish to keep intact.
1005 */
1006static void dma_pte_free_pagetable(struct dmar_domain *domain,
1007 unsigned long start_pfn,
1008 unsigned long last_pfn,
1009 int retain_level)
1010{
1011 dma_pte_clear_range(domain, start_pfn, last_pfn);
1012
1013 /* We don't need lock here; nobody else touches the iova range */
1014 dma_pte_free_level(domain, level: agaw_to_level(agaw: domain->agaw), retain_level,
1015 pte: domain->pgd, pfn: 0, start_pfn, last_pfn);
1016
1017 /* free pgd */
1018 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1019 free_pgtable_page(vaddr: domain->pgd);
1020 domain->pgd = NULL;
1021 }
1022}
1023
1024/* When a page at a given level is being unlinked from its parent, we don't
1025 need to *modify* it at all. All we need to do is make a list of all the
1026 pages which can be freed just as soon as we've flushed the IOTLB and we
1027 know the hardware page-walk will no longer touch them.
1028 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1029 be freed. */
1030static void dma_pte_list_pagetables(struct dmar_domain *domain,
1031 int level, struct dma_pte *pte,
1032 struct list_head *freelist)
1033{
1034 struct page *pg;
1035
1036 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1037 list_add_tail(new: &pg->lru, head: freelist);
1038
1039 if (level == 1)
1040 return;
1041
1042 pte = page_address(pg);
1043 do {
1044 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1045 dma_pte_list_pagetables(domain, level: level - 1, pte, freelist);
1046 pte++;
1047 } while (!first_pte_in_page(pte));
1048}
1049
1050static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1051 struct dma_pte *pte, unsigned long pfn,
1052 unsigned long start_pfn, unsigned long last_pfn,
1053 struct list_head *freelist)
1054{
1055 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1056
1057 pfn = max(start_pfn, pfn);
1058 pte = &pte[pfn_level_offset(pfn, level)];
1059
1060 do {
1061 unsigned long level_pfn = pfn & level_mask(level);
1062
1063 if (!dma_pte_present(pte))
1064 goto next;
1065
1066 /* If range covers entire pagetable, free it */
1067 if (start_pfn <= level_pfn &&
1068 last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
1071 if (level > 1 && !dma_pte_superpage(pte))
1072 dma_pte_list_pagetables(domain, level: level - 1, pte, freelist);
1073
1074 dma_clear_pte(pte);
1075 if (!first_pte)
1076 first_pte = pte;
1077 last_pte = pte;
1078 } else if (level > 1) {
1079 /* Recurse down into a level that isn't *entirely* obsolete */
1080 dma_pte_clear_level(domain, level: level - 1,
1081 phys_to_virt(address: dma_pte_addr(pte)),
1082 pfn: level_pfn, start_pfn, last_pfn,
1083 freelist);
1084 }
1085next:
1086 pfn = level_pfn + level_size(level);
1087 } while (!first_pte_in_page(pte: ++pte) && pfn <= last_pfn);
1088
1089 if (first_pte)
1090 domain_flush_cache(domain, addr: first_pte,
1091 size: (void *)++last_pte - (void *)first_pte);
1092}
1093
1094/* We can't just free the pages because the IOMMU may still be walking
1095 the page tables, and may have cached the intermediate levels. The
1096 pages can only be freed after the IOTLB flush has been done. */
1097static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1098 unsigned long last_pfn, struct list_head *freelist)
1099{
1100 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1101 WARN_ON(start_pfn > last_pfn))
1102 return;
1103
1104 /* we don't need lock here; nobody else touches the iova range */
1105 dma_pte_clear_level(domain, level: agaw_to_level(agaw: domain->agaw),
1106 pte: domain->pgd, pfn: 0, start_pfn, last_pfn, freelist);
1107
1108 /* free pgd */
1109 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1110 struct page *pgd_page = virt_to_page(domain->pgd);
1111 list_add_tail(new: &pgd_page->lru, head: freelist);
1112 domain->pgd = NULL;
1113 }
1114}
1115
1116/* iommu handling */
1117static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1118{
1119 struct root_entry *root;
1120
1121 root = alloc_pgtable_page(node: iommu->node, GFP_ATOMIC);
1122 if (!root) {
1123 pr_err("Allocating root entry for %s failed\n",
1124 iommu->name);
1125 return -ENOMEM;
1126 }
1127
1128 __iommu_flush_cache(iommu, addr: root, ROOT_SIZE);
1129 iommu->root_entry = root;
1130
1131 return 0;
1132}
1133
1134static void iommu_set_root_entry(struct intel_iommu *iommu)
1135{
1136 u64 addr;
1137 u32 sts;
1138 unsigned long flag;
1139
1140 addr = virt_to_phys(address: iommu->root_entry);
1141 if (sm_supported(iommu))
1142 addr |= DMA_RTADDR_SMT;
1143
1144 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1145 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1146
1147 writel(val: iommu->gcmd | DMA_GCMD_SRTP, addr: iommu->reg + DMAR_GCMD_REG);
1148
1149 /* Make sure hardware complete it */
1150 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1151 readl, (sts & DMA_GSTS_RTPS), sts);
1152
1153 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1154
1155 /*
1156 * Hardware invalidates all DMA remapping hardware translation
1157 * caches as part of SRTP flow.
1158 */
1159 if (cap_esrtps(iommu->cap))
1160 return;
1161
1162 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1163 if (sm_supported(iommu))
1164 qi_flush_pasid_cache(iommu, did: 0, QI_PC_GLOBAL, pasid: 0);
1165 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1166}
1167
1168void iommu_flush_write_buffer(struct intel_iommu *iommu)
1169{
1170 u32 val;
1171 unsigned long flag;
1172
1173 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1174 return;
1175
1176 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1177 writel(val: iommu->gcmd | DMA_GCMD_WBF, addr: iommu->reg + DMAR_GCMD_REG);
1178
1179 /* Make sure hardware complete it */
1180 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1181 readl, (!(val & DMA_GSTS_WBFS)), val);
1182
1183 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1184}
1185
/* return value determines if we need a write buffer flush */
1187static void __iommu_flush_context(struct intel_iommu *iommu,
1188 u16 did, u16 source_id, u8 function_mask,
1189 u64 type)
1190{
1191 u64 val = 0;
1192 unsigned long flag;
1193
1194 switch (type) {
1195 case DMA_CCMD_GLOBAL_INVL:
1196 val = DMA_CCMD_GLOBAL_INVL;
1197 break;
1198 case DMA_CCMD_DOMAIN_INVL:
1199 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1200 break;
1201 case DMA_CCMD_DEVICE_INVL:
1202 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1203 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1204 break;
1205 default:
1206 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1207 iommu->name, type);
1208 return;
1209 }
1210 val |= DMA_CCMD_ICC;
1211
1212 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1213 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1214
1215 /* Make sure hardware complete it */
1216 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1217 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1218
1219 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220}
1221
/* return value determines if we need a write buffer flush */
1223static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1224 u64 addr, unsigned int size_order, u64 type)
1225{
1226 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1227 u64 val = 0, val_iva = 0;
1228 unsigned long flag;
1229
1230 switch (type) {
1231 case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
1233 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1234 break;
1235 case DMA_TLB_DSI_FLUSH:
1236 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1237 break;
1238 case DMA_TLB_PSI_FLUSH:
1239 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1240 /* IH bit is passed in as part of address */
1241 val_iva = size_order | addr;
1242 break;
1243 default:
1244 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1245 iommu->name, type);
1246 return;
1247 }
1248
1249 if (cap_write_drain(iommu->cap))
1250 val |= DMA_TLB_WRITE_DRAIN;
1251
1252 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1253 /* Note: Only uses first TLB reg currently */
1254 if (val_iva)
1255 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1256 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1257
1258 /* Make sure hardware complete it */
1259 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1260 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1261
1262 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1263
1264 /* check IOTLB invalidation granularity */
1265 if (DMA_TLB_IAIG(val) == 0)
1266 pr_err("Flush IOTLB failed\n");
1267 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1268 pr_debug("TLB flush request %Lx, actual %Lx\n",
1269 (unsigned long long)DMA_TLB_IIRG(type),
1270 (unsigned long long)DMA_TLB_IAIG(val));
1271}
1272
1273static struct device_domain_info *
1274domain_lookup_dev_info(struct dmar_domain *domain,
1275 struct intel_iommu *iommu, u8 bus, u8 devfn)
1276{
1277 struct device_domain_info *info;
1278 unsigned long flags;
1279
1280 spin_lock_irqsave(&domain->lock, flags);
1281 list_for_each_entry(info, &domain->devices, link) {
1282 if (info->iommu == iommu && info->bus == bus &&
1283 info->devfn == devfn) {
1284 spin_unlock_irqrestore(lock: &domain->lock, flags);
1285 return info;
1286 }
1287 }
1288 spin_unlock_irqrestore(lock: &domain->lock, flags);
1289
1290 return NULL;
1291}
1292
1293void domain_update_iotlb(struct dmar_domain *domain)
1294{
1295 struct dev_pasid_info *dev_pasid;
1296 struct device_domain_info *info;
1297 bool has_iotlb_device = false;
1298 unsigned long flags;
1299
1300 spin_lock_irqsave(&domain->lock, flags);
1301 list_for_each_entry(info, &domain->devices, link) {
1302 if (info->ats_enabled) {
1303 has_iotlb_device = true;
1304 break;
1305 }
1306 }
1307
1308 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1309 info = dev_iommu_priv_get(dev: dev_pasid->dev);
1310 if (info->ats_enabled) {
1311 has_iotlb_device = true;
1312 break;
1313 }
1314 }
1315 domain->has_iotlb_device = has_iotlb_device;
1316 spin_unlock_irqrestore(lock: &domain->lock, flags);
1317}
1318
1319/*
1320 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1321 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1322 * check because it applies only to the built-in QAT devices and it doesn't
1323 * grant additional privileges.
1324 */
1325#define BUGGY_QAT_DEVID_MASK 0x4940
1326static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1327{
1328 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1329 return false;
1330
1331 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1332 return false;
1333
1334 return true;
1335}
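/*
 * Mask check (illustrative): (device & 0xfffc) == 0x4940 matches exactly
 * the four device IDs 0x4940-0x4943 called out in the comment above.
 */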
1336
1337static void iommu_enable_pci_caps(struct device_domain_info *info)
1338{
1339 struct pci_dev *pdev;
1340
1341 if (!dev_is_pci(info->dev))
1342 return;
1343
1344 pdev = to_pci_dev(info->dev);
1345
1346 /* The PCIe spec, in its wisdom, declares that the behaviour of
1347 the device if you enable PASID support after ATS support is
1348 undefined. So always enable PASID support on devices which
1349 have it, even if we can't yet know if we're ever going to
1350 use it. */
1351 if (info->pasid_supported && !pci_enable_pasid(pdev, features: info->pasid_supported & ~1))
1352 info->pasid_enabled = 1;
1353
1354 if (info->ats_supported && pci_ats_page_aligned(dev: pdev) &&
1355 !pci_enable_ats(dev: pdev, VTD_PAGE_SHIFT)) {
1356 info->ats_enabled = 1;
1357 domain_update_iotlb(domain: info->domain);
1358 }
1359}
1360
1361static void iommu_disable_pci_caps(struct device_domain_info *info)
1362{
1363 struct pci_dev *pdev;
1364
1365 if (!dev_is_pci(info->dev))
1366 return;
1367
1368 pdev = to_pci_dev(info->dev);
1369
1370 if (info->ats_enabled) {
1371 pci_disable_ats(dev: pdev);
1372 info->ats_enabled = 0;
1373 domain_update_iotlb(domain: info->domain);
1374 }
1375
1376 if (info->pasid_enabled) {
1377 pci_disable_pasid(pdev);
1378 info->pasid_enabled = 0;
1379 }
1380}
1381
1382static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1383 u64 addr, unsigned int mask)
1384{
1385 u16 sid, qdep;
1386
1387 if (!info || !info->ats_enabled)
1388 return;
1389
1390 sid = info->bus << 8 | info->devfn;
1391 qdep = info->ats_qdep;
1392 qi_flush_dev_iotlb(iommu: info->iommu, sid, pfsid: info->pfsid,
1393 qdep, addr, mask);
1394 quirk_extra_dev_tlb_flush(info, address: addr, pages: mask, IOMMU_NO_PASID, qdep);
1395}
1396
1397static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1398 u64 addr, unsigned mask)
1399{
1400 struct dev_pasid_info *dev_pasid;
1401 struct device_domain_info *info;
1402 unsigned long flags;
1403
1404 if (!domain->has_iotlb_device)
1405 return;
1406
1407 spin_lock_irqsave(&domain->lock, flags);
1408 list_for_each_entry(info, &domain->devices, link)
1409 __iommu_flush_dev_iotlb(info, addr, mask);
1410
1411 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1412 info = dev_iommu_priv_get(dev: dev_pasid->dev);
1413
1414 if (!info->ats_enabled)
1415 continue;
1416
1417 qi_flush_dev_iotlb_pasid(iommu: info->iommu,
1418 PCI_DEVID(info->bus, info->devfn),
1419 pfsid: info->pfsid, pasid: dev_pasid->pasid,
1420 qdep: info->ats_qdep, addr,
1421 size_order: mask);
1422 }
1423 spin_unlock_irqrestore(lock: &domain->lock, flags);
1424}
1425
1426static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1427 struct dmar_domain *domain, u64 addr,
1428 unsigned long npages, bool ih)
1429{
1430 u16 did = domain_id_iommu(domain, iommu);
1431 struct dev_pasid_info *dev_pasid;
1432 unsigned long flags;
1433
1434 spin_lock_irqsave(&domain->lock, flags);
1435 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1436 qi_flush_piotlb(iommu, did, pasid: dev_pasid->pasid, addr, npages, ih);
1437
1438 if (!list_empty(head: &domain->devices))
1439 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1440 spin_unlock_irqrestore(lock: &domain->lock, flags);
1441}
1442
1443static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1444 unsigned long pfn, unsigned int pages,
1445 int ih)
1446{
1447 unsigned int aligned_pages = __roundup_pow_of_two(n: pages);
1448 unsigned long bitmask = aligned_pages - 1;
1449 unsigned int mask = ilog2(aligned_pages);
1450 u64 addr = (u64)pfn << VTD_PAGE_SHIFT;
1451
1452 /*
1453 * PSI masks the low order bits of the base address. If the
1454 * address isn't aligned to the mask, then compute a mask value
1455 * needed to ensure the target range is flushed.
1456 */
1457 if (unlikely(bitmask & pfn)) {
1458 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1459
1460 /*
1461 * Since end_pfn <= pfn + bitmask, the only way bits
1462 * higher than bitmask can differ in pfn and end_pfn is
1463 * by carrying. This means after masking out bitmask,
1464 * high bits starting with the first set bit in
1465 * shared_bits are all equal in both pfn and end_pfn.
1466 */
1467 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1468 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1469 }
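	/*
	 * Worked example (illustrative): pfn = 0x1003, pages = 4 gives
	 * aligned_pages = 4 and bitmask = 0x3. Since pfn is not aligned,
	 * end_pfn = 0x1006, shared_bits = ~(0x1003 ^ 0x1006) & ~0x3 ends in
	 * ...fff8, so mask = __ffs(shared_bits) = 3 and the PSI covers pfns
	 * 0x1000-0x1007, a superset of the requested range.
	 */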
1470
1471 /*
1472 * Fallback to domain selective flush if no PSI support or
1473 * the size is too big.
1474 */
1475 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1476 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1477 DMA_TLB_DSI_FLUSH);
1478 else
1479 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1480 DMA_TLB_PSI_FLUSH);
1481}
1482
1483static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1484 struct dmar_domain *domain,
1485 unsigned long pfn, unsigned int pages,
1486 int ih, int map)
1487{
1488 unsigned int aligned_pages = __roundup_pow_of_two(n: pages);
1489 unsigned int mask = ilog2(aligned_pages);
1490 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1491 u16 did = domain_id_iommu(domain, iommu);
1492
1493 if (WARN_ON(!pages))
1494 return;
1495
1496 if (ih)
1497 ih = 1 << 6;
1498
1499 if (domain->use_first_level)
1500 domain_flush_pasid_iotlb(iommu, domain, addr, npages: pages, ih);
1501 else
1502 __iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih);
1503
1504 /*
1505 * In caching mode, changes of pages from non-present to present require
1506 * flush. However, device IOTLB doesn't need to be flushed in this case.
1507 */
1508 if (!cap_caching_mode(iommu->cap) || !map)
1509 iommu_flush_dev_iotlb(domain, addr, mask);
1510}
1511
1512/* Notification for newly created mappings */
1513static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1514 unsigned long pfn, unsigned int pages)
1515{
1516 /*
1517 * It's a non-present to present mapping. Only flush if caching mode
1518 * and second level.
1519 */
1520 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1521 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, ih: 0, map: 1);
1522 else
1523 iommu_flush_write_buffer(iommu);
1524}
1525
1526/*
1527 * Flush the relevant caches in nested translation if the domain
1528 * also serves as a parent
1529 */
1530static void parent_domain_flush(struct dmar_domain *domain,
1531 unsigned long pfn,
1532 unsigned long pages, int ih)
1533{
1534 struct dmar_domain *s1_domain;
1535
1536 spin_lock(lock: &domain->s1_lock);
1537 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
1538 struct device_domain_info *device_info;
1539 struct iommu_domain_info *info;
1540 unsigned long flags;
1541 unsigned long i;
1542
1543 xa_for_each(&s1_domain->iommu_array, i, info)
1544 __iommu_flush_iotlb_psi(iommu: info->iommu, did: info->did,
1545 pfn, pages, ih);
1546
1547 if (!s1_domain->has_iotlb_device)
1548 continue;
1549
1550 spin_lock_irqsave(&s1_domain->lock, flags);
1551 list_for_each_entry(device_info, &s1_domain->devices, link)
1552 /*
1553 * Address translation cache in device side caches the
1554 * result of nested translation. There is no easy way
1555 * to identify the exact set of nested translations
1556 * affected by a change in S2. So just flush the entire
1557 * device cache.
1558 */
1559 __iommu_flush_dev_iotlb(info: device_info, addr: 0,
1560 MAX_AGAW_PFN_WIDTH);
1561 spin_unlock_irqrestore(lock: &s1_domain->lock, flags);
1562 }
1563 spin_unlock(lock: &domain->s1_lock);
1564}
1565
1566static void intel_flush_iotlb_all(struct iommu_domain *domain)
1567{
1568 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
1569 struct iommu_domain_info *info;
1570 unsigned long idx;
1571
1572 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1573 struct intel_iommu *iommu = info->iommu;
1574 u16 did = domain_id_iommu(domain: dmar_domain, iommu);
1575
1576 if (dmar_domain->use_first_level)
1577 domain_flush_pasid_iotlb(iommu, domain: dmar_domain, addr: 0, npages: -1, ih: 0);
1578 else
1579 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1580 DMA_TLB_DSI_FLUSH);
1581
1582 if (!cap_caching_mode(iommu->cap))
1583 iommu_flush_dev_iotlb(domain: dmar_domain, addr: 0, MAX_AGAW_PFN_WIDTH);
1584 }
1585
1586 if (dmar_domain->nested_parent)
1587 parent_domain_flush(domain: dmar_domain, pfn: 0, pages: -1, ih: 0);
1588}
1589
1590static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1591{
1592 u32 pmen;
1593 unsigned long flags;
1594
1595 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1596 return;
1597
1598 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1599 pmen = readl(addr: iommu->reg + DMAR_PMEN_REG);
1600 pmen &= ~DMA_PMEN_EPM;
1601 writel(val: pmen, addr: iommu->reg + DMAR_PMEN_REG);
1602
1603 /* wait for the protected region status bit to clear */
1604 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1605 readl, !(pmen & DMA_PMEN_PRS), pmen);
1606
1607 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1608}
1609
1610static void iommu_enable_translation(struct intel_iommu *iommu)
1611{
1612 u32 sts;
1613 unsigned long flags;
1614
1615 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1616 iommu->gcmd |= DMA_GCMD_TE;
1617 writel(val: iommu->gcmd, addr: iommu->reg + DMAR_GCMD_REG);
1618
1619 /* Make sure hardware complete it */
1620 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1621 readl, (sts & DMA_GSTS_TES), sts);
1622
1623 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1624}
1625
1626static void iommu_disable_translation(struct intel_iommu *iommu)
1627{
1628 u32 sts;
1629 unsigned long flag;
1630
1631 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1632 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1633 return;
1634
1635 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1636 iommu->gcmd &= ~DMA_GCMD_TE;
1637 writel(val: iommu->gcmd, addr: iommu->reg + DMAR_GCMD_REG);
1638
1639 /* Make sure hardware complete it */
1640 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1641 readl, (!(sts & DMA_GSTS_TES)), sts);
1642
1643 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1644}
1645
1646static int iommu_init_domains(struct intel_iommu *iommu)
1647{
1648 u32 ndomains;
1649
1650 ndomains = cap_ndoms(iommu->cap);
1651 pr_debug("%s: Number of Domains supported <%d>\n",
1652 iommu->name, ndomains);
1653
1654 spin_lock_init(&iommu->lock);
1655
1656 iommu->domain_ids = bitmap_zalloc(nbits: ndomains, GFP_KERNEL);
1657 if (!iommu->domain_ids)
1658 return -ENOMEM;
1659
1660 /*
1661 * If Caching mode is set, then invalid translations are tagged
1662 * with domain-id 0, hence we need to pre-allocate it. We also
1663 * use domain-id 0 as a marker for non-allocated domain-id, so
1664 * make sure it is not used for a real domain.
1665 */
1666 set_bit(nr: 0, addr: iommu->domain_ids);
1667
1668 /*
1669 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1670 * entry for first-level or pass-through translation modes should
1671 * be programmed with a domain id different from those used for
1672 * second-level or nested translation. We reserve a domain id for
1673 * this purpose.
1674 */
1675 if (sm_supported(iommu))
1676 set_bit(FLPT_DEFAULT_DID, addr: iommu->domain_ids);
1677
1678 return 0;
1679}
1680
1681static void disable_dmar_iommu(struct intel_iommu *iommu)
1682{
1683 if (!iommu->domain_ids)
1684 return;
1685
1686 /*
1687 * All iommu domains must have been detached from the devices,
1688 * hence there should be no domain IDs in use.
1689 */
1690 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1691 > NUM_RESERVED_DID))
1692 return;
1693
1694 if (iommu->gcmd & DMA_GCMD_TE)
1695 iommu_disable_translation(iommu);
1696}
1697
1698static void free_dmar_iommu(struct intel_iommu *iommu)
1699{
1700 if (iommu->domain_ids) {
1701 bitmap_free(bitmap: iommu->domain_ids);
1702 iommu->domain_ids = NULL;
1703 }
1704
1705 if (iommu->copied_tables) {
1706 bitmap_free(bitmap: iommu->copied_tables);
1707 iommu->copied_tables = NULL;
1708 }
1709
1710 /* free context mapping */
1711 free_context_table(iommu);
1712
1713#ifdef CONFIG_INTEL_IOMMU_SVM
1714 if (pasid_supported(iommu)) {
1715 if (ecap_prs(iommu->ecap))
1716 intel_svm_finish_prq(iommu);
1717 }
1718#endif
1719}
1720
1721/*
1722 * Check and return whether first level is used by default for
1723 * DMA translation.
1724 */
1725static bool first_level_by_default(unsigned int type)
1726{
1727 /* Only SL is available in legacy mode */
1728 if (!scalable_mode_support())
1729 return false;
1730
	/* Only one level (either FL or SL) is available, just use it */
1732 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1733 return intel_cap_flts_sanity();
1734
1735 /* Both levels are available, decide it based on domain type */
1736 return type != IOMMU_DOMAIN_UNMANAGED;
1737}
1738
1739static struct dmar_domain *alloc_domain(unsigned int type)
1740{
1741 struct dmar_domain *domain;
1742
1743 domain = kzalloc(size: sizeof(*domain), GFP_KERNEL);
1744 if (!domain)
1745 return NULL;
1746
1747 domain->nid = NUMA_NO_NODE;
1748 if (first_level_by_default(type))
1749 domain->use_first_level = true;
1750 domain->has_iotlb_device = false;
1751 INIT_LIST_HEAD(list: &domain->devices);
1752 INIT_LIST_HEAD(list: &domain->dev_pasids);
1753 spin_lock_init(&domain->lock);
1754 xa_init(xa: &domain->iommu_array);
1755
1756 return domain;
1757}
1758
1759int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1760{
1761 struct iommu_domain_info *info, *curr;
1762 unsigned long ndomains;
1763 int num, ret = -ENOSPC;
1764
1765 info = kzalloc(size: sizeof(*info), GFP_KERNEL);
1766 if (!info)
1767 return -ENOMEM;
1768
1769 spin_lock(lock: &iommu->lock);
1770 curr = xa_load(&domain->iommu_array, index: iommu->seq_id);
1771 if (curr) {
1772 curr->refcnt++;
1773 spin_unlock(lock: &iommu->lock);
1774 kfree(objp: info);
1775 return 0;
1776 }
1777
1778 ndomains = cap_ndoms(iommu->cap);
1779 num = find_first_zero_bit(addr: iommu->domain_ids, size: ndomains);
1780 if (num >= ndomains) {
1781 pr_err("%s: No free domain ids\n", iommu->name);
1782 goto err_unlock;
1783 }
1784
1785 set_bit(nr: num, addr: iommu->domain_ids);
1786 info->refcnt = 1;
1787 info->did = num;
1788 info->iommu = iommu;
1789 curr = xa_cmpxchg(xa: &domain->iommu_array, index: iommu->seq_id,
1790 NULL, entry: info, GFP_ATOMIC);
1791 if (curr) {
1792 ret = xa_err(entry: curr) ? : -EBUSY;
1793 goto err_clear;
1794 }
1795 domain_update_iommu_cap(domain);
1796
1797 spin_unlock(lock: &iommu->lock);
1798 return 0;
1799
1800err_clear:
1801 clear_bit(nr: info->did, addr: iommu->domain_ids);
1802err_unlock:
1803 spin_unlock(lock: &iommu->lock);
1804 kfree(objp: info);
1805 return ret;
1806}
1807
1808void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1809{
1810 struct iommu_domain_info *info;
1811
1812 spin_lock(lock: &iommu->lock);
1813 info = xa_load(&domain->iommu_array, index: iommu->seq_id);
1814 if (--info->refcnt == 0) {
1815 clear_bit(nr: info->did, addr: iommu->domain_ids);
1816 xa_erase(&domain->iommu_array, index: iommu->seq_id);
1817 domain->nid = NUMA_NO_NODE;
1818 domain_update_iommu_cap(domain);
1819 kfree(objp: info);
1820 }
1821 spin_unlock(lock: &iommu->lock);
1822}
1823
1824static int guestwidth_to_adjustwidth(int gaw)
1825{
1826 int agaw;
1827 int r = (gaw - 12) % 9;
1828
1829 if (r == 0)
1830 agaw = gaw;
1831 else
1832 agaw = gaw + 9 - r;
1833 if (agaw > 64)
1834 agaw = 64;
1835 return agaw;
1836}
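/*
 * Worked examples (illustrative): gaw = 39 or 48 already sits on a
 * 12 + 9*n boundary and is returned unchanged; gaw = 40 gives
 * r = (40 - 12) % 9 = 1 and rounds up to 40 + 9 - 1 = 48; any result
 * above 64 is clamped to 64.
 */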
1837
1838static void domain_exit(struct dmar_domain *domain)
1839{
1840 if (domain->pgd) {
1841 LIST_HEAD(freelist);
1842
1843 domain_unmap(domain, start_pfn: 0, DOMAIN_MAX_PFN(domain->gaw), freelist: &freelist);
1844 put_pages_list(pages: &freelist);
1845 }
1846
1847 if (WARN_ON(!list_empty(&domain->devices)))
1848 return;
1849
1850 kfree(objp: domain);
1851}
1852
1853static int domain_context_mapping_one(struct dmar_domain *domain,
1854 struct intel_iommu *iommu,
1855 u8 bus, u8 devfn)
1856{
1857 struct device_domain_info *info =
1858 domain_lookup_dev_info(domain, iommu, bus, devfn);
1859 u16 did = domain_id_iommu(domain, iommu);
1860 int translation = CONTEXT_TT_MULTI_LEVEL;
1861 struct dma_pte *pgd = domain->pgd;
1862 struct context_entry *context;
1863 int agaw, ret;
1864
1865 if (hw_pass_through && domain_type_is_si(domain))
1866 translation = CONTEXT_TT_PASS_THROUGH;
1867
1868 pr_debug("Set context mapping for %02x:%02x.%d\n",
1869 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1870
1871 spin_lock(lock: &iommu->lock);
1872 ret = -ENOMEM;
1873 context = iommu_context_addr(iommu, bus, devfn, alloc: 1);
1874 if (!context)
1875 goto out_unlock;
1876
1877 ret = 0;
1878 if (context_present(context) && !context_copied(iommu, bus, devfn))
1879 goto out_unlock;
1880
1881 /*
1882 * For kdump cases, old valid entries may be cached due to the
1883 * in-flight DMA and copied pgtable, but there is no unmapping
1884 * behaviour for them, thus we need an explicit cache flush for
1885 * the newly-mapped device. For kdump, at this point, the device
1886	 * is supposed to have finished its reset at driver probe time, so
1887	 * no in-flight DMA will exist, and we don't need to worry about
1888	 * it hereafter.
1889 */
1890 if (context_copied(iommu, bus, devfn)) {
1891 u16 did_old = context_domain_id(c: context);
1892
1893 if (did_old < cap_ndoms(iommu->cap)) {
1894 iommu->flush.flush_context(iommu, did_old,
1895 (((u16)bus) << 8) | devfn,
1896 DMA_CCMD_MASK_NOBIT,
1897 DMA_CCMD_DEVICE_INVL);
1898 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1899 DMA_TLB_DSI_FLUSH);
1900 }
1901
1902 clear_context_copied(iommu, bus, devfn);
1903 }
1904
1905 context_clear_entry(context);
1906 context_set_domain_id(context, value: did);
1907
1908 if (translation != CONTEXT_TT_PASS_THROUGH) {
1909 /*
1910	 * Skip the top levels of the page tables for an IOMMU which
1911	 * has a smaller agaw than the domain's. Unnecessary for PT mode.
1912 */
1913 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1914 ret = -ENOMEM;
1915 pgd = phys_to_virt(address: dma_pte_addr(pte: pgd));
1916 if (!dma_pte_present(pte: pgd))
1917 goto out_unlock;
1918 }
1919
1920 if (info && info->ats_supported)
1921 translation = CONTEXT_TT_DEV_IOTLB;
1922 else
1923 translation = CONTEXT_TT_MULTI_LEVEL;
1924
1925 context_set_address_root(context, virt_to_phys(address: pgd));
1926 context_set_address_width(context, value: agaw);
1927 } else {
1928 /*
1929 * In pass through mode, AW must be programmed to
1930 * indicate the largest AGAW value supported by
1931 * hardware. And ASR is ignored by hardware.
1932 */
1933 context_set_address_width(context, value: iommu->msagaw);
1934 }
1935
1936 context_set_translation_type(context, value: translation);
1937 context_set_fault_enable(context);
1938 context_set_present(context);
1939 if (!ecap_coherent(iommu->ecap))
1940 clflush_cache_range(addr: context, size: sizeof(*context));
1941
1942 /*
1943 * It's a non-present to present mapping. If hardware doesn't cache
1944	 * non-present entries, we only need to flush the write-buffer. If it
1945	 * _does_ cache non-present entries, then it does so in the special
1946 * domain #0, which we have to flush:
1947 */
1948 if (cap_caching_mode(iommu->cap)) {
1949 iommu->flush.flush_context(iommu, 0,
1950 (((u16)bus) << 8) | devfn,
1951 DMA_CCMD_MASK_NOBIT,
1952 DMA_CCMD_DEVICE_INVL);
1953 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1954 } else {
1955 iommu_flush_write_buffer(iommu);
1956 }
1957
1958 ret = 0;
1959
1960out_unlock:
1961 spin_unlock(lock: &iommu->lock);
1962
1963 return ret;
1964}
1965
1966static int domain_context_mapping_cb(struct pci_dev *pdev,
1967 u16 alias, void *opaque)
1968{
1969 struct device_domain_info *info = dev_iommu_priv_get(dev: &pdev->dev);
1970 struct intel_iommu *iommu = info->iommu;
1971 struct dmar_domain *domain = opaque;
1972
1973 return domain_context_mapping_one(domain, iommu,
1974 PCI_BUS_NUM(alias), devfn: alias & 0xff);
1975}
1976
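/*
 * Install context entries for @dev and all of its DMA aliases in @domain.
 */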
1977static int
1978domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1979{
1980 struct device_domain_info *info = dev_iommu_priv_get(dev);
1981 struct intel_iommu *iommu = info->iommu;
1982 u8 bus = info->bus, devfn = info->devfn;
1983
1984 if (!dev_is_pci(dev))
1985 return domain_context_mapping_one(domain, iommu, bus, devfn);
1986
1987 return pci_for_each_dma_alias(to_pci_dev(dev),
1988 fn: domain_context_mapping_cb, data: domain);
1989}
1990
1991/* Returns a number of VTD pages, but aligned to MM page size */
1992static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1993{
1994 host_addr &= ~PAGE_MASK;
1995 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1996}
1997
1998/* Return largest possible superpage level for a given mapping */
1999static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
2000 unsigned long phy_pfn, unsigned long pages)
2001{
2002 int support, level = 1;
2003 unsigned long pfnmerge;
2004
2005 support = domain->iommu_superpage;
2006
2007 /* To use a large page, the virtual *and* physical addresses
2008 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2009 of them will mean we have to use smaller pages. So just
2010 merge them and check both at once. */
2011 pfnmerge = iov_pfn | phy_pfn;
2012
2013 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2014 pages >>= VTD_STRIDE_SHIFT;
2015 if (!pages)
2016 break;
2017 pfnmerge >>= VTD_STRIDE_SHIFT;
2018 level++;
2019 support--;
2020 }
2021 return level;
2022}
2023
2024/*
2025 * Ensure that old small page tables are removed to make room for superpage(s).
2026 * We're going to add new large pages, so make sure we don't remove their parent
2027 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2028 */
2029static void switch_to_super_page(struct dmar_domain *domain,
2030 unsigned long start_pfn,
2031 unsigned long end_pfn, int level)
2032{
2033 unsigned long lvl_pages = lvl_to_nr_pages(lvl: level);
2034 struct iommu_domain_info *info;
2035 struct dma_pte *pte = NULL;
2036 unsigned long i;
2037
2038 while (start_pfn <= end_pfn) {
2039 if (!pte)
2040 pte = pfn_to_dma_pte(domain, pfn: start_pfn, target_level: &level,
2041 GFP_ATOMIC);
2042
2043 if (dma_pte_present(pte)) {
2044 dma_pte_free_pagetable(domain, start_pfn,
2045 last_pfn: start_pfn + lvl_pages - 1,
2046 retain_level: level + 1);
2047
2048 xa_for_each(&domain->iommu_array, i, info)
2049 iommu_flush_iotlb_psi(iommu: info->iommu, domain,
2050 pfn: start_pfn, pages: lvl_pages,
2051 ih: 0, map: 0);
2052 if (domain->nested_parent)
2053 parent_domain_flush(domain, pfn: start_pfn,
2054 pages: lvl_pages, ih: 0);
2055 }
2056
2057 pte++;
2058 start_pfn += lvl_pages;
2059 if (first_pte_in_page(pte))
2060 pte = NULL;
2061 }
2062}
2063
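/*
 * Map nr_pages pages starting at phys_pfn to iov_pfn in @domain's page
 * table, using superpages whenever the alignment and the remaining size
 * allow it.
 */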
2064static int
2065__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2066 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2067 gfp_t gfp)
2068{
2069 struct dma_pte *first_pte = NULL, *pte = NULL;
2070 unsigned int largepage_lvl = 0;
2071 unsigned long lvl_pages = 0;
2072 phys_addr_t pteval;
2073 u64 attr;
2074
2075 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2076 return -EINVAL;
2077
2078 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2079 return -EINVAL;
2080
2081 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2082 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2083 return -EINVAL;
2084 }
2085
2086 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2087 attr |= DMA_FL_PTE_PRESENT;
2088 if (domain->use_first_level) {
2089 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2090 if (prot & DMA_PTE_WRITE)
2091 attr |= DMA_FL_PTE_DIRTY;
2092 }
2093
2094 domain->has_mappings = true;
2095
2096 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2097
2098 while (nr_pages > 0) {
2099 uint64_t tmp;
2100
2101 if (!pte) {
2102 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2103 phy_pfn: phys_pfn, pages: nr_pages);
2104
2105 pte = pfn_to_dma_pte(domain, pfn: iov_pfn, target_level: &largepage_lvl,
2106 gfp);
2107 if (!pte)
2108 return -ENOMEM;
2109 first_pte = pte;
2110
2111 lvl_pages = lvl_to_nr_pages(lvl: largepage_lvl);
2112
2113			/* It is a large page */
2114 if (largepage_lvl > 1) {
2115 unsigned long end_pfn;
2116 unsigned long pages_to_remove;
2117
2118 pteval |= DMA_PTE_LARGE_PAGE;
2119 pages_to_remove = min_t(unsigned long, nr_pages,
2120 nr_pte_to_next_page(pte) * lvl_pages);
2121 end_pfn = iov_pfn + pages_to_remove - 1;
2122 switch_to_super_page(domain, start_pfn: iov_pfn, end_pfn, level: largepage_lvl);
2123 } else {
2124 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2125 }
2126
2127 }
2128		/* We don't need a lock here; nobody else
2129		 * touches this IOVA range.
2130 */
2131 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2132 if (tmp) {
2133 static int dumps = 5;
2134 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2135 iov_pfn, tmp, (unsigned long long)pteval);
2136 if (dumps) {
2137 dumps--;
2138 debug_dma_dump_mappings(NULL);
2139 }
2140 WARN_ON(1);
2141 }
2142
2143 nr_pages -= lvl_pages;
2144 iov_pfn += lvl_pages;
2145 phys_pfn += lvl_pages;
2146 pteval += lvl_pages * VTD_PAGE_SIZE;
2147
2148 /* If the next PTE would be the first in a new page, then we
2149 * need to flush the cache on the entries we've just written.
2150 * And then we'll need to recalculate 'pte', so clear it and
2151 * let it get set again in the if (!pte) block above.
2152 *
2153 * If we're done (!nr_pages) we need to flush the cache too.
2154 *
2155 * Also if we've been setting superpages, we may need to
2156 * recalculate 'pte' and switch back to smaller pages for the
2157 * end of the mapping, if the trailing size is not enough to
2158 * use another superpage (i.e. nr_pages < lvl_pages).
2159 */
2160 pte++;
2161 if (!nr_pages || first_pte_in_page(pte) ||
2162 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2163 domain_flush_cache(domain, addr: first_pte,
2164 size: (void *)pte - (void *)first_pte);
2165 pte = NULL;
2166 }
2167 }
2168
2169 return 0;
2170}
2171
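/*
 * Clear the context entry for (bus, devfn) and invalidate the context,
 * IOTLB and device-TLB caches for the old domain ID.
 */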
2172static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2173{
2174 struct intel_iommu *iommu = info->iommu;
2175 struct context_entry *context;
2176 u16 did_old;
2177
2178 spin_lock(lock: &iommu->lock);
2179 context = iommu_context_addr(iommu, bus, devfn, alloc: 0);
2180 if (!context) {
2181 spin_unlock(lock: &iommu->lock);
2182 return;
2183 }
2184
2185 did_old = context_domain_id(c: context);
2186
2187 context_clear_entry(context);
2188 __iommu_flush_cache(iommu, addr: context, size: sizeof(*context));
2189 spin_unlock(lock: &iommu->lock);
2190 iommu->flush.flush_context(iommu,
2191 did_old,
2192 (((u16)bus) << 8) | devfn,
2193 DMA_CCMD_MASK_NOBIT,
2194 DMA_CCMD_DEVICE_INVL);
2195
2196 iommu->flush.flush_iotlb(iommu,
2197 did_old,
2198 0,
2199 0,
2200 DMA_TLB_DSI_FLUSH);
2201
2202 __iommu_flush_dev_iotlb(info, addr: 0, MAX_AGAW_PFN_WIDTH);
2203}
2204
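/*
 * Set up a first-level (scalable-mode) PASID table entry for @dev,
 * skipping page-table levels if the IOMMU supports a smaller agaw than
 * the domain, and selecting 4- or 5-level paging from the result.
 */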
2205static int domain_setup_first_level(struct intel_iommu *iommu,
2206 struct dmar_domain *domain,
2207 struct device *dev,
2208 u32 pasid)
2209{
2210 struct dma_pte *pgd = domain->pgd;
2211 int agaw, level;
2212 int flags = 0;
2213
2214 /*
2215	 * Skip the top levels of the page tables for an IOMMU which
2216	 * has a smaller agaw than the domain's. Unnecessary for PT mode.
2217 */
2218 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2219 pgd = phys_to_virt(address: dma_pte_addr(pte: pgd));
2220 if (!dma_pte_present(pte: pgd))
2221 return -ENOMEM;
2222 }
2223
2224 level = agaw_to_level(agaw);
2225 if (level != 4 && level != 5)
2226 return -EINVAL;
2227
2228 if (level == 5)
2229 flags |= PASID_FLAG_FL5LP;
2230
2231 if (domain->force_snooping)
2232 flags |= PASID_FLAG_PAGE_SNOOP;
2233
2234 return intel_pasid_setup_first_level(iommu, dev, pgd: (pgd_t *)pgd, pasid,
2235 did: domain_id_iommu(domain, iommu),
2236 flags);
2237}
2238
2239static bool dev_is_real_dma_subdevice(struct device *dev)
2240{
2241 return dev && dev_is_pci(dev) &&
2242 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2243}
2244
2245static int iommu_domain_identity_map(struct dmar_domain *domain,
2246 unsigned long first_vpfn,
2247 unsigned long last_vpfn)
2248{
2249 /*
2250	 * The RMRR range might overlap with a physical memory range,
2251	 * so clear it first.
2252 */
2253 dma_pte_clear_range(domain, start_pfn: first_vpfn, last_pfn: last_vpfn);
2254
2255 return __domain_mapping(domain, iov_pfn: first_vpfn,
2256 phys_pfn: first_vpfn, nr_pages: last_vpfn - first_vpfn + 1,
2257 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2258}
2259
2260static int md_domain_init(struct dmar_domain *domain, int guest_width);
2261
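/*
 * Initialize the static identity domain. Unless hardware pass-through is
 * used, identity-map all usable system memory and all RMRR ranges so that
 * devices with RMRRs can share si_domain.
 */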
2262static int __init si_domain_init(int hw)
2263{
2264 struct dmar_rmrr_unit *rmrr;
2265 struct device *dev;
2266 int i, nid, ret;
2267
2268 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2269 if (!si_domain)
2270 return -EFAULT;
2271
2272 if (md_domain_init(domain: si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2273 domain_exit(domain: si_domain);
2274 si_domain = NULL;
2275 return -EFAULT;
2276 }
2277
2278 if (hw)
2279 return 0;
2280
2281 for_each_online_node(nid) {
2282 unsigned long start_pfn, end_pfn;
2283 int i;
2284
2285 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2286 ret = iommu_domain_identity_map(domain: si_domain,
2287 first_vpfn: mm_to_dma_pfn_start(mm_pfn: start_pfn),
2288 last_vpfn: mm_to_dma_pfn_end(mm_pfn: end_pfn));
2289 if (ret)
2290 return ret;
2291 }
2292 }
2293
2294 /*
2295	 * Identity map the RMRRs so that devices with RMRRs can also use
2296 * the si_domain.
2297 */
2298 for_each_rmrr_units(rmrr) {
2299 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2300 i, dev) {
2301 unsigned long long start = rmrr->base_address;
2302 unsigned long long end = rmrr->end_address;
2303
2304 if (WARN_ON(end < start ||
2305 end >> agaw_to_width(si_domain->agaw)))
2306 continue;
2307
2308 ret = iommu_domain_identity_map(domain: si_domain,
2309 first_vpfn: mm_to_dma_pfn_start(mm_pfn: start >> PAGE_SHIFT),
2310 last_vpfn: mm_to_dma_pfn_end(mm_pfn: end >> PAGE_SHIFT));
2311 if (ret)
2312 return ret;
2313 }
2314 }
2315
2316 return 0;
2317}
2318
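/*
 * Attach @dev to @domain: take a reference on the domain ID for the
 * device's IOMMU, add the device to the domain's device list, and program
 * the context or PASID table entry depending on scalable-mode support.
 */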
2319static int dmar_domain_attach_device(struct dmar_domain *domain,
2320 struct device *dev)
2321{
2322 struct device_domain_info *info = dev_iommu_priv_get(dev);
2323 struct intel_iommu *iommu = info->iommu;
2324 unsigned long flags;
2325 int ret;
2326
2327 ret = domain_attach_iommu(domain, iommu);
2328 if (ret)
2329 return ret;
2330 info->domain = domain;
2331 spin_lock_irqsave(&domain->lock, flags);
2332 list_add(new: &info->link, head: &domain->devices);
2333 spin_unlock_irqrestore(lock: &domain->lock, flags);
2334
2335 if (dev_is_real_dma_subdevice(dev))
2336 return 0;
2337
2338 if (!sm_supported(iommu))
2339 ret = domain_context_mapping(domain, dev);
2340 else if (hw_pass_through && domain_type_is_si(domain))
2341 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
2342 else if (domain->use_first_level)
2343 ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
2344 else
2345 ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
2346
2347 if (ret) {
2348 device_block_translation(dev);
2349 return ret;
2350 }
2351
2352 if (sm_supported(info->iommu) || !domain_type_is_si(domain: info->domain))
2353 iommu_enable_pci_caps(info);
2354
2355 return 0;
2356}
2357
2358/**
2359 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2360 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2361 * @dev: device handle
2362 *
2363 * We assume that PCI USB devices with RMRRs have them largely
2364 * for historical reasons and that the RMRR space is not actively used post
2365 * boot. This exclusion may change if vendors begin to abuse it.
2366 *
2367 * The same exception is made for graphics devices, with the requirement that
2368 * any use of the RMRR regions will be torn down before assigning the device
2369 * to a guest.
2370 *
2371 * Return: true if the RMRR is relaxable, false otherwise
2372 */
2373static bool device_rmrr_is_relaxable(struct device *dev)
2374{
2375 struct pci_dev *pdev;
2376
2377 if (!dev_is_pci(dev))
2378 return false;
2379
2380 pdev = to_pci_dev(dev);
2381 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2382 return true;
2383 else
2384 return false;
2385}
2386
2387/*
2388 * Return the required default domain type for a specific device.
2389 *
2390 * @dev: the device in query
2392 *
2393 * Returns:
2394 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2395 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2396 * - 0: both identity and dynamic domains work for this device
2397 */
2398static int device_def_domain_type(struct device *dev)
2399{
2400 if (dev_is_pci(dev)) {
2401 struct pci_dev *pdev = to_pci_dev(dev);
2402
2403 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2404 return IOMMU_DOMAIN_IDENTITY;
2405
2406 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2407 return IOMMU_DOMAIN_IDENTITY;
2408 }
2409
2410 return 0;
2411}
2412
2413static void intel_iommu_init_qi(struct intel_iommu *iommu)
2414{
2415 /*
2416	 * Start from a sane IOMMU hardware state.
2417	 * If queued invalidation was already initialized by us
2418	 * (for example, while enabling interrupt remapping), then
2419	 * things are already rolling from a sane state.
2420 */
2421 if (!iommu->qi) {
2422 /*
2423 * Clear any previous faults.
2424 */
2425 dmar_fault(irq: -1, dev_id: iommu);
2426 /*
2427 * Disable queued invalidation if supported and already enabled
2428 * before OS handover.
2429 */
2430 dmar_disable_qi(iommu);
2431 }
2432
2433 if (dmar_enable_qi(iommu)) {
2434 /*
2435 * Queued Invalidate not enabled, use Register Based Invalidate
2436 */
2437 iommu->flush.flush_context = __iommu_flush_context;
2438 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2439 pr_info("%s: Using Register based invalidation\n",
2440 iommu->name);
2441 } else {
2442 iommu->flush.flush_context = qi_flush_context;
2443 iommu->flush.flush_iotlb = qi_flush_iotlb;
2444 pr_info("%s: Using Queued invalidation\n", iommu->name);
2445 }
2446}
2447
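/*
 * Copy one bus's context entries from the previous kernel's tables
 * (kdump case), marking each present entry as copied and reserving its
 * domain ID.
 */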
2448static int copy_context_table(struct intel_iommu *iommu,
2449 struct root_entry *old_re,
2450 struct context_entry **tbl,
2451 int bus, bool ext)
2452{
2453 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2454 struct context_entry *new_ce = NULL, ce;
2455 struct context_entry *old_ce = NULL;
2456 struct root_entry re;
2457 phys_addr_t old_ce_phys;
2458
2459 tbl_idx = ext ? bus * 2 : bus;
2460 memcpy(&re, old_re, sizeof(re));
2461
2462 for (devfn = 0; devfn < 256; devfn++) {
2463 /* First calculate the correct index */
2464 idx = (ext ? devfn * 2 : devfn) % 256;
2465
2466 if (idx == 0) {
2467 /* First save what we may have and clean up */
2468 if (new_ce) {
2469 tbl[tbl_idx] = new_ce;
2470 __iommu_flush_cache(iommu, addr: new_ce,
2471 VTD_PAGE_SIZE);
2472 pos = 1;
2473 }
2474
2475 if (old_ce)
2476 memunmap(addr: old_ce);
2477
2478 ret = 0;
2479 if (devfn < 0x80)
2480 old_ce_phys = root_entry_lctp(re: &re);
2481 else
2482 old_ce_phys = root_entry_uctp(re: &re);
2483
2484 if (!old_ce_phys) {
2485 if (ext && devfn == 0) {
2486 /* No LCTP, try UCTP */
2487 devfn = 0x7f;
2488 continue;
2489 } else {
2490 goto out;
2491 }
2492 }
2493
2494 ret = -ENOMEM;
2495 old_ce = memremap(offset: old_ce_phys, PAGE_SIZE,
2496 flags: MEMREMAP_WB);
2497 if (!old_ce)
2498 goto out;
2499
2500 new_ce = alloc_pgtable_page(node: iommu->node, GFP_KERNEL);
2501 if (!new_ce)
2502 goto out_unmap;
2503
2504 ret = 0;
2505 }
2506
2507 /* Now copy the context entry */
2508 memcpy(&ce, old_ce + idx, sizeof(ce));
2509
2510 if (!context_present(context: &ce))
2511 continue;
2512
2513 did = context_domain_id(c: &ce);
2514 if (did >= 0 && did < cap_ndoms(iommu->cap))
2515 set_bit(nr: did, addr: iommu->domain_ids);
2516
2517 set_context_copied(iommu, bus, devfn);
2518 new_ce[idx] = ce;
2519 }
2520
2521 tbl[tbl_idx + pos] = new_ce;
2522
2523 __iommu_flush_cache(iommu, addr: new_ce, VTD_PAGE_SIZE);
2524
2525out_unmap:
2526 memunmap(addr: old_ce);
2527
2528out:
2529 return ret;
2530}
2531
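/*
 * In a kdump kernel, copy the root and context tables programmed by the
 * previous kernel so that translations remain valid for any in-flight DMA
 * until the devices are reset by their drivers. Bails out if the old and
 * new tables disagree on scalable (extended) mode.
 */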
2532static int copy_translation_tables(struct intel_iommu *iommu)
2533{
2534 struct context_entry **ctxt_tbls;
2535 struct root_entry *old_rt;
2536 phys_addr_t old_rt_phys;
2537 int ctxt_table_entries;
2538 u64 rtaddr_reg;
2539 int bus, ret;
2540 bool new_ext, ext;
2541
2542 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2543 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2544 new_ext = !!sm_supported(iommu);
2545
2546 /*
2547 * The RTT bit can only be changed when translation is disabled,
2548	 * but disabling translation would open a window for data
2549 * corruption. So bail out and don't copy anything if we would
2550 * have to change the bit.
2551 */
2552 if (new_ext != ext)
2553 return -EINVAL;
2554
2555 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2556 if (!iommu->copied_tables)
2557 return -ENOMEM;
2558
2559 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2560 if (!old_rt_phys)
2561 return -EINVAL;
2562
2563 old_rt = memremap(offset: old_rt_phys, PAGE_SIZE, flags: MEMREMAP_WB);
2564 if (!old_rt)
2565 return -ENOMEM;
2566
2567 /* This is too big for the stack - allocate it from slab */
2568 ctxt_table_entries = ext ? 512 : 256;
2569 ret = -ENOMEM;
2570 ctxt_tbls = kcalloc(n: ctxt_table_entries, size: sizeof(void *), GFP_KERNEL);
2571 if (!ctxt_tbls)
2572 goto out_unmap;
2573
2574 for (bus = 0; bus < 256; bus++) {
2575 ret = copy_context_table(iommu, old_re: &old_rt[bus],
2576 tbl: ctxt_tbls, bus, ext);
2577 if (ret) {
2578 pr_err("%s: Failed to copy context table for bus %d\n",
2579 iommu->name, bus);
2580 continue;
2581 }
2582 }
2583
2584 spin_lock(lock: &iommu->lock);
2585
2586 /* Context tables are copied, now write them to the root_entry table */
2587 for (bus = 0; bus < 256; bus++) {
2588 int idx = ext ? bus * 2 : bus;
2589 u64 val;
2590
2591 if (ctxt_tbls[idx]) {
2592 val = virt_to_phys(address: ctxt_tbls[idx]) | 1;
2593 iommu->root_entry[bus].lo = val;
2594 }
2595
2596 if (!ext || !ctxt_tbls[idx + 1])
2597 continue;
2598
2599 val = virt_to_phys(address: ctxt_tbls[idx + 1]) | 1;
2600 iommu->root_entry[bus].hi = val;
2601 }
2602
2603 spin_unlock(lock: &iommu->lock);
2604
2605 kfree(objp: ctxt_tbls);
2606
2607 __iommu_flush_cache(iommu, addr: iommu->root_entry, PAGE_SIZE);
2608
2609 ret = 0;
2610
2611out_unmap:
2612 memunmap(addr: old_rt);
2613
2614 return ret;
2615}
2616
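/*
 * One-time initialization of all DMAR units: audit capabilities, set up
 * queued invalidation, domain IDs and root entries per IOMMU, optionally
 * copy translation tables from the previous kernel (kdump), build the
 * identity domain and enable fault/page-request interrupts.
 */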
2617static int __init init_dmars(void)
2618{
2619 struct dmar_drhd_unit *drhd;
2620 struct intel_iommu *iommu;
2621 int ret;
2622
2623 ret = intel_cap_audit(type: CAP_AUDIT_STATIC_DMAR, NULL);
2624 if (ret)
2625 goto free_iommu;
2626
2627 for_each_iommu(iommu, drhd) {
2628 if (drhd->ignored) {
2629 iommu_disable_translation(iommu);
2630 continue;
2631 }
2632
2633 /*
2634		 * Find the max PASID size of all IOMMUs in the system.
2635		 * We need to ensure the system PASID table is no bigger
2636		 * than the smallest supported size.
2637 */
2638 if (pasid_supported(iommu)) {
2639 u32 temp = 2 << ecap_pss(iommu->ecap);
2640
2641 intel_pasid_max_id = min_t(u32, temp,
2642 intel_pasid_max_id);
2643 }
2644
2645 intel_iommu_init_qi(iommu);
2646
2647 ret = iommu_init_domains(iommu);
2648 if (ret)
2649 goto free_iommu;
2650
2651 init_translation_status(iommu);
2652
2653 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2654 iommu_disable_translation(iommu);
2655 clear_translation_pre_enabled(iommu);
2656 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2657 iommu->name);
2658 }
2659
2660 /*
2661 * TBD:
2662		 * we could share the same root & context tables
2663		 * among all IOMMUs. This needs to be split out later.
2664 */
2665 ret = iommu_alloc_root_entry(iommu);
2666 if (ret)
2667 goto free_iommu;
2668
2669 if (translation_pre_enabled(iommu)) {
2670 pr_info("Translation already enabled - trying to copy translation structures\n");
2671
2672 ret = copy_translation_tables(iommu);
2673 if (ret) {
2674 /*
2675 * We found the IOMMU with translation
2676 * enabled - but failed to copy over the
2677 * old root-entry table. Try to proceed
2678 * by disabling translation now and
2679 * allocating a clean root-entry table.
2680 * This might cause DMAR faults, but
2681 * probably the dump will still succeed.
2682 */
2683 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2684 iommu->name);
2685 iommu_disable_translation(iommu);
2686 clear_translation_pre_enabled(iommu);
2687 } else {
2688 pr_info("Copied translation tables from previous kernel for %s\n",
2689 iommu->name);
2690 }
2691 }
2692
2693 if (!ecap_pass_through(iommu->ecap))
2694 hw_pass_through = 0;
2695 intel_svm_check(iommu);
2696 }
2697
2698 /*
2699 * Now that qi is enabled on all iommus, set the root entry and flush
2700 * caches. This is required on some Intel X58 chipsets, otherwise the
2701 * flush_context function will loop forever and the boot hangs.
2702 */
2703 for_each_active_iommu(iommu, drhd) {
2704 iommu_flush_write_buffer(iommu);
2705 iommu_set_root_entry(iommu);
2706 }
2707
2708 if (!dmar_map_gfx)
2709 iommu_identity_mapping |= IDENTMAP_GFX;
2710
2711 check_tylersburg_isoch();
2712
2713 ret = si_domain_init(hw: hw_pass_through);
2714 if (ret)
2715 goto free_iommu;
2716
2717 /*
2718 * for each drhd
2719 * enable fault log
2720 * global invalidate context cache
2721 * global invalidate iotlb
2722 * enable translation
2723 */
2724 for_each_iommu(iommu, drhd) {
2725 if (drhd->ignored) {
2726 /*
2727 * we always have to disable PMRs or DMA may fail on
2728 * this device
2729 */
2730 if (force_on)
2731 iommu_disable_protect_mem_regions(iommu);
2732 continue;
2733 }
2734
2735 iommu_flush_write_buffer(iommu);
2736
2737#ifdef CONFIG_INTEL_IOMMU_SVM
2738 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2739 /*
2740			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2741			 * could cause a lock race condition, so drop the lock here.
2742 */
2743 up_write(sem: &dmar_global_lock);
2744 ret = intel_svm_enable_prq(iommu);
2745 down_write(sem: &dmar_global_lock);
2746 if (ret)
2747 goto free_iommu;
2748 }
2749#endif
2750 ret = dmar_set_interrupt(iommu);
2751 if (ret)
2752 goto free_iommu;
2753 }
2754
2755 return 0;
2756
2757free_iommu:
2758 for_each_active_iommu(iommu, drhd) {
2759 disable_dmar_iommu(iommu);
2760 free_dmar_iommu(iommu);
2761 }
2762 if (si_domain) {
2763 domain_exit(domain: si_domain);
2764 si_domain = NULL;
2765 }
2766
2767 return ret;
2768}
2769
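/*
 * Mark DMAR units as ignored when they have no devices in scope, or when
 * they cover only graphics devices and dmar_map_gfx is disabled.
 */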
2770static void __init init_no_remapping_devices(void)
2771{
2772 struct dmar_drhd_unit *drhd;
2773 struct device *dev;
2774 int i;
2775
2776 for_each_drhd_unit(drhd) {
2777 if (!drhd->include_all) {
2778 for_each_active_dev_scope(drhd->devices,
2779 drhd->devices_cnt, i, dev)
2780 break;
2781 /* ignore DMAR unit if no devices exist */
2782 if (i == drhd->devices_cnt)
2783 drhd->ignored = 1;
2784 }
2785 }
2786
2787 for_each_active_drhd_unit(drhd) {
2788 if (drhd->include_all)
2789 continue;
2790
2791 for_each_active_dev_scope(drhd->devices,
2792 drhd->devices_cnt, i, dev)
2793 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2794 break;
2795 if (i < drhd->devices_cnt)
2796 continue;
2797
2798 /* This IOMMU has *only* gfx devices. Either bypass it or
2799		   set the gfx_dedicated flag, as appropriate */
2800 drhd->gfx_dedicated = 1;
2801 if (!dmar_map_gfx)
2802 drhd->ignored = 1;
2803 }
2804}
2805
2806#ifdef CONFIG_SUSPEND
2807static int init_iommu_hw(void)
2808{
2809 struct dmar_drhd_unit *drhd;
2810 struct intel_iommu *iommu = NULL;
2811 int ret;
2812
2813 for_each_active_iommu(iommu, drhd) {
2814 if (iommu->qi) {
2815 ret = dmar_reenable_qi(iommu);
2816 if (ret)
2817 return ret;
2818 }
2819 }
2820
2821 for_each_iommu(iommu, drhd) {
2822 if (drhd->ignored) {
2823 /*
2824 * we always have to disable PMRs or DMA may fail on
2825 * this device
2826 */
2827 if (force_on)
2828 iommu_disable_protect_mem_regions(iommu);
2829 continue;
2830 }
2831
2832 iommu_flush_write_buffer(iommu);
2833 iommu_set_root_entry(iommu);
2834 iommu_enable_translation(iommu);
2835 iommu_disable_protect_mem_regions(iommu);
2836 }
2837
2838 return 0;
2839}
2840
2841static void iommu_flush_all(void)
2842{
2843 struct dmar_drhd_unit *drhd;
2844 struct intel_iommu *iommu;
2845
2846 for_each_active_iommu(iommu, drhd) {
2847 iommu->flush.flush_context(iommu, 0, 0, 0,
2848 DMA_CCMD_GLOBAL_INVL);
2849 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2850 DMA_TLB_GLOBAL_FLUSH);
2851 }
2852}
2853
2854static int iommu_suspend(void)
2855{
2856 struct dmar_drhd_unit *drhd;
2857 struct intel_iommu *iommu = NULL;
2858 unsigned long flag;
2859
2860 iommu_flush_all();
2861
2862 for_each_active_iommu(iommu, drhd) {
2863 iommu_disable_translation(iommu);
2864
2865 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2866
2867 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2868 readl(addr: iommu->reg + DMAR_FECTL_REG);
2869 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2870 readl(addr: iommu->reg + DMAR_FEDATA_REG);
2871 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2872 readl(addr: iommu->reg + DMAR_FEADDR_REG);
2873 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2874 readl(addr: iommu->reg + DMAR_FEUADDR_REG);
2875
2876 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2877 }
2878 return 0;
2879}
2880
2881static void iommu_resume(void)
2882{
2883 struct dmar_drhd_unit *drhd;
2884 struct intel_iommu *iommu = NULL;
2885 unsigned long flag;
2886
2887 if (init_iommu_hw()) {
2888 if (force_on)
2889 panic(fmt: "tboot: IOMMU setup failed, DMAR can not resume!\n");
2890 else
2891 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2892 return;
2893 }
2894
2895 for_each_active_iommu(iommu, drhd) {
2896
2897 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2898
2899 writel(val: iommu->iommu_state[SR_DMAR_FECTL_REG],
2900 addr: iommu->reg + DMAR_FECTL_REG);
2901 writel(val: iommu->iommu_state[SR_DMAR_FEDATA_REG],
2902 addr: iommu->reg + DMAR_FEDATA_REG);
2903 writel(val: iommu->iommu_state[SR_DMAR_FEADDR_REG],
2904 addr: iommu->reg + DMAR_FEADDR_REG);
2905 writel(val: iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2906 addr: iommu->reg + DMAR_FEUADDR_REG);
2907
2908 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2909 }
2910}
2911
2912static struct syscore_ops iommu_syscore_ops = {
2913 .resume = iommu_resume,
2914 .suspend = iommu_suspend,
2915};
2916
2917static void __init init_iommu_pm_ops(void)
2918{
2919 register_syscore_ops(ops: &iommu_syscore_ops);
2920}
2921
2922#else
2923static inline void init_iommu_pm_ops(void) {}
2924#endif /* CONFIG_SUSPEND */
2925
2926static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2927{
2928 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2929 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2930 rmrr->end_address <= rmrr->base_address ||
2931 arch_rmrr_sanity_check(rmrr))
2932 return -EINVAL;
2933
2934 return 0;
2935}
2936
2937int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2938{
2939 struct acpi_dmar_reserved_memory *rmrr;
2940 struct dmar_rmrr_unit *rmrru;
2941
2942 rmrr = (struct acpi_dmar_reserved_memory *)header;
2943 if (rmrr_sanity_check(rmrr)) {
2944 pr_warn(FW_BUG
2945 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2946 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2947 rmrr->base_address, rmrr->end_address,
2948 dmi_get_system_info(DMI_BIOS_VENDOR),
2949 dmi_get_system_info(DMI_BIOS_VERSION),
2950 dmi_get_system_info(DMI_PRODUCT_VERSION));
2951 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2952 }
2953
2954 rmrru = kzalloc(size: sizeof(*rmrru), GFP_KERNEL);
2955 if (!rmrru)
2956 goto out;
2957
2958 rmrru->hdr = header;
2959
2960 rmrru->base_address = rmrr->base_address;
2961 rmrru->end_address = rmrr->end_address;
2962
2963 rmrru->devices = dmar_alloc_dev_scope(start: (void *)(rmrr + 1),
2964 end: ((void *)rmrr) + rmrr->header.length,
2965 cnt: &rmrru->devices_cnt);
2966 if (rmrru->devices_cnt && rmrru->devices == NULL)
2967 goto free_rmrru;
2968
2969 list_add(new: &rmrru->list, head: &dmar_rmrr_units);
2970
2971 return 0;
2972free_rmrru:
2973 kfree(objp: rmrru);
2974out:
2975 return -ENOMEM;
2976}
2977
2978static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2979{
2980 struct dmar_atsr_unit *atsru;
2981 struct acpi_dmar_atsr *tmp;
2982
2983 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2984 dmar_rcu_check()) {
2985 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2986 if (atsr->segment != tmp->segment)
2987 continue;
2988 if (atsr->header.length != tmp->header.length)
2989 continue;
2990 if (memcmp(p: atsr, q: tmp, size: atsr->header.length) == 0)
2991 return atsru;
2992 }
2993
2994 return NULL;
2995}
2996
2997int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2998{
2999 struct acpi_dmar_atsr *atsr;
3000 struct dmar_atsr_unit *atsru;
3001
3002 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3003 return 0;
3004
3005 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3006 atsru = dmar_find_atsr(atsr);
3007 if (atsru)
3008 return 0;
3009
3010 atsru = kzalloc(size: sizeof(*atsru) + hdr->length, GFP_KERNEL);
3011 if (!atsru)
3012 return -ENOMEM;
3013
3014 /*
3015 * If memory is allocated from slab by ACPI _DSM method, we need to
3016 * copy the memory content because the memory buffer will be freed
3017 * on return.
3018 */
3019 atsru->hdr = (void *)(atsru + 1);
3020 memcpy(atsru->hdr, hdr, hdr->length);
3021 atsru->include_all = atsr->flags & 0x1;
3022 if (!atsru->include_all) {
3023 atsru->devices = dmar_alloc_dev_scope(start: (void *)(atsr + 1),
3024 end: (void *)atsr + atsr->header.length,
3025 cnt: &atsru->devices_cnt);
3026 if (atsru->devices_cnt && atsru->devices == NULL) {
3027 kfree(objp: atsru);
3028 return -ENOMEM;
3029 }
3030 }
3031
3032 list_add_rcu(new: &atsru->list, head: &dmar_atsr_units);
3033
3034 return 0;
3035}
3036
3037static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3038{
3039 dmar_free_dev_scope(devices: &atsru->devices, cnt: &atsru->devices_cnt);
3040 kfree(objp: atsru);
3041}
3042
3043int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3044{
3045 struct acpi_dmar_atsr *atsr;
3046 struct dmar_atsr_unit *atsru;
3047
3048 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3049 atsru = dmar_find_atsr(atsr);
3050 if (atsru) {
3051 list_del_rcu(entry: &atsru->list);
3052 synchronize_rcu();
3053 intel_iommu_free_atsr(atsru);
3054 }
3055
3056 return 0;
3057}
3058
3059int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3060{
3061 int i;
3062 struct device *dev;
3063 struct acpi_dmar_atsr *atsr;
3064 struct dmar_atsr_unit *atsru;
3065
3066 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3067 atsru = dmar_find_atsr(atsr);
3068 if (!atsru)
3069 return 0;
3070
3071 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3072 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3073 i, dev)
3074 return -EBUSY;
3075 }
3076
3077 return 0;
3078}
3079
3080static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3081{
3082 struct dmar_satc_unit *satcu;
3083 struct acpi_dmar_satc *tmp;
3084
3085 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3086 dmar_rcu_check()) {
3087 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3088 if (satc->segment != tmp->segment)
3089 continue;
3090 if (satc->header.length != tmp->header.length)
3091 continue;
3092 if (memcmp(p: satc, q: tmp, size: satc->header.length) == 0)
3093 return satcu;
3094 }
3095
3096 return NULL;
3097}
3098
3099int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3100{
3101 struct acpi_dmar_satc *satc;
3102 struct dmar_satc_unit *satcu;
3103
3104 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3105 return 0;
3106
3107 satc = container_of(hdr, struct acpi_dmar_satc, header);
3108 satcu = dmar_find_satc(satc);
3109 if (satcu)
3110 return 0;
3111
3112 satcu = kzalloc(size: sizeof(*satcu) + hdr->length, GFP_KERNEL);
3113 if (!satcu)
3114 return -ENOMEM;
3115
3116 satcu->hdr = (void *)(satcu + 1);
3117 memcpy(satcu->hdr, hdr, hdr->length);
3118 satcu->atc_required = satc->flags & 0x1;
3119 satcu->devices = dmar_alloc_dev_scope(start: (void *)(satc + 1),
3120 end: (void *)satc + satc->header.length,
3121 cnt: &satcu->devices_cnt);
3122 if (satcu->devices_cnt && !satcu->devices) {
3123 kfree(objp: satcu);
3124 return -ENOMEM;
3125 }
3126 list_add_rcu(new: &satcu->list, head: &dmar_satc_units);
3127
3128 return 0;
3129}
3130
3131static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3132{
3133 int sp, ret;
3134 struct intel_iommu *iommu = dmaru->iommu;
3135
3136 ret = intel_cap_audit(type: CAP_AUDIT_HOTPLUG_DMAR, iommu);
3137 if (ret)
3138 goto out;
3139
3140 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3141 pr_warn("%s: Doesn't support hardware pass through.\n",
3142 iommu->name);
3143 return -ENXIO;
3144 }
3145
3146 sp = domain_update_iommu_superpage(NULL, skip: iommu) - 1;
3147 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3148 pr_warn("%s: Doesn't support large page.\n",
3149 iommu->name);
3150 return -ENXIO;
3151 }
3152
3153 /*
3154 * Disable translation if already enabled prior to OS handover.
3155 */
3156 if (iommu->gcmd & DMA_GCMD_TE)
3157 iommu_disable_translation(iommu);
3158
3159 ret = iommu_init_domains(iommu);
3160 if (ret == 0)
3161 ret = iommu_alloc_root_entry(iommu);
3162 if (ret)
3163 goto out;
3164
3165 intel_svm_check(iommu);
3166
3167 if (dmaru->ignored) {
3168 /*
3169 * we always have to disable PMRs or DMA may fail on this device
3170 */
3171 if (force_on)
3172 iommu_disable_protect_mem_regions(iommu);
3173 return 0;
3174 }
3175
3176 intel_iommu_init_qi(iommu);
3177 iommu_flush_write_buffer(iommu);
3178
3179#ifdef CONFIG_INTEL_IOMMU_SVM
3180 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3181 ret = intel_svm_enable_prq(iommu);
3182 if (ret)
3183 goto disable_iommu;
3184 }
3185#endif
3186 ret = dmar_set_interrupt(iommu);
3187 if (ret)
3188 goto disable_iommu;
3189
3190 iommu_set_root_entry(iommu);
3191 iommu_enable_translation(iommu);
3192
3193 iommu_disable_protect_mem_regions(iommu);
3194 return 0;
3195
3196disable_iommu:
3197 disable_dmar_iommu(iommu);
3198out:
3199 free_dmar_iommu(iommu);
3200 return ret;
3201}
3202
3203int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3204{
3205 int ret = 0;
3206 struct intel_iommu *iommu = dmaru->iommu;
3207
3208 if (!intel_iommu_enabled)
3209 return 0;
3210 if (iommu == NULL)
3211 return -EINVAL;
3212
3213 if (insert) {
3214 ret = intel_iommu_add(dmaru);
3215 } else {
3216 disable_dmar_iommu(iommu);
3217 free_dmar_iommu(iommu);
3218 }
3219
3220 return ret;
3221}
3222
3223static void intel_iommu_free_dmars(void)
3224{
3225 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3226 struct dmar_atsr_unit *atsru, *atsr_n;
3227 struct dmar_satc_unit *satcu, *satc_n;
3228
3229 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3230 list_del(entry: &rmrru->list);
3231 dmar_free_dev_scope(devices: &rmrru->devices, cnt: &rmrru->devices_cnt);
3232 kfree(objp: rmrru);
3233 }
3234
3235 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3236 list_del(entry: &atsru->list);
3237 intel_iommu_free_atsr(atsru);
3238 }
3239 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3240 list_del(entry: &satcu->list);
3241 dmar_free_dev_scope(devices: &satcu->devices, cnt: &satcu->devices_cnt);
3242 kfree(objp: satcu);
3243 }
3244}
3245
3246static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3247{
3248 struct dmar_satc_unit *satcu;
3249 struct acpi_dmar_satc *satc;
3250 struct device *tmp;
3251 int i;
3252
3253 dev = pci_physfn(dev);
3254 rcu_read_lock();
3255
3256 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3257 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3258 if (satc->segment != pci_domain_nr(bus: dev->bus))
3259 continue;
3260 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3261 if (to_pci_dev(tmp) == dev)
3262 goto out;
3263 }
3264 satcu = NULL;
3265out:
3266 rcu_read_unlock();
3267 return satcu;
3268}
3269
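/*
 * Decide whether ATS may be enabled for @dev: consult the SATC table
 * first, otherwise walk up to the root port and match it against the
 * ATSR units of the device's segment.
 */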
3270static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3271{
3272 int i, ret = 1;
3273 struct pci_bus *bus;
3274 struct pci_dev *bridge = NULL;
3275 struct device *tmp;
3276 struct acpi_dmar_atsr *atsr;
3277 struct dmar_atsr_unit *atsru;
3278 struct dmar_satc_unit *satcu;
3279
3280 dev = pci_physfn(dev);
3281 satcu = dmar_find_matched_satc_unit(dev);
3282 if (satcu)
3283 /*
3284		 * This device supports ATS because it is listed in the SATC table.
3285		 * When the IOMMU is in legacy mode, the hardware enables ATS
3286		 * automatically for a device that requires it, so the OS should
3287		 * not enable ATS on such a device, to avoid duplicated TLB
3288		 * invalidations.
3289 */
3290 return !(satcu->atc_required && !sm_supported(iommu));
3291
3292 for (bus = dev->bus; bus; bus = bus->parent) {
3293 bridge = bus->self;
3294 /* If it's an integrated device, allow ATS */
3295 if (!bridge)
3296 return 1;
3297 /* Connected via non-PCIe: no ATS */
3298 if (!pci_is_pcie(dev: bridge) ||
3299 pci_pcie_type(dev: bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3300 return 0;
3301 /* If we found the root port, look it up in the ATSR */
3302 if (pci_pcie_type(dev: bridge) == PCI_EXP_TYPE_ROOT_PORT)
3303 break;
3304 }
3305
3306 rcu_read_lock();
3307 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3308 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3309 if (atsr->segment != pci_domain_nr(bus: dev->bus))
3310 continue;
3311
3312 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3313 if (tmp == &bridge->dev)
3314 goto out;
3315
3316 if (atsru->include_all)
3317 goto out;
3318 }
3319 ret = 0;
3320out:
3321 rcu_read_unlock();
3322
3323 return ret;
3324}
3325
3326int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3327{
3328 int ret;
3329 struct dmar_rmrr_unit *rmrru;
3330 struct dmar_atsr_unit *atsru;
3331 struct dmar_satc_unit *satcu;
3332 struct acpi_dmar_atsr *atsr;
3333 struct acpi_dmar_reserved_memory *rmrr;
3334 struct acpi_dmar_satc *satc;
3335
3336 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3337 return 0;
3338
3339 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3340 rmrr = container_of(rmrru->hdr,
3341 struct acpi_dmar_reserved_memory, header);
3342 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3343 ret = dmar_insert_dev_scope(info, start: (void *)(rmrr + 1),
3344 end: ((void *)rmrr) + rmrr->header.length,
3345 segment: rmrr->segment, devices: rmrru->devices,
3346 devices_cnt: rmrru->devices_cnt);
3347 if (ret < 0)
3348 return ret;
3349 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3350 dmar_remove_dev_scope(info, segment: rmrr->segment,
3351 devices: rmrru->devices, count: rmrru->devices_cnt);
3352 }
3353 }
3354
3355 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3356 if (atsru->include_all)
3357 continue;
3358
3359 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3360 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3361 ret = dmar_insert_dev_scope(info, start: (void *)(atsr + 1),
3362 end: (void *)atsr + atsr->header.length,
3363 segment: atsr->segment, devices: atsru->devices,
3364 devices_cnt: atsru->devices_cnt);
3365 if (ret > 0)
3366 break;
3367 else if (ret < 0)
3368 return ret;
3369 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3370 if (dmar_remove_dev_scope(info, segment: atsr->segment,
3371 devices: atsru->devices, count: atsru->devices_cnt))
3372 break;
3373 }
3374 }
3375 list_for_each_entry(satcu, &dmar_satc_units, list) {
3376 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3377 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3378 ret = dmar_insert_dev_scope(info, start: (void *)(satc + 1),
3379 end: (void *)satc + satc->header.length,
3380 segment: satc->segment, devices: satcu->devices,
3381 devices_cnt: satcu->devices_cnt);
3382 if (ret > 0)
3383 break;
3384 else if (ret < 0)
3385 return ret;
3386 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3387 if (dmar_remove_dev_scope(info, segment: satc->segment,
3388 devices: satcu->devices, count: satcu->devices_cnt))
3389 break;
3390 }
3391 }
3392
3393 return 0;
3394}
3395
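/*
 * Keep the identity map in sync with memory hotplug: map ranges that are
 * going online into si_domain, and unmap (and flush) ranges that have
 * gone offline.
 */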
3396static int intel_iommu_memory_notifier(struct notifier_block *nb,
3397 unsigned long val, void *v)
3398{
3399 struct memory_notify *mhp = v;
3400 unsigned long start_vpfn = mm_to_dma_pfn_start(mm_pfn: mhp->start_pfn);
3401 unsigned long last_vpfn = mm_to_dma_pfn_end(mm_pfn: mhp->start_pfn +
3402 mhp->nr_pages - 1);
3403
3404 switch (val) {
3405 case MEM_GOING_ONLINE:
3406 if (iommu_domain_identity_map(domain: si_domain,
3407 first_vpfn: start_vpfn, last_vpfn)) {
3408 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3409 start_vpfn, last_vpfn);
3410 return NOTIFY_BAD;
3411 }
3412 break;
3413
3414 case MEM_OFFLINE:
3415 case MEM_CANCEL_ONLINE:
3416 {
3417 struct dmar_drhd_unit *drhd;
3418 struct intel_iommu *iommu;
3419 LIST_HEAD(freelist);
3420
3421 domain_unmap(domain: si_domain, start_pfn: start_vpfn, last_pfn: last_vpfn, freelist: &freelist);
3422
3423 rcu_read_lock();
3424 for_each_active_iommu(iommu, drhd)
3425 iommu_flush_iotlb_psi(iommu, domain: si_domain,
3426 pfn: start_vpfn, pages: mhp->nr_pages,
3427 ih: list_empty(head: &freelist), map: 0);
3428 rcu_read_unlock();
3429 put_pages_list(pages: &freelist);
3430 }
3431 break;
3432 }
3433
3434 return NOTIFY_OK;
3435}
3436
3437static struct notifier_block intel_iommu_memory_nb = {
3438 .notifier_call = intel_iommu_memory_notifier,
3439 .priority = 0
3440};
3441
3442static void intel_disable_iommus(void)
3443{
3444 struct intel_iommu *iommu = NULL;
3445 struct dmar_drhd_unit *drhd;
3446
3447 for_each_iommu(iommu, drhd)
3448 iommu_disable_translation(iommu);
3449}
3450
3451void intel_iommu_shutdown(void)
3452{
3453 struct dmar_drhd_unit *drhd;
3454 struct intel_iommu *iommu = NULL;
3455
3456 if (no_iommu || dmar_disabled)
3457 return;
3458
3459 down_write(sem: &dmar_global_lock);
3460
3461 /* Disable PMRs explicitly here. */
3462 for_each_iommu(iommu, drhd)
3463 iommu_disable_protect_mem_regions(iommu);
3464
3465 /* Make sure the IOMMUs are switched off */
3466 intel_disable_iommus();
3467
3468 up_write(sem: &dmar_global_lock);
3469}
3470
3471static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3472{
3473 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3474
3475 return container_of(iommu_dev, struct intel_iommu, iommu);
3476}
3477
3478static ssize_t version_show(struct device *dev,
3479 struct device_attribute *attr, char *buf)
3480{
3481 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3482 u32 ver = readl(addr: iommu->reg + DMAR_VER_REG);
3483 return sysfs_emit(buf, fmt: "%d:%d\n",
3484 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3485}
3486static DEVICE_ATTR_RO(version);
3487
3488static ssize_t address_show(struct device *dev,
3489 struct device_attribute *attr, char *buf)
3490{
3491 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3492 return sysfs_emit(buf, fmt: "%llx\n", iommu->reg_phys);
3493}
3494static DEVICE_ATTR_RO(address);
3495
3496static ssize_t cap_show(struct device *dev,
3497 struct device_attribute *attr, char *buf)
3498{
3499 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3500 return sysfs_emit(buf, fmt: "%llx\n", iommu->cap);
3501}
3502static DEVICE_ATTR_RO(cap);
3503
3504static ssize_t ecap_show(struct device *dev,
3505 struct device_attribute *attr, char *buf)
3506{
3507 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3508 return sysfs_emit(buf, fmt: "%llx\n", iommu->ecap);
3509}
3510static DEVICE_ATTR_RO(ecap);
3511
3512static ssize_t domains_supported_show(struct device *dev,
3513 struct device_attribute *attr, char *buf)
3514{
3515 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3516 return sysfs_emit(buf, fmt: "%ld\n", cap_ndoms(iommu->cap));
3517}
3518static DEVICE_ATTR_RO(domains_supported);
3519
3520static ssize_t domains_used_show(struct device *dev,
3521 struct device_attribute *attr, char *buf)
3522{
3523 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3524 return sysfs_emit(buf, fmt: "%d\n",
3525 bitmap_weight(src: iommu->domain_ids,
3526 cap_ndoms(iommu->cap)));
3527}
3528static DEVICE_ATTR_RO(domains_used);
3529
3530static struct attribute *intel_iommu_attrs[] = {
3531 &dev_attr_version.attr,
3532 &dev_attr_address.attr,
3533 &dev_attr_cap.attr,
3534 &dev_attr_ecap.attr,
3535 &dev_attr_domains_supported.attr,
3536 &dev_attr_domains_used.attr,
3537 NULL,
3538};
3539
3540static struct attribute_group intel_iommu_group = {
3541 .name = "intel-iommu",
3542 .attrs = intel_iommu_attrs,
3543};
3544
3545const struct attribute_group *intel_iommu_groups[] = {
3546 &intel_iommu_group,
3547 NULL,
3548};
3549
3550static bool has_external_pci(void)
3551{
3552 struct pci_dev *pdev = NULL;
3553
3554 for_each_pci_dev(pdev)
3555 if (pdev->external_facing) {
3556 pci_dev_put(dev: pdev);
3557 return true;
3558 }
3559
3560 return false;
3561}
3562
3563static int __init platform_optin_force_iommu(void)
3564{
3565 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3566 return 0;
3567
3568 if (no_iommu || dmar_disabled)
3569 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3570
3571 /*
3572	 * If Intel-IOMMU is disabled by default, apply the identity
3573	 * map to all devices except those marked as untrusted.
3574 */
3575 if (dmar_disabled)
3576 iommu_set_default_passthrough(cmd_line: false);
3577
3578 dmar_disabled = 0;
3579 no_iommu = 0;
3580
3581 return 1;
3582}
3583
3584static int __init probe_acpi_namespace_devices(void)
3585{
3586 struct dmar_drhd_unit *drhd;
3587 /* To avoid a -Wunused-but-set-variable warning. */
3588 struct intel_iommu *iommu __maybe_unused;
3589 struct device *dev;
3590 int i, ret = 0;
3591
3592 for_each_active_iommu(iommu, drhd) {
3593 for_each_active_dev_scope(drhd->devices,
3594 drhd->devices_cnt, i, dev) {
3595 struct acpi_device_physical_node *pn;
3596 struct acpi_device *adev;
3597
3598 if (dev->bus != &acpi_bus_type)
3599 continue;
3600
3601 adev = to_acpi_device(dev);
3602 mutex_lock(&adev->physical_node_lock);
3603 list_for_each_entry(pn,
3604 &adev->physical_node_list, node) {
3605 ret = iommu_probe_device(dev: pn->dev);
3606 if (ret)
3607 break;
3608 }
3609 mutex_unlock(lock: &adev->physical_node_lock);
3610
3611 if (ret)
3612 return ret;
3613 }
3614 }
3615
3616 return 0;
3617}
3618
3619static __init int tboot_force_iommu(void)
3620{
3621 if (!tboot_enabled())
3622 return 0;
3623
3624 if (no_iommu || dmar_disabled)
3625 pr_warn("Forcing Intel-IOMMU to enabled\n");
3626
3627 dmar_disabled = 0;
3628 no_iommu = 0;
3629
3630 return 1;
3631}
3632
3633int __init intel_iommu_init(void)
3634{
3635 int ret = -ENODEV;
3636 struct dmar_drhd_unit *drhd;
3637 struct intel_iommu *iommu;
3638
3639 /*
3640 * Intel IOMMU is required for a TXT/tboot launch or platform
3641 * opt in, so enforce that.
3642 */
3643 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3644 platform_optin_force_iommu();
3645
3646 down_write(sem: &dmar_global_lock);
3647 if (dmar_table_init()) {
3648 if (force_on)
3649 panic(fmt: "tboot: Failed to initialize DMAR table\n");
3650 goto out_free_dmar;
3651 }
3652
3653 if (dmar_dev_scope_init() < 0) {
3654 if (force_on)
3655 panic(fmt: "tboot: Failed to initialize DMAR device scope\n");
3656 goto out_free_dmar;
3657 }
3658
3659 up_write(sem: &dmar_global_lock);
3660
3661 /*
3662 * The bus notifier takes the dmar_global_lock, so lockdep will
3663 * complain later when we register it under the lock.
3664 */
3665 dmar_register_bus_notifier();
3666
3667 down_write(sem: &dmar_global_lock);
3668
3669 if (!no_iommu)
3670 intel_iommu_debugfs_init();
3671
3672 if (no_iommu || dmar_disabled) {
3673 /*
3674 * We exit the function here to ensure IOMMU's remapping and
3675 * mempool aren't setup, which means that the IOMMU's PMRs
3676 * won't be disabled via the call to init_dmars(). So disable
3677 * it explicitly here. The PMRs were setup by tboot prior to
3678 * calling SENTER, but the kernel is expected to reset/tear
3679 * down the PMRs.
3680 */
3681 if (intel_iommu_tboot_noforce) {
3682 for_each_iommu(iommu, drhd)
3683 iommu_disable_protect_mem_regions(iommu);
3684 }
3685
3686 /*
3687 * Make sure the IOMMUs are switched off, even when we
3688 * boot into a kexec kernel and the previous kernel left
3689 * them enabled
3690 */
3691 intel_disable_iommus();
3692 goto out_free_dmar;
3693 }
3694
3695 if (list_empty(head: &dmar_rmrr_units))
3696 pr_info("No RMRR found\n");
3697
3698 if (list_empty(head: &dmar_atsr_units))
3699 pr_info("No ATSR found\n");
3700
3701 if (list_empty(head: &dmar_satc_units))
3702 pr_info("No SATC found\n");
3703
3704 init_no_remapping_devices();
3705
3706 ret = init_dmars();
3707 if (ret) {
3708 if (force_on)
3709 panic(fmt: "tboot: Failed to initialize DMARs\n");
3710 pr_err("Initialization failed\n");
3711 goto out_free_dmar;
3712 }
3713 up_write(sem: &dmar_global_lock);
3714
3715 init_iommu_pm_ops();
3716
3717 down_read(sem: &dmar_global_lock);
3718 for_each_active_iommu(iommu, drhd) {
3719 /*
3720 * The flush queue implementation does not perform
3721 * page-selective invalidations that are required for efficient
3722 * TLB flushes in virtual environments. The benefit of batching
3723 * is likely to be much lower than the overhead of synchronizing
3724 * the virtual and physical IOMMU page-tables.
3725 */
3726 if (cap_caching_mode(iommu->cap) &&
3727 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3728 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3729 iommu_set_dma_strict();
3730 }
3731 iommu_device_sysfs_add(iommu: &iommu->iommu, NULL,
3732 groups: intel_iommu_groups,
3733 fmt: "%s", iommu->name);
3734 iommu_device_register(iommu: &iommu->iommu, ops: &intel_iommu_ops, NULL);
3735
3736 iommu_pmu_register(iommu);
3737 }
3738 up_read(sem: &dmar_global_lock);
3739
3740 if (si_domain && !hw_pass_through)
3741 register_memory_notifier(nb: &intel_iommu_memory_nb);
3742
3743 down_read(sem: &dmar_global_lock);
3744 if (probe_acpi_namespace_devices())
3745 pr_warn("ACPI name space devices didn't probe correctly\n");
3746
3747 /* Finally, we enable the DMA remapping hardware. */
3748 for_each_iommu(iommu, drhd) {
3749 if (!drhd->ignored && !translation_pre_enabled(iommu))
3750 iommu_enable_translation(iommu);
3751
3752 iommu_disable_protect_mem_regions(iommu);
3753 }
3754 up_read(sem: &dmar_global_lock);
3755
3756 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3757
3758 intel_iommu_enabled = 1;
3759
3760 return 0;
3761
3762out_free_dmar:
3763 intel_iommu_free_dmars();
3764 up_write(sem: &dmar_global_lock);
3765 return ret;
3766}
3767
3768static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3769{
3770 struct device_domain_info *info = opaque;
3771
3772 domain_context_clear_one(info, PCI_BUS_NUM(alias), devfn: alias & 0xff);
3773 return 0;
3774}
3775
3776/*
3777 * NB - intel-iommu lacks any sort of reference counting for the users of
3778 * dependent devices. If multiple endpoints have intersecting dependent
3779 * devices, unbinding the driver from any one of them will possibly leave
3780 * the others unable to operate.
3781 */
3782static void domain_context_clear(struct device_domain_info *info)
3783{
3784	if (!dev_is_pci(info->dev)) {
3785		domain_context_clear_one(info, bus: info->bus, devfn: info->devfn);
		return;
	}
3786
3787 pci_for_each_dma_alias(to_pci_dev(info->dev),
3788 fn: &domain_context_clear_one_cb, data: info);
3789}
3790
3791/*
3792 * Clear the page table pointer in context or pasid table entries so that
3793 * all DMA requests without PASID from the device are blocked. If the page
3794 * table has been set, clean up the data structures.
3795 */
3796void device_block_translation(struct device *dev)
3797{
3798 struct device_domain_info *info = dev_iommu_priv_get(dev);
3799 struct intel_iommu *iommu = info->iommu;
3800 unsigned long flags;
3801
3802 iommu_disable_pci_caps(info);
3803 if (!dev_is_real_dma_subdevice(dev)) {
3804 if (sm_supported(iommu))
3805 intel_pasid_tear_down_entry(iommu, dev,
3806 IOMMU_NO_PASID, fault_ignore: false);
3807 else
3808 domain_context_clear(info);
3809 }
3810
3811 if (!info->domain)
3812 return;
3813
3814 spin_lock_irqsave(&info->domain->lock, flags);
3815 list_del(entry: &info->link);
3816 spin_unlock_irqrestore(lock: &info->domain->lock, flags);
3817
3818 domain_detach_iommu(domain: info->domain, iommu);
3819 info->domain = NULL;
3820}
3821
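/*
 * Initialize the address widths and the top-level page table of a newly
 * allocated dmar_domain.
 */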
3822static int md_domain_init(struct dmar_domain *domain, int guest_width)
3823{
3824 int adjust_width;
3825
3826 /* calculate AGAW */
3827 domain->gaw = guest_width;
3828 adjust_width = guestwidth_to_adjustwidth(gaw: guest_width);
3829 domain->agaw = width_to_agaw(width: adjust_width);
3830
3831 domain->iommu_coherency = false;
3832 domain->iommu_superpage = 0;
3833 domain->max_addr = 0;
3834
3835 /* always allocate the top pgd */
3836 domain->pgd = alloc_pgtable_page(node: domain->nid, GFP_ATOMIC);
3837 if (!domain->pgd)
3838 return -ENOMEM;
3839 domain_flush_cache(domain, addr: domain->pgd, PAGE_SIZE);
3840 return 0;
3841}
3842
3843static int blocking_domain_attach_dev(struct iommu_domain *domain,
3844 struct device *dev)
3845{
3846 device_block_translation(dev);
3847 return 0;
3848}
3849
3850static struct iommu_domain blocking_domain = {
3851 .type = IOMMU_DOMAIN_BLOCKED,
3852 .ops = &(const struct iommu_domain_ops) {
3853 .attach_dev = blocking_domain_attach_dev,
3854 }
3855};
3856
3857static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3858{
3859 struct dmar_domain *dmar_domain;
3860 struct iommu_domain *domain;
3861
3862 switch (type) {
3863 case IOMMU_DOMAIN_DMA:
3864 case IOMMU_DOMAIN_UNMANAGED:
3865 dmar_domain = alloc_domain(type);
3866 if (!dmar_domain) {
3867 pr_err("Can't allocate dmar_domain\n");
3868 return NULL;
3869 }
3870 if (md_domain_init(domain: dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3871 pr_err("Domain initialization failed\n");
3872 domain_exit(domain: dmar_domain);
3873 return NULL;
3874 }
3875
3876 domain = &dmar_domain->domain;
3877 domain->geometry.aperture_start = 0;
3878 domain->geometry.aperture_end =
3879 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3880 domain->geometry.force_aperture = true;
3881
3882 return domain;
3883 case IOMMU_DOMAIN_IDENTITY:
3884 return &si_domain->domain;
3885 case IOMMU_DOMAIN_SVA:
3886 return intel_svm_domain_alloc();
3887 default:
3888 return NULL;
3889 }
3890
3891 return NULL;
3892}
3893
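/*
 * Allocation path used by iommufd. A request with a parent must be for a
 * nested (stage-1) domain and is handed to intel_nested_domain_alloc();
 * otherwise the flags are checked against the hardware capabilities
 * (nesting parent, dirty tracking) before an ordinary paging domain is
 * allocated and, if requested, wired up with the dirty tracking ops.
 */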
3894static struct iommu_domain *
3895intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3896 struct iommu_domain *parent,
3897 const struct iommu_user_data *user_data)
3898{
3899 struct device_domain_info *info = dev_iommu_priv_get(dev);
3900 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3901 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3902 struct intel_iommu *iommu = info->iommu;
3903 struct dmar_domain *dmar_domain;
3904 struct iommu_domain *domain;
3905
3906 /* Must be NESTING domain */
3907 if (parent) {
3908 if (!nested_supported(iommu) || flags)
3909 return ERR_PTR(error: -EOPNOTSUPP);
3910 return intel_nested_domain_alloc(parent, user_data);
3911 }
3912
3913 if (flags &
3914 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3915 return ERR_PTR(error: -EOPNOTSUPP);
3916 if (nested_parent && !nested_supported(iommu))
3917 return ERR_PTR(error: -EOPNOTSUPP);
3918 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3919 return ERR_PTR(error: -EOPNOTSUPP);
3920
	/*
	 * The domain_alloc_user op must return a fully initialized domain,
	 * so use iommu_domain_alloc() here for simplicity.
	 */
3925 domain = iommu_domain_alloc(bus: dev->bus);
3926 if (!domain)
3927 return ERR_PTR(error: -ENOMEM);
3928
3929 dmar_domain = to_dmar_domain(dom: domain);
3930
3931 if (nested_parent) {
3932 dmar_domain->nested_parent = true;
3933 INIT_LIST_HEAD(list: &dmar_domain->s1_domains);
3934 spin_lock_init(&dmar_domain->s1_lock);
3935 }
3936
3937 if (dirty_tracking) {
3938 if (dmar_domain->use_first_level) {
3939 iommu_domain_free(domain);
3940 return ERR_PTR(error: -EOPNOTSUPP);
3941 }
3942 domain->dirty_ops = &intel_dirty_ops;
3943 }
3944
3945 return domain;
3946}
3947
3948static void intel_iommu_domain_free(struct iommu_domain *domain)
3949{
3950 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
3951
3952 WARN_ON(dmar_domain->nested_parent &&
3953 !list_empty(&dmar_domain->s1_domains));
3954 if (domain != &si_domain->domain)
3955 domain_exit(domain: dmar_domain);
3956}
3957
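/*
 * Validate that @dev's IOMMU can host @domain: it must support forced
 * snooping and dirty tracking if the domain requires them, and its address
 * width must cover the domain's highest mapped address. Extra page-table
 * levels are trimmed so the domain AGAW never exceeds what the IOMMU can
 * walk.
 */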
3958int prepare_domain_attach_device(struct iommu_domain *domain,
3959 struct device *dev)
3960{
3961 struct device_domain_info *info = dev_iommu_priv_get(dev);
3962 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
3963 struct intel_iommu *iommu = info->iommu;
3964 int addr_width;
3965
3966 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3967 return -EINVAL;
3968
3969 if (domain->dirty_ops && !ssads_supported(iommu))
3970 return -EINVAL;
3971
3972 /* check if this iommu agaw is sufficient for max mapped address */
3973 addr_width = agaw_to_width(agaw: iommu->agaw);
3974 if (addr_width > cap_mgaw(iommu->cap))
3975 addr_width = cap_mgaw(iommu->cap);
3976
3977 if (dmar_domain->max_addr > (1LL << addr_width))
3978 return -EINVAL;
3979 dmar_domain->gaw = addr_width;
3980
3981 /*
3982 * Knock out extra levels of page tables if necessary
3983 */
3984 while (iommu->agaw < dmar_domain->agaw) {
3985 struct dma_pte *pte;
3986
3987 pte = dmar_domain->pgd;
3988 if (dma_pte_present(pte)) {
3989 dmar_domain->pgd = phys_to_virt(address: dma_pte_addr(pte));
3990 free_pgtable_page(vaddr: pte);
3991 }
3992 dmar_domain->agaw--;
3993 }
3994
3995 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3996 context_copied(iommu, bus: info->bus, devfn: info->devfn))
3997 return intel_pasid_setup_sm_context(dev);
3998
3999 return 0;
4000}
4001
4002static int intel_iommu_attach_device(struct iommu_domain *domain,
4003 struct device *dev)
4004{
4005 struct device_domain_info *info = dev_iommu_priv_get(dev);
4006 int ret;
4007
4008 if (info->domain)
4009 device_block_translation(dev);
4010
4011 ret = prepare_domain_attach_device(domain, dev);
4012 if (ret)
4013 return ret;
4014
4015 return dmar_domain_attach_device(domain: to_dmar_domain(dom: domain), dev);
4016}
4017
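/*
 * Map a physically contiguous range at @iova: translate IOMMU_READ/WRITE
 * into DMA PTE bits (plus the snoop bit when the domain enforces snooping),
 * reject addresses beyond the domain's address width, and let
 * __domain_mapping() install the PTEs.
 */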
4018static int intel_iommu_map(struct iommu_domain *domain,
4019 unsigned long iova, phys_addr_t hpa,
4020 size_t size, int iommu_prot, gfp_t gfp)
4021{
4022 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4023 u64 max_addr;
4024 int prot = 0;
4025
4026 if (iommu_prot & IOMMU_READ)
4027 prot |= DMA_PTE_READ;
4028 if (iommu_prot & IOMMU_WRITE)
4029 prot |= DMA_PTE_WRITE;
4030 if (dmar_domain->set_pte_snp)
4031 prot |= DMA_PTE_SNP;
4032
4033 max_addr = iova + size;
4034 if (dmar_domain->max_addr < max_addr) {
4035 u64 end;
4036
4037 /* check if minimum agaw is sufficient for mapped address */
4038 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4039 if (end < max_addr) {
4040 pr_err("%s: iommu width (%d) is not "
4041 "sufficient for the mapped address (%llx)\n",
4042 __func__, dmar_domain->gaw, max_addr);
4043 return -EFAULT;
4044 }
4045 dmar_domain->max_addr = max_addr;
4046 }
4047 /* Round up size to next multiple of PAGE_SIZE, if it and
4048 the low bits of hpa would take us onto the next page */
4049 size = aligned_nrpages(host_addr: hpa, size);
4050 return __domain_mapping(domain: dmar_domain, iov_pfn: iova >> VTD_PAGE_SHIFT,
4051 phys_pfn: hpa >> VTD_PAGE_SHIFT, nr_pages: size, prot, gfp);
4052}
4053
4054static int intel_iommu_map_pages(struct iommu_domain *domain,
4055 unsigned long iova, phys_addr_t paddr,
4056 size_t pgsize, size_t pgcount,
4057 int prot, gfp_t gfp, size_t *mapped)
4058{
4059 unsigned long pgshift = __ffs(pgsize);
4060 size_t size = pgcount << pgshift;
4061 int ret;
4062
4063 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4064 return -EINVAL;
4065
4066 if (!IS_ALIGNED(iova | paddr, pgsize))
4067 return -EINVAL;
4068
4069 ret = intel_iommu_map(domain, iova, hpa: paddr, size, iommu_prot: prot, gfp);
4070 if (!ret && mapped)
4071 *mapped = size;
4072
4073 return ret;
4074}
4075
4076static size_t intel_iommu_unmap(struct iommu_domain *domain,
4077 unsigned long iova, size_t size,
4078 struct iommu_iotlb_gather *gather)
4079{
4080 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4081 unsigned long start_pfn, last_pfn;
4082 int level = 0;
4083
4084 /* Cope with horrid API which requires us to unmap more than the
4085 size argument if it happens to be a large-page mapping. */
4086 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4087 &level, GFP_ATOMIC)))
4088 return 0;
4089
4090 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4091 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4092
4093 start_pfn = iova >> VTD_PAGE_SHIFT;
4094 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4095
4096 domain_unmap(domain: dmar_domain, start_pfn, last_pfn, freelist: &gather->freelist);
4097
4098 if (dmar_domain->max_addr == iova + size)
4099 dmar_domain->max_addr = iova;
4100
4101 /*
4102 * We do not use page-selective IOTLB invalidation in flush queue,
4103 * so there is no need to track page and sync iotlb.
4104 */
4105 if (!iommu_iotlb_gather_queued(gather))
4106 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4107
4108 return size;
4109}
4110
4111static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4112 unsigned long iova,
4113 size_t pgsize, size_t pgcount,
4114 struct iommu_iotlb_gather *gather)
4115{
4116 unsigned long pgshift = __ffs(pgsize);
4117 size_t size = pgcount << pgshift;
4118
4119 return intel_iommu_unmap(domain, iova, size, gather);
4120}
4121
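/*
 * Flush the IOTLB for the range collected in the gather structure on every
 * IOMMU this domain is attached to, propagate the invalidation to any
 * stage-1 domains nested on top of it, and finally free the page-table
 * pages queued on the gather freelist.
 */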
4122static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4123 struct iommu_iotlb_gather *gather)
4124{
4125 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4126 unsigned long iova_pfn = IOVA_PFN(gather->start);
4127 size_t size = gather->end - gather->start;
4128 struct iommu_domain_info *info;
4129 unsigned long start_pfn;
4130 unsigned long nrpages;
4131 unsigned long i;
4132
4133 nrpages = aligned_nrpages(host_addr: gather->start, size);
4134 start_pfn = mm_to_dma_pfn_start(mm_pfn: iova_pfn);
4135
4136 xa_for_each(&dmar_domain->iommu_array, i, info)
4137 iommu_flush_iotlb_psi(iommu: info->iommu, domain: dmar_domain,
4138 pfn: start_pfn, pages: nrpages,
4139 ih: list_empty(head: &gather->freelist), map: 0);
4140
4141 if (dmar_domain->nested_parent)
4142 parent_domain_flush(domain: dmar_domain, pfn: start_pfn, pages: nrpages,
4143 ih: list_empty(head: &gather->freelist));
4144 put_pages_list(pages: &gather->freelist);
4145}
4146
4147static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4148 dma_addr_t iova)
4149{
4150 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4151 struct dma_pte *pte;
4152 int level = 0;
4153 u64 phys = 0;
4154
4155 pte = pfn_to_dma_pte(domain: dmar_domain, pfn: iova >> VTD_PAGE_SHIFT, target_level: &level,
4156 GFP_ATOMIC);
4157 if (pte && dma_pte_present(pte))
4158 phys = dma_pte_addr(pte) +
4159 (iova & (BIT_MASK(level_to_offset_bits(level) +
4160 VTD_PAGE_SHIFT) - 1));
4161
4162 return phys;
4163}
4164
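/*
 * Enforcing snooping is only possible if every IOMMU serving a device in
 * this domain supports snoop control (the ecap SC bit).
 */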
4165static bool domain_support_force_snooping(struct dmar_domain *domain)
4166{
4167 struct device_domain_info *info;
4168 bool support = true;
4169
4170 assert_spin_locked(&domain->lock);
4171 list_for_each_entry(info, &domain->devices, link) {
4172 if (!ecap_sc_support(info->iommu->ecap)) {
4173 support = false;
4174 break;
4175 }
4176 }
4177
4178 return support;
4179}
4180
4181static void domain_set_force_snooping(struct dmar_domain *domain)
4182{
4183 struct device_domain_info *info;
4184
4185 assert_spin_locked(&domain->lock);
4186 /*
4187 * Second level page table supports per-PTE snoop control. The
4188 * iommu_map() interface will handle this by setting SNP bit.
4189 */
4190 if (!domain->use_first_level) {
4191 domain->set_pte_snp = true;
4192 return;
4193 }
4194
4195 list_for_each_entry(info, &domain->devices, link)
4196 intel_pasid_setup_page_snoop_control(iommu: info->iommu, dev: info->dev,
4197 IOMMU_NO_PASID);
4198}
4199
4200static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4201{
4202 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4203 unsigned long flags;
4204
4205 if (dmar_domain->force_snooping)
4206 return true;
4207
4208 spin_lock_irqsave(&dmar_domain->lock, flags);
4209 if (!domain_support_force_snooping(domain: dmar_domain) ||
4210 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4211 spin_unlock_irqrestore(lock: &dmar_domain->lock, flags);
4212 return false;
4213 }
4214
4215 domain_set_force_snooping(domain: dmar_domain);
4216 dmar_domain->force_snooping = true;
4217 spin_unlock_irqrestore(lock: &dmar_domain->lock, flags);
4218
4219 return true;
4220}
4221
4222static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4223{
4224 struct device_domain_info *info = dev_iommu_priv_get(dev);
4225
4226 switch (cap) {
4227 case IOMMU_CAP_CACHE_COHERENCY:
4228 case IOMMU_CAP_DEFERRED_FLUSH:
4229 return true;
4230 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4231 return dmar_platform_optin();
4232 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4233 return ecap_sc_support(info->iommu->ecap);
4234 case IOMMU_CAP_DIRTY_TRACKING:
4235 return ssads_supported(info->iommu);
4236 default:
4237 return false;
4238 }
4239}
4240
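/*
 * Per-device probe: look up the IOMMU serving @dev, record its bus, devfn
 * and segment, probe the optional PCI capabilities (ATS, PASID, PRI)
 * against what the IOMMU supports, insert ATS-capable devices into the
 * per-IOMMU RID rbtree, and allocate the PASID table when scalable mode is
 * in use.
 */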
4241static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4242{
4243 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4244 struct device_domain_info *info;
4245 struct intel_iommu *iommu;
4246 u8 bus, devfn;
4247 int ret;
4248
4249 iommu = device_lookup_iommu(dev, bus: &bus, devfn: &devfn);
4250 if (!iommu || !iommu->iommu.ops)
4251 return ERR_PTR(error: -ENODEV);
4252
4253 info = kzalloc(size: sizeof(*info), GFP_KERNEL);
4254 if (!info)
4255 return ERR_PTR(error: -ENOMEM);
4256
4257 if (dev_is_real_dma_subdevice(dev)) {
4258 info->bus = pdev->bus->number;
4259 info->devfn = pdev->devfn;
4260 info->segment = pci_domain_nr(bus: pdev->bus);
4261 } else {
4262 info->bus = bus;
4263 info->devfn = devfn;
4264 info->segment = iommu->segment;
4265 }
4266
4267 info->dev = dev;
4268 info->iommu = iommu;
4269 if (dev_is_pci(dev)) {
4270 if (ecap_dev_iotlb_support(iommu->ecap) &&
4271 pci_ats_supported(dev: pdev) &&
4272 dmar_ats_supported(dev: pdev, iommu)) {
4273 info->ats_supported = 1;
4274 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4275
4276 /*
4277 * For IOMMU that supports device IOTLB throttling
4278 * (DIT), we assign PFSID to the invalidation desc
4279 * of a VF such that IOMMU HW can gauge queue depth
4280 * at PF level. If DIT is not set, PFSID will be
4281 * treated as reserved, which should be set to 0.
4282 */
4283 if (ecap_dit(iommu->ecap))
4284 info->pfsid = pci_dev_id(dev: pci_physfn(dev: pdev));
4285 info->ats_qdep = pci_ats_queue_depth(dev: pdev);
4286 }
4287 if (sm_supported(iommu)) {
4288 if (pasid_supported(iommu)) {
4289 int features = pci_pasid_features(pdev);
4290
4291 if (features >= 0)
4292 info->pasid_supported = features | 1;
4293 }
4294
4295 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4296 pci_pri_supported(pdev))
4297 info->pri_supported = 1;
4298 }
4299 }
4300
4301 dev_iommu_priv_set(dev, priv: info);
4302 if (pdev && pci_ats_supported(dev: pdev)) {
4303 ret = device_rbtree_insert(iommu, info);
4304 if (ret)
4305 goto free;
4306 }
4307
4308 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4309 ret = intel_pasid_alloc_table(dev);
4310 if (ret) {
4311 dev_err(dev, "PASID table allocation failed\n");
4312 goto clear_rbtree;
4313 }
4314
4315 if (!context_copied(iommu, bus: info->bus, devfn: info->devfn)) {
4316 ret = intel_pasid_setup_sm_context(dev);
4317 if (ret)
4318 goto free_table;
4319 }
4320 }
4321
4322 intel_iommu_debugfs_create_dev(info);
4323
4324 return &iommu->iommu;
4325free_table:
4326 intel_pasid_free_table(dev);
4327clear_rbtree:
4328 device_rbtree_remove(info);
4329free:
4330 kfree(objp: info);
4331
4332 return ERR_PTR(error: ret);
4333}
4334
4335static void intel_iommu_release_device(struct device *dev)
4336{
4337 struct device_domain_info *info = dev_iommu_priv_get(dev);
4338 struct intel_iommu *iommu = info->iommu;
4339
4340 mutex_lock(&iommu->iopf_lock);
4341 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
4342 device_rbtree_remove(info);
4343 mutex_unlock(lock: &iommu->iopf_lock);
4344
4345 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
4346 !context_copied(iommu, bus: info->bus, devfn: info->devfn))
4347 intel_pasid_teardown_sm_context(dev);
4348
4349 intel_pasid_free_table(dev);
4350 intel_iommu_debugfs_remove_dev(info);
4351 kfree(objp: info);
4352 set_dma_ops(dev, NULL);
4353}
4354
4355static void intel_iommu_probe_finalize(struct device *dev)
4356{
4357 set_dma_ops(dev, NULL);
4358 iommu_setup_dma_ops(dev, dma_base: 0, U64_MAX);
4359}
4360
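/*
 * Report reserved regions: every RMRR whose device scope covers this
 * device (directly or via an upstream bridge) becomes a direct-mapped
 * region, ISA bridges optionally get a relaxable 16MB direct map for the
 * legacy floppy workaround, and the IOAPIC MMIO window is reported as an
 * MSI region.
 */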
4361static void intel_iommu_get_resv_regions(struct device *device,
4362 struct list_head *head)
4363{
4364 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4365 struct iommu_resv_region *reg;
4366 struct dmar_rmrr_unit *rmrr;
4367 struct device *i_dev;
4368 int i;
4369
4370 rcu_read_lock();
4371 for_each_rmrr_units(rmrr) {
4372 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4373 i, i_dev) {
4374 struct iommu_resv_region *resv;
4375 enum iommu_resv_type type;
4376 size_t length;
4377
4378 if (i_dev != device &&
4379 !is_downstream_to_pci_bridge(dev: device, bridge: i_dev))
4380 continue;
4381
4382 length = rmrr->end_address - rmrr->base_address + 1;
4383
4384 type = device_rmrr_is_relaxable(dev: device) ?
4385 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4386
4387 resv = iommu_alloc_resv_region(start: rmrr->base_address,
4388 length, prot, type,
4389 GFP_ATOMIC);
4390 if (!resv)
4391 break;
4392
4393 list_add_tail(new: &resv->list, head);
4394 }
4395 }
4396 rcu_read_unlock();
4397
4398#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4399 if (dev_is_pci(device)) {
4400 struct pci_dev *pdev = to_pci_dev(device);
4401
4402 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4403 reg = iommu_alloc_resv_region(start: 0, length: 1UL << 24, prot,
4404 type: IOMMU_RESV_DIRECT_RELAXABLE,
4405 GFP_KERNEL);
4406 if (reg)
4407 list_add_tail(new: &reg->list, head);
4408 }
4409 }
4410#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4411
4412 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4413 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4414 prot: 0, type: IOMMU_RESV_MSI, GFP_KERNEL);
4415 if (!reg)
4416 return;
4417 list_add_tail(new: &reg->list, head);
4418}
4419
4420static struct iommu_group *intel_iommu_device_group(struct device *dev)
4421{
4422 if (dev_is_pci(dev))
4423 return pci_device_group(dev);
4424 return generic_device_group(dev);
4425}
4426
4427static int intel_iommu_enable_sva(struct device *dev)
4428{
4429 struct device_domain_info *info = dev_iommu_priv_get(dev);
4430 struct intel_iommu *iommu;
4431
4432 if (!info || dmar_disabled)
4433 return -EINVAL;
4434
4435 iommu = info->iommu;
4436 if (!iommu)
4437 return -EINVAL;
4438
4439 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4440 return -ENODEV;
4441
4442 if (!info->pasid_enabled || !info->ats_enabled)
4443 return -EINVAL;
4444
	/*
	 * Devices that implement device-specific I/O fault handling should
	 * not advertise PCI/PRI. The IOMMU has no way to probe for
	 * device-specific IOPF support, so it can only assume that a driver
	 * enabling SVA on a non-PRI device handles I/O page faults in its
	 * own way.
	 */
4452 if (!info->pri_supported)
4453 return 0;
4454
4455 /* Devices supporting PRI should have it enabled. */
4456 if (!info->pri_enabled)
4457 return -EINVAL;
4458
4459 return 0;
4460}
4461
4462static int intel_iommu_enable_iopf(struct device *dev)
4463{
4464 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4465 struct device_domain_info *info = dev_iommu_priv_get(dev);
4466 struct intel_iommu *iommu;
4467 int ret;
4468
4469 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4470 return -ENODEV;
4471
4472 if (info->pri_enabled)
4473 return -EBUSY;
4474
4475 iommu = info->iommu;
4476 if (!iommu)
4477 return -EINVAL;
4478
4479 /* PASID is required in PRG Response Message. */
4480 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4481 return -EINVAL;
4482
4483 ret = pci_reset_pri(pdev);
4484 if (ret)
4485 return ret;
4486
4487 ret = iopf_queue_add_device(queue: iommu->iopf_queue, dev);
4488 if (ret)
4489 return ret;
4490
4491 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4492 if (ret) {
4493 iopf_queue_remove_device(queue: iommu->iopf_queue, dev);
4494 return ret;
4495 }
4496
4497 info->pri_enabled = 1;
4498
4499 return 0;
4500}
4501
4502static int intel_iommu_disable_iopf(struct device *dev)
4503{
4504 struct device_domain_info *info = dev_iommu_priv_get(dev);
4505 struct intel_iommu *iommu = info->iommu;
4506
4507 if (!info->pri_enabled)
4508 return -EINVAL;
4509
	/*
	 * The PCIe spec states that clearing the PRI enable bit stops the
	 * Page Request Interface from issuing new page requests, but
	 * requests already transmitted or queued for transmission remain
	 * outstanding. This is therefore expected to be called only after
	 * the device driver has stopped DMA, all PASIDs have been unbound
	 * and the outstanding PRQs have been drained.
	 */
4518 pci_disable_pri(to_pci_dev(dev));
4519 info->pri_enabled = 0;
4520 iopf_queue_remove_device(queue: iommu->iopf_queue, dev);
4521
4522 return 0;
4523}
4524
4525static int
4526intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4527{
4528 switch (feat) {
4529 case IOMMU_DEV_FEAT_IOPF:
4530 return intel_iommu_enable_iopf(dev);
4531
4532 case IOMMU_DEV_FEAT_SVA:
4533 return intel_iommu_enable_sva(dev);
4534
4535 default:
4536 return -ENODEV;
4537 }
4538}
4539
4540static int
4541intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4542{
4543 switch (feat) {
4544 case IOMMU_DEV_FEAT_IOPF:
4545 return intel_iommu_disable_iopf(dev);
4546
4547 case IOMMU_DEV_FEAT_SVA:
4548 return 0;
4549
4550 default:
4551 return -ENODEV;
4552 }
4553}
4554
4555static bool intel_iommu_is_attach_deferred(struct device *dev)
4556{
4557 struct device_domain_info *info = dev_iommu_priv_get(dev);
4558
4559 return translation_pre_enabled(iommu: info->iommu) && !info->domain;
4560}
4561
4562/*
4563 * Check that the device does not live on an external facing PCI port that is
4564 * marked as untrusted. Such devices should not be able to apply quirks and
4565 * thus not be able to bypass the IOMMU restrictions.
4566 */
4567static bool risky_device(struct pci_dev *pdev)
4568{
4569 if (pdev->untrusted) {
4570 pci_info(pdev,
4571 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4572 pdev->vendor, pdev->device);
4573 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4574 return true;
4575 }
4576 return false;
4577}
4578
4579static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4580 unsigned long iova, size_t size)
4581{
4582 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4583 unsigned long pages = aligned_nrpages(host_addr: iova, size);
4584 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4585 struct iommu_domain_info *info;
4586 unsigned long i;
4587
4588 xa_for_each(&dmar_domain->iommu_array, i, info)
4589 __mapping_notify_one(iommu: info->iommu, domain: dmar_domain, pfn, pages);
4590 return 0;
4591}
4592
4593static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4594{
4595 struct device_domain_info *info = dev_iommu_priv_get(dev);
4596 struct dev_pasid_info *curr, *dev_pasid = NULL;
4597 struct intel_iommu *iommu = info->iommu;
4598 struct dmar_domain *dmar_domain;
4599 struct iommu_domain *domain;
4600 unsigned long flags;
4601
4602 domain = iommu_get_domain_for_dev_pasid(dev, pasid, type: 0);
4603 if (WARN_ON_ONCE(!domain))
4604 goto out_tear_down;
4605
	/*
	 * The SVA implementation needs to handle its own bookkeeping, such
	 * as mm notifications. Until that code is consolidated into the
	 * iommu core, let the Intel SVA code handle it.
	 */
4611 if (domain->type == IOMMU_DOMAIN_SVA) {
4612 intel_svm_remove_dev_pasid(dev, pasid);
4613 goto out_tear_down;
4614 }
4615
4616 dmar_domain = to_dmar_domain(dom: domain);
4617 spin_lock_irqsave(&dmar_domain->lock, flags);
4618 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4619 if (curr->dev == dev && curr->pasid == pasid) {
4620 list_del(entry: &curr->link_domain);
4621 dev_pasid = curr;
4622 break;
4623 }
4624 }
4625 WARN_ON_ONCE(!dev_pasid);
4626 spin_unlock_irqrestore(lock: &dmar_domain->lock, flags);
4627
4628 domain_detach_iommu(domain: dmar_domain, iommu);
4629 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4630 kfree(objp: dev_pasid);
4631out_tear_down:
4632 intel_pasid_tear_down_entry(iommu, dev, pasid, fault_ignore: false);
4633 intel_drain_pasid_prq(dev, pasid);
4634}
4635
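/*
 * Attach @domain to a specific PASID of @dev: depending on the domain type
 * the PASID entry is programmed for pass-through, first-level or
 * second-level translation, and the (device, PASID) pair is added to the
 * domain's dev_pasids list.
 */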
4636static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4637 struct device *dev, ioasid_t pasid)
4638{
4639 struct device_domain_info *info = dev_iommu_priv_get(dev);
4640 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4641 struct intel_iommu *iommu = info->iommu;
4642 struct dev_pasid_info *dev_pasid;
4643 unsigned long flags;
4644 int ret;
4645
4646 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4647 return -EOPNOTSUPP;
4648
4649 if (domain->dirty_ops)
4650 return -EINVAL;
4651
4652 if (context_copied(iommu, bus: info->bus, devfn: info->devfn))
4653 return -EBUSY;
4654
4655 ret = prepare_domain_attach_device(domain, dev);
4656 if (ret)
4657 return ret;
4658
4659 dev_pasid = kzalloc(size: sizeof(*dev_pasid), GFP_KERNEL);
4660 if (!dev_pasid)
4661 return -ENOMEM;
4662
4663 ret = domain_attach_iommu(domain: dmar_domain, iommu);
4664 if (ret)
4665 goto out_free;
4666
4667 if (domain_type_is_si(domain: dmar_domain))
4668 ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4669 else if (dmar_domain->use_first_level)
4670 ret = domain_setup_first_level(iommu, domain: dmar_domain,
4671 dev, pasid);
4672 else
4673 ret = intel_pasid_setup_second_level(iommu, domain: dmar_domain,
4674 dev, pasid);
4675 if (ret)
4676 goto out_detach_iommu;
4677
4678 dev_pasid->dev = dev;
4679 dev_pasid->pasid = pasid;
4680 spin_lock_irqsave(&dmar_domain->lock, flags);
4681 list_add(new: &dev_pasid->link_domain, head: &dmar_domain->dev_pasids);
4682 spin_unlock_irqrestore(lock: &dmar_domain->lock, flags);
4683
4684 if (domain->type & __IOMMU_DOMAIN_PAGING)
4685 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4686
4687 return 0;
4688out_detach_iommu:
4689 domain_detach_iommu(domain: dmar_domain, iommu);
4690out_free:
4691 kfree(objp: dev_pasid);
4692 return ret;
4693}
4694
4695static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4696{
4697 struct device_domain_info *info = dev_iommu_priv_get(dev);
4698 struct intel_iommu *iommu = info->iommu;
4699 struct iommu_hw_info_vtd *vtd;
4700
4701 vtd = kzalloc(size: sizeof(*vtd), GFP_KERNEL);
4702 if (!vtd)
4703 return ERR_PTR(error: -ENOMEM);
4704
4705 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4706 vtd->cap_reg = iommu->cap;
4707 vtd->ecap_reg = iommu->ecap;
4708 *length = sizeof(*vtd);
4709 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4710 return vtd;
4711}
4712
4713/*
4714 * Set dirty tracking for the device list of a domain. The caller must
4715 * hold the domain->lock when calling it.
4716 */
4717static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4718{
4719 struct device_domain_info *info;
4720 int ret = 0;
4721
4722 list_for_each_entry(info, devices, link) {
4723 ret = intel_pasid_setup_dirty_tracking(iommu: info->iommu, dev: info->dev,
4724 IOMMU_NO_PASID, enabled: enable);
4725 if (ret)
4726 break;
4727 }
4728
4729 return ret;
4730}
4731
4732static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4733 bool enable)
4734{
4735 struct dmar_domain *s1_domain;
4736 unsigned long flags;
4737 int ret;
4738
4739 spin_lock(lock: &domain->s1_lock);
4740 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4741 spin_lock_irqsave(&s1_domain->lock, flags);
4742 ret = device_set_dirty_tracking(devices: &s1_domain->devices, enable);
4743 spin_unlock_irqrestore(lock: &s1_domain->lock, flags);
4744 if (ret)
4745 goto err_unwind;
4746 }
4747 spin_unlock(lock: &domain->s1_lock);
4748 return 0;
4749
4750err_unwind:
4751 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4752 spin_lock_irqsave(&s1_domain->lock, flags);
4753 device_set_dirty_tracking(devices: &s1_domain->devices,
4754 enable: domain->dirty_tracking);
4755 spin_unlock_irqrestore(lock: &s1_domain->lock, flags);
4756 }
4757 spin_unlock(lock: &domain->s1_lock);
4758 return ret;
4759}
4760
4761static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4762 bool enable)
4763{
4764 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4765 int ret;
4766
4767 spin_lock(lock: &dmar_domain->lock);
4768 if (dmar_domain->dirty_tracking == enable)
4769 goto out_unlock;
4770
4771 ret = device_set_dirty_tracking(devices: &dmar_domain->devices, enable);
4772 if (ret)
4773 goto err_unwind;
4774
4775 if (dmar_domain->nested_parent) {
4776 ret = parent_domain_set_dirty_tracking(domain: dmar_domain, enable);
4777 if (ret)
4778 goto err_unwind;
4779 }
4780
4781 dmar_domain->dirty_tracking = enable;
4782out_unlock:
4783 spin_unlock(lock: &dmar_domain->lock);
4784
4785 return 0;
4786
4787err_unwind:
4788 device_set_dirty_tracking(devices: &dmar_domain->devices,
4789 enable: dmar_domain->dirty_tracking);
4790 spin_unlock(lock: &dmar_domain->lock);
4791 return ret;
4792}
4793
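/*
 * Walk the second-level page table over the requested range, test and
 * clear (honoring the caller's flags) the dirty bit of each present leaf
 * PTE, and record dirty ranges in the caller-supplied bitmap.
 */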
4794static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4795 unsigned long iova, size_t size,
4796 unsigned long flags,
4797 struct iommu_dirty_bitmap *dirty)
4798{
4799 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4800 unsigned long end = iova + size - 1;
4801 unsigned long pgsize;
4802
	/*
	 * The IOMMUFD core calls into a domain with dirty tracking disabled,
	 * and without an IOVA bitmap set, in order to clear any dirty bits
	 * that may have been set in the PTEs while dirty tracking was
	 * stopped. This ensures that we never inherit dirtied bits from a
	 * previous cycle.
	 */
4809 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4810 return -EINVAL;
4811
4812 do {
4813 struct dma_pte *pte;
4814 int lvl = 0;
4815
4816 pte = pfn_to_dma_pte(domain: dmar_domain, pfn: iova >> VTD_PAGE_SHIFT, target_level: &lvl,
4817 GFP_ATOMIC);
4818 pgsize = level_size(level: lvl) << VTD_PAGE_SHIFT;
4819 if (!pte || !dma_pte_present(pte)) {
4820 iova += pgsize;
4821 continue;
4822 }
4823
4824 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4825 iommu_dirty_bitmap_record(dirty, iova, length: pgsize);
4826 iova += pgsize;
4827 } while (iova < end);
4828
4829 return 0;
4830}
4831
4832static const struct iommu_dirty_ops intel_dirty_ops = {
4833 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4834 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4835};
4836
4837const struct iommu_ops intel_iommu_ops = {
4838 .blocked_domain = &blocking_domain,
4839 .release_domain = &blocking_domain,
4840 .capable = intel_iommu_capable,
4841 .hw_info = intel_iommu_hw_info,
4842 .domain_alloc = intel_iommu_domain_alloc,
4843 .domain_alloc_user = intel_iommu_domain_alloc_user,
4844 .probe_device = intel_iommu_probe_device,
4845 .probe_finalize = intel_iommu_probe_finalize,
4846 .release_device = intel_iommu_release_device,
4847 .get_resv_regions = intel_iommu_get_resv_regions,
4848 .device_group = intel_iommu_device_group,
4849 .dev_enable_feat = intel_iommu_dev_enable_feat,
4850 .dev_disable_feat = intel_iommu_dev_disable_feat,
4851 .is_attach_deferred = intel_iommu_is_attach_deferred,
4852 .def_domain_type = device_def_domain_type,
4853 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4854 .pgsize_bitmap = SZ_4K,
4855#ifdef CONFIG_INTEL_IOMMU_SVM
4856 .page_response = intel_svm_page_response,
4857#endif
4858 .default_domain_ops = &(const struct iommu_domain_ops) {
4859 .attach_dev = intel_iommu_attach_device,
4860 .set_dev_pasid = intel_iommu_set_dev_pasid,
4861 .map_pages = intel_iommu_map_pages,
4862 .unmap_pages = intel_iommu_unmap_pages,
4863 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4864 .flush_iotlb_all = intel_flush_iotlb_all,
4865 .iotlb_sync = intel_iommu_tlb_sync,
4866 .iova_to_phys = intel_iommu_iova_to_phys,
4867 .free = intel_iommu_domain_free,
4868 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4869 }
4870};
4871
4872static void quirk_iommu_igfx(struct pci_dev *dev)
4873{
4874 if (risky_device(pdev: dev))
4875 return;
4876
4877 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4878 dmar_map_gfx = 0;
4879}
4880
4881/* G4x/GM45 integrated gfx dmar support is totally busted. */
4882DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4883DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4884DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4885DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4886DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4887DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4888DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4889
4890/* Broadwell igfx malfunctions with dmar */
4891DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4892DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4893DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4894DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4895DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4896DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4897DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4898DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4899DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4900DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4901DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4902DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4903DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4904DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4905DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4906DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4907DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4908DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4909DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4910DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4911DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4912DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4913DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4914DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4915
4916static void quirk_iommu_rwbf(struct pci_dev *dev)
4917{
4918 if (risky_device(pdev: dev))
4919 return;
4920
4921 /*
4922 * Mobile 4 Series Chipset neglects to set RWBF capability,
4923 * but needs it. Same seems to hold for the desktop versions.
4924 */
4925 pci_info(dev, "Forcing write-buffer flush capability\n");
4926 rwbf_quirk = 1;
4927}
4928
4929DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4930DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4931DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4932DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4933DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4934DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4935DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4936
4937#define GGC 0x52
4938#define GGC_MEMORY_SIZE_MASK (0xf << 8)
4939#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4940#define GGC_MEMORY_SIZE_1M (0x1 << 8)
4941#define GGC_MEMORY_SIZE_2M (0x3 << 8)
4942#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4943#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4944#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4945#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4946
4947static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4948{
4949 unsigned short ggc;
4950
4951 if (risky_device(pdev: dev))
4952 return;
4953
4954 if (pci_read_config_word(dev, GGC, val: &ggc))
4955 return;
4956
4957 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4958 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4959 dmar_map_gfx = 0;
4960 } else if (dmar_map_gfx) {
4961 /* we have to ensure the gfx device is idle before we flush */
4962 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4963 iommu_set_dma_strict();
4964 }
4965}
4966DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4967DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4968DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4969DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4970
4971static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4972{
4973 unsigned short ver;
4974
4975 if (!IS_GFX_DEVICE(dev))
4976 return;
4977
4978 ver = (dev->device >> 8) & 0xff;
4979 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4980 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4981 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4982 return;
4983
4984 if (risky_device(pdev: dev))
4985 return;
4986
4987 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4988 iommu_skip_te_disable = 1;
4989}
4990DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4991
4992/* On Tylersburg chipsets, some BIOSes have been known to enable the
4993 ISOCH DMAR unit for the Azalia sound device, but not give it any
4994 TLB entries, which causes it to deadlock. Check for that. We do
4995 this in a function called from init_dmars(), instead of in a PCI
4996 quirk, because we don't want to print the obnoxious "BIOS broken"
4997 message if VT-d is actually disabled.
4998*/
4999static void __init check_tylersburg_isoch(void)
5000{
5001 struct pci_dev *pdev;
5002 uint32_t vtisochctrl;
5003
5004 /* If there's no Azalia in the system anyway, forget it. */
5005 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, device: 0x3a3e, NULL);
5006 if (!pdev)
5007 return;
5008
5009 if (risky_device(pdev)) {
5010 pci_dev_put(dev: pdev);
5011 return;
5012 }
5013
5014 pci_dev_put(dev: pdev);
5015
5016 /* System Management Registers. Might be hidden, in which case
5017 we can't do the sanity check. But that's OK, because the
5018 known-broken BIOSes _don't_ actually hide it, so far. */
5019 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, device: 0x342e, NULL);
5020 if (!pdev)
5021 return;
5022
5023 if (risky_device(pdev)) {
5024 pci_dev_put(dev: pdev);
5025 return;
5026 }
5027
5028 if (pci_read_config_dword(dev: pdev, where: 0x188, val: &vtisochctrl)) {
5029 pci_dev_put(dev: pdev);
5030 return;
5031 }
5032
5033 pci_dev_put(dev: pdev);
5034
5035 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5036 if (vtisochctrl & 1)
5037 return;
5038
5039 /* Drop all bits other than the number of TLB entries */
5040 vtisochctrl &= 0x1c;
5041
5042 /* If we have the recommended number of TLB entries (16), fine. */
5043 if (vtisochctrl == 0x10)
5044 return;
5045
5046 /* Zero TLB entries? You get to ride the short bus to school. */
5047 if (!vtisochctrl) {
5048 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5049 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5050 dmi_get_system_info(DMI_BIOS_VENDOR),
5051 dmi_get_system_info(DMI_BIOS_VERSION),
5052 dmi_get_system_info(DMI_PRODUCT_VERSION));
5053 iommu_identity_mapping |= IDENTMAP_AZALIA;
5054 return;
5055 }
5056
5057 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5058 vtisochctrl);
5059}
5060
5061/*
 * Here we deal with a device TLB defect where the device may inadvertently
 * issue an ATS invalidation completion before posted writes that used
 * translations matching the invalidation address range have been committed,
 * violating the invalidation completion ordering.
5066 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
5067 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5068 * under the control of the trusted/privileged host device driver must use this
5069 * quirk.
5070 * Device TLBs are invalidated under the following six conditions:
5071 * 1. Device driver does DMA API unmap IOVA
5072 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5073 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5074 * exit_mmap() due to crash
5075 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5076 * VM has to free pages that were unmapped
5077 * 5. Userspace driver unmaps a DMA buffer
5078 * 6. Cache invalidation in vSVA usage (upcoming)
5079 *
5080 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5081 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5082 * invalidate TLB the same way as normal user unmap which will use this quirk.
5083 * The dTLB invalidation after PASID cache flush does not need this quirk.
5084 *
5085 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5086 */
5087void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5088 unsigned long address, unsigned long mask,
5089 u32 pasid, u16 qdep)
5090{
5091 u16 sid;
5092
5093 if (likely(!info->dtlb_extra_inval))
5094 return;
5095
5096 sid = PCI_DEVID(info->bus, info->devfn);
5097 if (pasid == IOMMU_NO_PASID) {
5098 qi_flush_dev_iotlb(iommu: info->iommu, sid, pfsid: info->pfsid,
5099 qdep, addr: address, mask);
5100 } else {
5101 qi_flush_dev_iotlb_pasid(iommu: info->iommu, sid, pfsid: info->pfsid,
5102 pasid, qdep, addr: address, size_order: mask);
5103 }
5104}
5105
5106#define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
5107
5108/*
5109 * Function to submit a command to the enhanced command interface. The
5110 * valid enhanced command descriptions are defined in Table 47 of the
5111 * VT-d spec. The VT-d hardware implementation may support some but not
5112 * all commands, which can be determined by checking the Enhanced
5113 * Command Capability Register.
5114 *
5115 * Return values:
5116 * - 0: Command successful without any error;
5117 * - Negative: software error value;
5118 * - Nonzero positive: failure status code defined in Table 48.
5119 */
5120int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5121{
5122 unsigned long flags;
5123 u64 res;
5124 int ret;
5125
5126 if (!cap_ecmds(iommu->cap))
5127 return -ENODEV;
5128
5129 raw_spin_lock_irqsave(&iommu->register_lock, flags);
5130
5131 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5132 if (res & DMA_ECMD_ECRSP_IP) {
5133 ret = -EBUSY;
5134 goto err;
5135 }
5136
5137 /*
5138 * Unconditionally write the operand B, because
5139 * - There is no side effect if an ecmd doesn't require an
5140 * operand B, but we set the register to some value.
5141 * - It's not invoked in any critical path. The extra MMIO
5142 * write doesn't bring any performance concerns.
5143 */
5144 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5145 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5146
5147 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5148 !(res & DMA_ECMD_ECRSP_IP), res);
5149
5150 if (res & DMA_ECMD_ECRSP_IP) {
5151 ret = -ETIMEDOUT;
5152 goto err;
5153 }
5154
5155 ret = ecmd_get_status_code(res);
5156err:
5157 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5158
5159 return ret;
5160}
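
/*
 * Illustrative example (hypothetical caller; the command code and operand
 * below are placeholders, not taken from this file): the three-way return
 * convention documented above would typically be handled as
 *
 *	ret = ecmd_submit_sync(iommu, ecmd, oa, 0);
 *	if (ret < 0)		(software error, e.g. -ENODEV or -ETIMEDOUT)
 *		return ret;
 *	else if (ret)		(hardware failure status from Table 48)
 *		return -EIO;
 *
 * with "ecmd" and "oa" being a command code and operand A that the
 * hardware advertises support for in its Enhanced Command Capability
 * Register.
 */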
5161
