1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt) "DMAR: " fmt
14#define dev_fmt(fmt) pr_fmt(fmt)
15
16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
18#include <linux/dmi.h>
19#include <linux/memory.h>
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24#include <linux/tboot.h>
25#include <uapi/linux/iommufd.h>
26
27#include "iommu.h"
28#include "../dma-iommu.h"
29#include "../irq_remapping.h"
30#include "pasid.h"
31#include "cap_audit.h"
32#include "perfmon.h"
33
34#define ROOT_SIZE VTD_PAGE_SIZE
35#define CONTEXT_SIZE VTD_PAGE_SIZE
36
37#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
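/*
 * Decode example (illustrative): a device with pdev->class == 0x030000
 * (VGA controller) satisfies IS_GFX_DEVICE() because class >> 16 equals
 * PCI_BASE_CLASS_DISPLAY (0x03); IS_USB_DEVICE() and IS_ISA_DEVICE()
 * compare the full two-byte base/sub-class value instead.
 */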
41
42#define IOAPIC_RANGE_START (0xfee00000)
43#define IOAPIC_RANGE_END (0xfeefffff)
44#define IOVA_START_ADDR (0x1000)
45
46#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50
/*
 * We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
 * to match. That way, we can use 'unsigned long' for PFNs with impunity.
 */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
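/*
 * Worked example (illustrative): with gaw = 48 and VTD_PAGE_SHIFT = 12,
 * __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1 and __DOMAIN_MAX_ADDR(48) =
 * (1ULL << 48) - 1. The min_t() clamp in DOMAIN_MAX_PFN() only matters
 * when the PFN would not fit in an unsigned long, e.g. on 32-bit builds,
 * where it caps the value at (unsigned long)-1.
 */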
56
57/* IO virtual address start page frame number */
58#define IOVA_START_PFN (1)
59
60#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
61
62static void __init check_tylersburg_isoch(void);
63static int rwbf_quirk;
64
/*
 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
 * (used when the kernel is launched with TXT).
 */
static int force_on = 0;
static int intel_iommu_tboot_noforce;
static int no_platform_optin;
72
73#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
74
75/*
76 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
77 * if marked present.
78 */
79static phys_addr_t root_entry_lctp(struct root_entry *re)
80{
81 if (!(re->lo & 1))
82 return 0;
83
84 return re->lo & VTD_PAGE_MASK;
85}
86
87/*
88 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
89 * if marked present.
90 */
91static phys_addr_t root_entry_uctp(struct root_entry *re)
92{
93 if (!(re->hi & 1))
94 return 0;
95
96 return re->hi & VTD_PAGE_MASK;
97}
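/*
 * Layout note (illustrative): bit 0 of each half of the root entry is the
 * present bit and VTD_PAGE_MASK keeps the 4KiB-aligned table address, so a
 * value such as re->lo == 0x12345001 decodes to "present, lower context
 * table at physical address 0x12345000".
 */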
98
99static int device_rid_cmp_key(const void *key, const struct rb_node *node)
100{
101 struct device_domain_info *info =
102 rb_entry(node, struct device_domain_info, node);
103 const u16 *rid_lhs = key;
104
105 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
106 return -1;
107
108 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
109 return 1;
110
111 return 0;
112}
113
static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
{
	struct device_domain_info *info =
		rb_entry(lhs, struct device_domain_info, node);
	u16 key = PCI_DEVID(info->bus, info->devfn);

	return device_rid_cmp_key(&key, rhs);
}
122
/*
 * Looks up an IOMMU-probed device using its source ID.
 *
 * Returns the pointer to the device if there is a match. Otherwise,
 * returns NULL.
 *
 * Note that this helper doesn't guarantee that the device won't be
 * released by the iommu subsystem after being returned. If that is a
 * possibility, the caller must use its own synchronization mechanism
 * to prevent the device from being released while it is in use.
 */
struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
{
	struct device_domain_info *info = NULL;
	struct rb_node *node;
	unsigned long flags;

	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
	if (node)
		info = rb_entry(node, struct device_domain_info, node);
	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);

	return info ? info->dev : NULL;
}
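/*
 * Usage sketch (hypothetical caller): given a fault record carrying a
 * source ID, a caller could resolve it to a struct device with
 *
 *	struct device *dev = device_rbtree_find(iommu, source_id);
 *	if (dev)
 *		dev_warn(dev, "fault reported for this device\n");
 *
 * subject to the synchronization caveat documented above.
 */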
148
static int device_rbtree_insert(struct intel_iommu *iommu,
				struct device_domain_info *info)
{
	struct rb_node *curr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
	if (WARN_ON(curr))
		return -EEXIST;

	return 0;
}
163
static void device_rbtree_remove(struct device_domain_info *info)
{
	struct intel_iommu *iommu = info->iommu;
	unsigned long flags;

	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
	rb_erase(&info->node, &iommu->device_rbtree);
	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
}
173
/*
 * This domain is a statically identity mapping domain.
 * 1. This domain creates a static 1:1 mapping to all usable memory.
 * 2. It maps to each iommu if successful.
 * 3. Each iommu maps to this domain if successful.
 */
180static struct dmar_domain *si_domain;
181static int hw_pass_through = 1;
182
183struct dmar_rmrr_unit {
184 struct list_head list; /* list of rmrr units */
185 struct acpi_dmar_header *hdr; /* ACPI header */
186 u64 base_address; /* reserved base address*/
187 u64 end_address; /* reserved end address */
188 struct dmar_dev_scope *devices; /* target devices */
189 int devices_cnt; /* target device count */
190};
191
192struct dmar_atsr_unit {
193 struct list_head list; /* list of ATSR units */
194 struct acpi_dmar_header *hdr; /* ACPI header */
195 struct dmar_dev_scope *devices; /* target devices */
196 int devices_cnt; /* target device count */
197 u8 include_all:1; /* include all ports */
198};
199
200struct dmar_satc_unit {
201 struct list_head list; /* list of SATC units */
202 struct acpi_dmar_header *hdr; /* ACPI header */
203 struct dmar_dev_scope *devices; /* target devices */
204 struct intel_iommu *iommu; /* the corresponding iommu */
205 int devices_cnt; /* target device count */
206 u8 atc_required:1; /* ATS is required */
207};
208
209static LIST_HEAD(dmar_atsr_units);
210static LIST_HEAD(dmar_rmrr_units);
211static LIST_HEAD(dmar_satc_units);
212
213#define for_each_rmrr_units(rmrr) \
214 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
215
216static void intel_iommu_domain_free(struct iommu_domain *domain);
217
218int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
219int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
220
221int intel_iommu_enabled = 0;
222EXPORT_SYMBOL_GPL(intel_iommu_enabled);
223
224static int dmar_map_gfx = 1;
225static int intel_iommu_superpage = 1;
226static int iommu_identity_mapping;
227static int iommu_skip_te_disable;
228
229#define IDENTMAP_GFX 2
230#define IDENTMAP_AZALIA 4
231
232const struct iommu_ops intel_iommu_ops;
233static const struct iommu_dirty_ops intel_dirty_ops;
234
235static bool translation_pre_enabled(struct intel_iommu *iommu)
236{
237 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
238}
239
240static void clear_translation_pre_enabled(struct intel_iommu *iommu)
241{
242 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
243}
244
static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}
253
254static int __init intel_iommu_setup(char *str)
255{
256 if (!str)
257 return -EINVAL;
258
259 while (*str) {
260 if (!strncmp(str, "on", 2)) {
261 dmar_disabled = 0;
262 pr_info("IOMMU enabled\n");
263 } else if (!strncmp(str, "off", 3)) {
264 dmar_disabled = 1;
265 no_platform_optin = 1;
266 pr_info("IOMMU disabled\n");
267 } else if (!strncmp(str, "igfx_off", 8)) {
268 dmar_map_gfx = 0;
269 pr_info("Disable GFX device mapping\n");
270 } else if (!strncmp(str, "forcedac", 8)) {
271 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
272 iommu_dma_forcedac = true;
273 } else if (!strncmp(str, "strict", 6)) {
274 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
275 iommu_set_dma_strict();
276 } else if (!strncmp(str, "sp_off", 6)) {
277 pr_info("Disable supported super page\n");
278 intel_iommu_superpage = 0;
279 } else if (!strncmp(str, "sm_on", 5)) {
280 pr_info("Enable scalable mode if hardware supports\n");
281 intel_iommu_sm = 1;
282 } else if (!strncmp(str, "sm_off", 6)) {
283 pr_info("Scalable mode is disallowed\n");
284 intel_iommu_sm = 0;
285 } else if (!strncmp(str, "tboot_noforce", 13)) {
286 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
287 intel_iommu_tboot_noforce = 1;
288 } else {
289 pr_notice("Unknown option - '%s'\n", str);
290 }
291
292 str += strcspn(str, ",");
293 while (*str == ',')
294 str++;
295 }
296
297 return 1;
298}
299__setup("intel_iommu=", intel_iommu_setup);
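/*
 * Example (illustrative): the options above are comma-separated, so a
 * kernel command line of
 *
 *	intel_iommu=on,sm_on,igfx_off
 *
 * enables the IOMMU, asks for scalable mode where the hardware supports
 * it, and skips mapping of the integrated graphics device.
 */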
300
void *alloc_pgtable_page(int node, gfp_t gfp)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
316
static int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
}

static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}
328
329/*
330 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
331 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
332 * the returned SAGAW.
333 */
334static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
335{
336 unsigned long fl_sagaw, sl_sagaw;
337
338 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
339 sl_sagaw = cap_sagaw(iommu->cap);
340
341 /* Second level only. */
342 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
343 return sl_sagaw;
344
345 /* First level only. */
346 if (!ecap_slts(iommu->ecap))
347 return fl_sagaw;
348
349 return fl_sagaw & sl_sagaw;
350}
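/*
 * Worked example (illustrative): BIT(2) encodes 4-level (48-bit) and
 * BIT(3) encodes 5-level (57-bit) paging per the SAGAW definition above.
 * On an IOMMU with both translation types, no 5-level first-level support
 * and cap_sagaw() == BIT(2), the intersection fl_sagaw & sl_sagaw is
 * BIT(2), i.e. only 4-level tables may be used.
 */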
351
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw;

	sagaw = __iommu_calculate_sagaw(iommu);
	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}
365
366/*
367 * Calculate max SAGAW for each iommu.
368 */
369int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
370{
371 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
372}
373
/*
 * Calculate the AGAW for each IOMMU.
 * "SAGAW" may differ across IOMMUs, so start from a default AGAW and
 * fall back to a smaller supported AGAW for IOMMUs that don't support
 * the default.
 */
379int iommu_calculate_agaw(struct intel_iommu *iommu)
380{
381 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
382}
383
384static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
385{
386 return sm_supported(iommu) ?
387 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
388}
389
390static void domain_update_iommu_coherency(struct dmar_domain *domain)
391{
392 struct iommu_domain_info *info;
393 struct dmar_drhd_unit *drhd;
394 struct intel_iommu *iommu;
395 bool found = false;
396 unsigned long i;
397
398 domain->iommu_coherency = true;
	xa_for_each(&domain->iommu_array, i, info) {
		found = true;
		if (!iommu_paging_structure_coherency(info->iommu)) {
			domain->iommu_coherency = false;
			break;
		}
	}
406 if (found)
407 return;
408
409 /* No hardware attached; use lowest common denominator */
410 rcu_read_lock();
411 for_each_active_iommu(iommu, drhd) {
412 if (!iommu_paging_structure_coherency(iommu)) {
413 domain->iommu_coherency = false;
414 break;
415 }
416 }
417 rcu_read_unlock();
418}
419
420static int domain_update_iommu_superpage(struct dmar_domain *domain,
421 struct intel_iommu *skip)
422{
423 struct dmar_drhd_unit *drhd;
424 struct intel_iommu *iommu;
425 int mask = 0x3;
426
427 if (!intel_iommu_superpage)
428 return 0;
429
430 /* set iommu_superpage to the smallest common denominator */
431 rcu_read_lock();
432 for_each_active_iommu(iommu, drhd) {
433 if (iommu != skip) {
434 if (domain && domain->use_first_level) {
435 if (!cap_fl1gp_support(iommu->cap))
436 mask = 0x1;
437 } else {
438 mask &= cap_super_page_val(iommu->cap);
439 }
440
441 if (!mask)
442 break;
443 }
444 }
445 rcu_read_unlock();
446
447 return fls(x: mask);
448}
449
450static int domain_update_device_node(struct dmar_domain *domain)
451{
452 struct device_domain_info *info;
453 int nid = NUMA_NO_NODE;
454 unsigned long flags;
455
456 spin_lock_irqsave(&domain->lock, flags);
457 list_for_each_entry(info, &domain->devices, link) {
		/*
		 * There could be multiple device NUMA nodes, as devices
		 * within the same domain may sit behind different IOMMUs.
		 * There is no perfect answer in such a situation, so pick
		 * the first device with a known node (first come, first
		 * served).
		 */
464 nid = dev_to_node(dev: info->dev);
465 if (nid != NUMA_NO_NODE)
466 break;
467 }
468 spin_unlock_irqrestore(lock: &domain->lock, flags);
469
470 return nid;
471}
472
473/* Return the super pagesize bitmap if supported. */
474static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
475{
476 unsigned long bitmap = 0;
477
478 /*
479 * 1-level super page supports page size of 2MiB, 2-level super page
480 * supports page size of both 2MiB and 1GiB.
481 */
482 if (domain->iommu_superpage == 1)
483 bitmap |= SZ_2M;
484 else if (domain->iommu_superpage == 2)
485 bitmap |= SZ_2M | SZ_1G;
486
487 return bitmap;
488}
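/*
 * Illustrative values: iommu_superpage == 1 yields a bitmap of SZ_2M,
 * iommu_superpage == 2 yields SZ_2M | SZ_1G, and 0 leaves the bitmap
 * empty so no super-page sizes are advertised.
 */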
489
490/* Some capabilities may be different across iommus */
491void domain_update_iommu_cap(struct dmar_domain *domain)
492{
493 domain_update_iommu_coherency(domain);
494 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
495
	/*
	 * If RHSA is missing, default to the device NUMA domain as a
	 * fallback.
	 */
500 if (domain->nid == NUMA_NO_NODE)
501 domain->nid = domain_update_device_node(domain);
502
503 /*
504 * First-level translation restricts the input-address to a
505 * canonical address (i.e., address bits 63:N have the same
506 * value as address bit [N-1], where N is 48-bits with 4-level
507 * paging and 57-bits with 5-level paging). Hence, skip bit
508 * [N-1].
509 */
510 if (domain->use_first_level)
511 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
512 else
513 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
514
515 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
516 domain_update_iotlb(domain);
517}
518
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	/*
	 * Unless the caller requested to allocate a new entry, returning
	 * a copied context entry makes no sense.
	 */
	if (!alloc && context_copied(iommu, bus, devfn))
		return NULL;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;

		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}
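/*
 * Index example (illustrative): in scalable mode each half of the root
 * entry covers 128 devfns with 256-bit context entries. For devfn 0x83
 * the code above selects root->hi, rebases devfn to 0x03 and doubles it,
 * so the returned pointer is &context[6] within the upper context table.
 */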
559
560/**
561 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
562 * sub-hierarchy of a candidate PCI-PCI bridge
563 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
564 * @bridge: the candidate PCI-PCI bridge
565 *
566 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
567 */
568static bool
569is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
570{
571 struct pci_dev *pdev, *pbridge;
572
573 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
574 return false;
575
576 pdev = to_pci_dev(dev);
577 pbridge = to_pci_dev(bridge);
578
579 if (pbridge->subordinate &&
580 pbridge->subordinate->number <= pdev->bus->number &&
581 pbridge->subordinate->busn_res.end >= pdev->bus->number)
582 return true;
583
584 return false;
585}
586
587static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
588{
589 struct dmar_drhd_unit *drhd;
590 u32 vtbar;
591 int rc;
592
593 /* We know that this device on this chipset has its own IOMMU.
594 * If we find it under a different IOMMU, then the BIOS is lying
595 * to us. Hope that the IOMMU for this device is actually
596 * disabled, and it needs no translation...
597 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return false;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
		return true;
	}
613
614 return false;
615}
616
617static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
618{
619 if (!iommu || iommu->drhd->ignored)
620 return true;
621
622 if (dev_is_pci(dev)) {
623 struct pci_dev *pdev = to_pci_dev(dev);
624
625 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
626 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
627 quirk_ioat_snb_local_iommu(pdev))
628 return true;
629 }
630
631 return false;
632}
633
634static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
635{
636 struct dmar_drhd_unit *drhd = NULL;
637 struct pci_dev *pdev = NULL;
638 struct intel_iommu *iommu;
639 struct device *tmp;
640 u16 segment = 0;
641 int i;
642
643 if (!dev)
644 return NULL;
645
646 if (dev_is_pci(dev)) {
647 struct pci_dev *pf_pdev;
648
649 pdev = pci_real_dma_dev(to_pci_dev(dev));
650
651 /* VFs aren't listed in scope tables; we need to look up
652 * the PF instead to find the IOMMU. */
653 pf_pdev = pci_physfn(dev: pdev);
654 dev = &pf_pdev->dev;
655 segment = pci_domain_nr(bus: pdev->bus);
656 } else if (has_acpi_companion(dev))
657 dev = &ACPI_COMPANION(dev)->dev;
658
659 rcu_read_lock();
660 for_each_iommu(iommu, drhd) {
661 if (pdev && segment != drhd->segment)
662 continue;
663
664 for_each_active_dev_scope(drhd->devices,
665 drhd->devices_cnt, i, tmp) {
666 if (tmp == dev) {
667 /* For a VF use its original BDF# not that of the PF
668 * which we used for the IOMMU lookup. Strictly speaking
669 * we could do this for all PCI devices; we only need to
670 * get the BDF# from the scope table for ACPI matches. */
671 if (pdev && pdev->is_virtfn)
672 goto got_pdev;
673
674 if (bus && devfn) {
675 *bus = drhd->devices[i].bus;
676 *devfn = drhd->devices[i].devfn;
677 }
678 goto out;
679 }
680
681 if (is_downstream_to_pci_bridge(dev, bridge: tmp))
682 goto got_pdev;
683 }
684
685 if (pdev && drhd->include_all) {
686got_pdev:
687 if (bus && devfn) {
688 *bus = pdev->bus->number;
689 *devfn = pdev->devfn;
690 }
691 goto out;
692 }
693 }
694 iommu = NULL;
695out:
696 if (iommu_is_dummy(iommu, dev))
697 iommu = NULL;
698
699 rcu_read_unlock();
700
701 return iommu;
702}
703
704static void domain_flush_cache(struct dmar_domain *domain,
705 void *addr, int size)
706{
707 if (!domain->iommu_coherency)
708 clflush_cache_range(addr, size);
709}
710
711static void free_context_table(struct intel_iommu *iommu)
712{
713 struct context_entry *context;
714 int i;
715
716 if (!iommu->root_entry)
717 return;
718
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);
	}

	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
734}
735
736#ifdef CONFIG_DMAR_DEBUG
737static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
738 u8 bus, u8 devfn, struct dma_pte *parent, int level)
739{
740 struct dma_pte *pte;
741 int offset;
742
743 while (1) {
744 offset = pfn_level_offset(pfn, level);
745 pte = &parent[offset];
746 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
747 pr_info("PTE not present at level %d\n", level);
748 break;
749 }
750
751 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
752
753 if (level == 1)
754 break;
755
756 parent = phys_to_virt(address: dma_pte_addr(pte));
757 level--;
758 }
759}
760
761void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
762 unsigned long long addr, u32 pasid)
763{
764 struct pasid_dir_entry *dir, *pde;
765 struct pasid_entry *entries, *pte;
766 struct context_entry *ctx_entry;
767 struct root_entry *rt_entry;
768 int i, dir_index, index, level;
769 u8 devfn = source_id & 0xff;
770 u8 bus = source_id >> 8;
771 struct dma_pte *pgtable;
772
773 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
774
775 /* root entry dump */
776 rt_entry = &iommu->root_entry[bus];
777 if (!rt_entry) {
778 pr_info("root table entry is not present\n");
779 return;
780 }
781
782 if (sm_supported(iommu))
783 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
784 rt_entry->hi, rt_entry->lo);
785 else
786 pr_info("root entry: 0x%016llx", rt_entry->lo);
787
788 /* context entry dump */
789 ctx_entry = iommu_context_addr(iommu, bus, devfn, alloc: 0);
790 if (!ctx_entry) {
791 pr_info("context table entry is not present\n");
792 return;
793 }
794
795 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
796 ctx_entry->hi, ctx_entry->lo);
797
798 /* legacy mode does not require PASID entries */
799 if (!sm_supported(iommu)) {
800 level = agaw_to_level(agaw: ctx_entry->hi & 7);
801 pgtable = phys_to_virt(address: ctx_entry->lo & VTD_PAGE_MASK);
802 goto pgtable_walk;
803 }
804
805 /* get the pointer to pasid directory entry */
806 dir = phys_to_virt(address: ctx_entry->lo & VTD_PAGE_MASK);
807 if (!dir) {
808 pr_info("pasid directory entry is not present\n");
809 return;
810 }
811 /* For request-without-pasid, get the pasid from context entry */
812 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
813 pasid = IOMMU_NO_PASID;
814
815 dir_index = pasid >> PASID_PDE_SHIFT;
816 pde = &dir[dir_index];
817 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
818
819 /* get the pointer to the pasid table entry */
820 entries = get_pasid_table_from_pde(pde);
821 if (!entries) {
822 pr_info("pasid table entry is not present\n");
823 return;
824 }
825 index = pasid & PASID_PTE_MASK;
826 pte = &entries[index];
827 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
828 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
829
830 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
831 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
832 pgtable = phys_to_virt(address: pte->val[2] & VTD_PAGE_MASK);
833 } else {
834 level = agaw_to_level(agaw: (pte->val[0] >> 2) & 0x7);
835 pgtable = phys_to_virt(address: pte->val[0] & VTD_PAGE_MASK);
836 }
837
838pgtable_walk:
839 pgtable_walk(iommu, pfn: addr >> VTD_PAGE_SHIFT, bus, devfn, parent: pgtable, level);
840}
841#endif
842
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level,
				      gfp_t gfp)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid, gfp);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain->use_first_level)
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;

			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
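/*
 * Walk example (illustrative): with a 4-level page table (48-bit AGAW)
 * and pfn 0x12345, pfn_level_offset() extracts 9 bits per level, giving
 * table indexes 0, 0, 0x91 and 0x145 from the top level down to level 1.
 */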
898
899/* return address's pte at specific level */
900static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
901 unsigned long pfn,
902 int level, int *large_page)
903{
904 struct dma_pte *parent, *pte;
905 int total = agaw_to_level(agaw: domain->agaw);
906 int offset;
907
908 parent = domain->pgd;
909 while (level <= total) {
910 offset = pfn_level_offset(pfn, level: total);
911 pte = &parent[offset];
912 if (level == total)
913 return pte;
914
915 if (!dma_pte_present(pte)) {
916 *large_page = total;
917 break;
918 }
919
920 if (dma_pte_superpage(pte)) {
921 *large_page = total;
922 return pte;
923 }
924
925 parent = phys_to_virt(address: dma_pte_addr(pte));
926 total--;
927 }
928 return NULL;
929}
930
931/* clear last level pte, a tlb flush should be followed */
932static void dma_pte_clear_range(struct dmar_domain *domain,
933 unsigned long start_pfn,
934 unsigned long last_pfn)
935{
936 unsigned int large_page;
937 struct dma_pte *first_pte, *pte;
938
939 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
940 WARN_ON(start_pfn > last_pfn))
941 return;
942
943 /* we don't need lock here; nobody else touches the iova range */
944 do {
945 large_page = 1;
946 first_pte = pte = dma_pfn_level_pte(domain, pfn: start_pfn, level: 1, large_page: &large_page);
947 if (!pte) {
948 start_pfn = align_to_level(pfn: start_pfn + 1, level: large_page + 1);
949 continue;
950 }
951 do {
952 dma_clear_pte(pte);
953 start_pfn += lvl_to_nr_pages(lvl: large_page);
954 pte++;
955 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
956
957 domain_flush_cache(domain, addr: first_pte,
958 size: (void *)pte - (void *)first_pte);
959
960 } while (start_pfn && start_pfn <= last_pfn);
961}
962
963static void dma_pte_free_level(struct dmar_domain *domain, int level,
964 int retain_level, struct dma_pte *pte,
965 unsigned long pfn, unsigned long start_pfn,
966 unsigned long last_pfn)
967{
968 pfn = max(start_pfn, pfn);
969 pte = &pte[pfn_level_offset(pfn, level)];
970
971 do {
972 unsigned long level_pfn;
973 struct dma_pte *level_pte;
974
975 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
976 goto next;
977
978 level_pfn = pfn & level_mask(level);
979 level_pte = phys_to_virt(address: dma_pte_addr(pte));
980
981 if (level > 2) {
982 dma_pte_free_level(domain, level: level - 1, retain_level,
983 pte: level_pte, pfn: level_pfn, start_pfn,
984 last_pfn);
985 }
986
987 /*
988 * Free the page table if we're below the level we want to
989 * retain and the range covers the entire table.
990 */
991 if (level < retain_level && !(start_pfn > level_pfn ||
992 last_pfn < level_pfn + level_size(level) - 1)) {
993 dma_clear_pte(pte);
994 domain_flush_cache(domain, addr: pte, size: sizeof(*pte));
995 free_pgtable_page(vaddr: level_pte);
996 }
997next:
998 pfn += level_size(level);
999 } while (!first_pte_in_page(pte: ++pte) && pfn <= last_pfn);
1000}
1001
1002/*
1003 * clear last level (leaf) ptes and free page table pages below the
1004 * level we wish to keep intact.
1005 */
1006static void dma_pte_free_pagetable(struct dmar_domain *domain,
1007 unsigned long start_pfn,
1008 unsigned long last_pfn,
1009 int retain_level)
1010{
1011 dma_pte_clear_range(domain, start_pfn, last_pfn);
1012
1013 /* We don't need lock here; nobody else touches the iova range */
1014 dma_pte_free_level(domain, level: agaw_to_level(agaw: domain->agaw), retain_level,
1015 pte: domain->pgd, pfn: 0, start_pfn, last_pfn);
1016
1017 /* free pgd */
1018 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1019 free_pgtable_page(vaddr: domain->pgd);
1020 domain->pgd = NULL;
1021 }
1022}
1023
1024/* When a page at a given level is being unlinked from its parent, we don't
1025 need to *modify* it at all. All we need to do is make a list of all the
1026 pages which can be freed just as soon as we've flushed the IOTLB and we
1027 know the hardware page-walk will no longer touch them.
1028 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1029 be freed. */
1030static void dma_pte_list_pagetables(struct dmar_domain *domain,
1031 int level, struct dma_pte *pte,
1032 struct list_head *freelist)
1033{
1034 struct page *pg;
1035
1036 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1037 list_add_tail(new: &pg->lru, head: freelist);
1038
1039 if (level == 1)
1040 return;
1041
1042 pte = page_address(pg);
1043 do {
1044 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1045 dma_pte_list_pagetables(domain, level: level - 1, pte, freelist);
1046 pte++;
1047 } while (!first_pte_in_page(pte));
1048}
1049
1050static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1051 struct dma_pte *pte, unsigned long pfn,
1052 unsigned long start_pfn, unsigned long last_pfn,
1053 struct list_head *freelist)
1054{
1055 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1056
1057 pfn = max(start_pfn, pfn);
1058 pte = &pte[pfn_level_offset(pfn, level)];
1059
1060 do {
1061 unsigned long level_pfn = pfn & level_mask(level);
1062
1063 if (!dma_pte_present(pte))
1064 goto next;
1065
1066 /* If range covers entire pagetable, free it */
1067 if (start_pfn <= level_pfn &&
1068 last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
1071 if (level > 1 && !dma_pte_superpage(pte))
1072 dma_pte_list_pagetables(domain, level: level - 1, pte, freelist);
1073
1074 dma_clear_pte(pte);
1075 if (!first_pte)
1076 first_pte = pte;
1077 last_pte = pte;
1078 } else if (level > 1) {
1079 /* Recurse down into a level that isn't *entirely* obsolete */
1080 dma_pte_clear_level(domain, level: level - 1,
1081 phys_to_virt(address: dma_pte_addr(pte)),
1082 pfn: level_pfn, start_pfn, last_pfn,
1083 freelist);
1084 }
1085next:
1086 pfn = level_pfn + level_size(level);
1087 } while (!first_pte_in_page(pte: ++pte) && pfn <= last_pfn);
1088
1089 if (first_pte)
1090 domain_flush_cache(domain, addr: first_pte,
1091 size: (void *)++last_pte - (void *)first_pte);
1092}
1093
1094/* We can't just free the pages because the IOMMU may still be walking
1095 the page tables, and may have cached the intermediate levels. The
1096 pages can only be freed after the IOTLB flush has been done. */
1097static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1098 unsigned long last_pfn, struct list_head *freelist)
1099{
1100 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1101 WARN_ON(start_pfn > last_pfn))
1102 return;
1103
1104 /* we don't need lock here; nobody else touches the iova range */
1105 dma_pte_clear_level(domain, level: agaw_to_level(agaw: domain->agaw),
1106 pte: domain->pgd, pfn: 0, start_pfn, last_pfn, freelist);
1107
1108 /* free pgd */
1109 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1110 struct page *pgd_page = virt_to_page(domain->pgd);
1111 list_add_tail(new: &pgd_page->lru, head: freelist);
1112 domain->pgd = NULL;
1113 }
1114}
1115
1116/* iommu handling */
1117static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1118{
1119 struct root_entry *root;
1120
1121 root = alloc_pgtable_page(node: iommu->node, GFP_ATOMIC);
1122 if (!root) {
1123 pr_err("Allocating root entry for %s failed\n",
1124 iommu->name);
1125 return -ENOMEM;
1126 }
1127
1128 __iommu_flush_cache(iommu, addr: root, ROOT_SIZE);
1129 iommu->root_entry = root;
1130
1131 return 0;
1132}
1133
1134static void iommu_set_root_entry(struct intel_iommu *iommu)
1135{
1136 u64 addr;
1137 u32 sts;
1138 unsigned long flag;
1139
1140 addr = virt_to_phys(address: iommu->root_entry);
1141 if (sm_supported(iommu))
1142 addr |= DMA_RTADDR_SMT;
1143
1144 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1145 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1146
1147 writel(val: iommu->gcmd | DMA_GCMD_SRTP, addr: iommu->reg + DMAR_GCMD_REG);
1148
1149 /* Make sure hardware complete it */
1150 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1151 readl, (sts & DMA_GSTS_RTPS), sts);
1152
1153 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1154
1155 /*
1156 * Hardware invalidates all DMA remapping hardware translation
1157 * caches as part of SRTP flow.
1158 */
1159 if (cap_esrtps(iommu->cap))
1160 return;
1161
1162 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1163 if (sm_supported(iommu))
1164 qi_flush_pasid_cache(iommu, did: 0, QI_PC_GLOBAL, pasid: 0);
1165 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1166}
1167
1168void iommu_flush_write_buffer(struct intel_iommu *iommu)
1169{
1170 u32 val;
1171 unsigned long flag;
1172
1173 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1174 return;
1175
1176 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1177 writel(val: iommu->gcmd | DMA_GCMD_WBF, addr: iommu->reg + DMAR_GCMD_REG);
1178
1179 /* Make sure hardware complete it */
1180 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1181 readl, (!(val & DMA_GSTS_WBFS)), val);
1182
1183 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1184}
1185
/* return value determines if we need a write buffer flush */
1187static void __iommu_flush_context(struct intel_iommu *iommu,
1188 u16 did, u16 source_id, u8 function_mask,
1189 u64 type)
1190{
1191 u64 val = 0;
1192 unsigned long flag;
1193
1194 switch (type) {
1195 case DMA_CCMD_GLOBAL_INVL:
1196 val = DMA_CCMD_GLOBAL_INVL;
1197 break;
1198 case DMA_CCMD_DOMAIN_INVL:
1199 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1200 break;
1201 case DMA_CCMD_DEVICE_INVL:
1202 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1203 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1204 break;
1205 default:
1206 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1207 iommu->name, type);
1208 return;
1209 }
1210 val |= DMA_CCMD_ICC;
1211
1212 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1213 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1214
1215 /* Make sure hardware complete it */
1216 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1217 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1218
1219 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220}
1221
/* return value determines if we need a write buffer flush */
1223static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1224 u64 addr, unsigned int size_order, u64 type)
1225{
1226 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1227 u64 val = 0, val_iva = 0;
1228 unsigned long flag;
1229
1230 switch (type) {
1231 case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
1233 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1234 break;
1235 case DMA_TLB_DSI_FLUSH:
1236 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1237 break;
1238 case DMA_TLB_PSI_FLUSH:
1239 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1240 /* IH bit is passed in as part of address */
1241 val_iva = size_order | addr;
1242 break;
1243 default:
1244 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1245 iommu->name, type);
1246 return;
1247 }
1248
1249 if (cap_write_drain(iommu->cap))
1250 val |= DMA_TLB_WRITE_DRAIN;
1251
1252 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1253 /* Note: Only uses first TLB reg currently */
1254 if (val_iva)
1255 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1256 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1257
1258 /* Make sure hardware complete it */
1259 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1260 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1261
1262 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1263
1264 /* check IOTLB invalidation granularity */
1265 if (DMA_TLB_IAIG(val) == 0)
1266 pr_err("Flush IOTLB failed\n");
1267 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1268 pr_debug("TLB flush request %Lx, actual %Lx\n",
1269 (unsigned long long)DMA_TLB_IIRG(type),
1270 (unsigned long long)DMA_TLB_IAIG(val));
1271}
1272
1273static struct device_domain_info *
1274domain_lookup_dev_info(struct dmar_domain *domain,
1275 struct intel_iommu *iommu, u8 bus, u8 devfn)
1276{
1277 struct device_domain_info *info;
1278 unsigned long flags;
1279
1280 spin_lock_irqsave(&domain->lock, flags);
1281 list_for_each_entry(info, &domain->devices, link) {
1282 if (info->iommu == iommu && info->bus == bus &&
1283 info->devfn == devfn) {
1284 spin_unlock_irqrestore(lock: &domain->lock, flags);
1285 return info;
1286 }
1287 }
1288 spin_unlock_irqrestore(lock: &domain->lock, flags);
1289
1290 return NULL;
1291}
1292
1293void domain_update_iotlb(struct dmar_domain *domain)
1294{
1295 struct dev_pasid_info *dev_pasid;
1296 struct device_domain_info *info;
1297 bool has_iotlb_device = false;
1298 unsigned long flags;
1299
1300 spin_lock_irqsave(&domain->lock, flags);
1301 list_for_each_entry(info, &domain->devices, link) {
1302 if (info->ats_enabled) {
1303 has_iotlb_device = true;
1304 break;
1305 }
1306 }
1307
1308 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1309 info = dev_iommu_priv_get(dev: dev_pasid->dev);
1310 if (info->ats_enabled) {
1311 has_iotlb_device = true;
1312 break;
1313 }
1314 }
1315 domain->has_iotlb_device = has_iotlb_device;
1316 spin_unlock_irqrestore(lock: &domain->lock, flags);
1317}
1318
1319/*
1320 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1321 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1322 * check because it applies only to the built-in QAT devices and it doesn't
1323 * grant additional privileges.
1324 */
1325#define BUGGY_QAT_DEVID_MASK 0x4940
1326static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1327{
1328 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1329 return false;
1330
1331 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1332 return false;
1333
1334 return true;
1335}
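/*
 * Mask check (illustrative): (device & 0xfffc) == 0x4940 matches exactly
 * the four device IDs 0x4940-0x4943 called out in the comment above.
 */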
1336
1337static void iommu_enable_pci_caps(struct device_domain_info *info)
1338{
1339 struct pci_dev *pdev;
1340
1341 if (!dev_is_pci(info->dev))
1342 return;
1343
1344 pdev = to_pci_dev(info->dev);
1345
1346 /* The PCIe spec, in its wisdom, declares that the behaviour of
1347 the device if you enable PASID support after ATS support is
1348 undefined. So always enable PASID support on devices which
1349 have it, even if we can't yet know if we're ever going to
1350 use it. */
1351 if (info->pasid_supported && !pci_enable_pasid(pdev, features: info->pasid_supported & ~1))
1352 info->pasid_enabled = 1;
1353
1354 if (info->ats_supported && pci_ats_page_aligned(dev: pdev) &&
1355 !pci_enable_ats(dev: pdev, VTD_PAGE_SHIFT)) {
1356 info->ats_enabled = 1;
1357 domain_update_iotlb(domain: info->domain);
1358 }
1359}
1360
1361static void iommu_disable_pci_caps(struct device_domain_info *info)
1362{
1363 struct pci_dev *pdev;
1364
1365 if (!dev_is_pci(info->dev))
1366 return;
1367
1368 pdev = to_pci_dev(info->dev);
1369
1370 if (info->ats_enabled) {
1371 pci_disable_ats(dev: pdev);
1372 info->ats_enabled = 0;
1373 domain_update_iotlb(domain: info->domain);
1374 }
1375
1376 if (info->pasid_enabled) {
1377 pci_disable_pasid(pdev);
1378 info->pasid_enabled = 0;
1379 }
1380}
1381
1382static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1383 u64 addr, unsigned int mask)
1384{
1385 u16 sid, qdep;
1386
1387 if (!info || !info->ats_enabled)
1388 return;
1389
1390 sid = info->bus << 8 | info->devfn;
1391 qdep = info->ats_qdep;
1392 qi_flush_dev_iotlb(iommu: info->iommu, sid, pfsid: info->pfsid,
1393 qdep, addr, mask);
1394 quirk_extra_dev_tlb_flush(info, address: addr, pages: mask, IOMMU_NO_PASID, qdep);
1395}
1396
1397static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1398 u64 addr, unsigned mask)
1399{
1400 struct dev_pasid_info *dev_pasid;
1401 struct device_domain_info *info;
1402 unsigned long flags;
1403
1404 if (!domain->has_iotlb_device)
1405 return;
1406
1407 spin_lock_irqsave(&domain->lock, flags);
1408 list_for_each_entry(info, &domain->devices, link)
1409 __iommu_flush_dev_iotlb(info, addr, mask);
1410
1411 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1412 info = dev_iommu_priv_get(dev: dev_pasid->dev);
1413
1414 if (!info->ats_enabled)
1415 continue;
1416
1417 qi_flush_dev_iotlb_pasid(iommu: info->iommu,
1418 PCI_DEVID(info->bus, info->devfn),
1419 pfsid: info->pfsid, pasid: dev_pasid->pasid,
1420 qdep: info->ats_qdep, addr,
1421 size_order: mask);
1422 }
1423 spin_unlock_irqrestore(lock: &domain->lock, flags);
1424}
1425
1426static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1427 struct dmar_domain *domain, u64 addr,
1428 unsigned long npages, bool ih)
1429{
1430 u16 did = domain_id_iommu(domain, iommu);
1431 struct dev_pasid_info *dev_pasid;
1432 unsigned long flags;
1433
1434 spin_lock_irqsave(&domain->lock, flags);
1435 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1436 qi_flush_piotlb(iommu, did, pasid: dev_pasid->pasid, addr, npages, ih);
1437
1438 if (!list_empty(head: &domain->devices))
1439 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1440 spin_unlock_irqrestore(lock: &domain->lock, flags);
1441}
1442
1443static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1444 unsigned long pfn, unsigned int pages,
1445 int ih)
1446{
1447 unsigned int aligned_pages = __roundup_pow_of_two(n: pages);
1448 unsigned long bitmask = aligned_pages - 1;
1449 unsigned int mask = ilog2(aligned_pages);
1450 u64 addr = (u64)pfn << VTD_PAGE_SHIFT;
1451
1452 /*
1453 * PSI masks the low order bits of the base address. If the
1454 * address isn't aligned to the mask, then compute a mask value
1455 * needed to ensure the target range is flushed.
1456 */
1457 if (unlikely(bitmask & pfn)) {
1458 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1459
1460 /*
1461 * Since end_pfn <= pfn + bitmask, the only way bits
1462 * higher than bitmask can differ in pfn and end_pfn is
1463 * by carrying. This means after masking out bitmask,
1464 * high bits starting with the first set bit in
1465 * shared_bits are all equal in both pfn and end_pfn.
1466 */
1467 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1468 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1469 }
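	/*
	 * Worked example (illustrative): pfn = 0x1003, pages = 4 gives
	 * aligned_pages = 4 and bitmask = 0x3. Since pfn is not aligned,
	 * end_pfn = 0x1006, shared_bits = ~(0x1003 ^ 0x1006) & ~0x3 ends in
	 * ...fff8, so mask = __ffs(shared_bits) = 3 and the PSI covers pfns
	 * 0x1000-0x1007, a superset of the requested range.
	 */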
1470
1471 /*
1472 * Fallback to domain selective flush if no PSI support or
1473 * the size is too big.
1474 */
1475 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1476 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1477 DMA_TLB_DSI_FLUSH);
1478 else
1479 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1480 DMA_TLB_PSI_FLUSH);
1481}
1482
1483static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1484 struct dmar_domain *domain,
1485 unsigned long pfn, unsigned int pages,
1486 int ih, int map)
1487{
1488 unsigned int aligned_pages = __roundup_pow_of_two(n: pages);
1489 unsigned int mask = ilog2(aligned_pages);
1490 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1491 u16 did = domain_id_iommu(domain, iommu);
1492
1493 if (WARN_ON(!pages))
1494 return;
1495
1496 if (ih)
1497 ih = 1 << 6;
1498
1499 if (domain->use_first_level)
1500 domain_flush_pasid_iotlb(iommu, domain, addr, npages: pages, ih);
1501 else
1502 __iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih);
1503
1504 /*
1505 * In caching mode, changes of pages from non-present to present require
1506 * flush. However, device IOTLB doesn't need to be flushed in this case.
1507 */
1508 if (!cap_caching_mode(iommu->cap) || !map)
1509 iommu_flush_dev_iotlb(domain, addr, mask);
1510}
1511
1512/* Notification for newly created mappings */
1513static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1514 unsigned long pfn, unsigned int pages)
1515{
1516 /*
1517 * It's a non-present to present mapping. Only flush if caching mode
1518 * and second level.
1519 */
1520 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1521 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, ih: 0, map: 1);
1522 else
1523 iommu_flush_write_buffer(iommu);
1524}
1525
1526/*
1527 * Flush the relevant caches in nested translation if the domain
1528 * also serves as a parent
1529 */
1530static void parent_domain_flush(struct dmar_domain *domain,
1531 unsigned long pfn,
1532 unsigned long pages, int ih)
1533{
1534 struct dmar_domain *s1_domain;
1535
1536 spin_lock(lock: &domain->s1_lock);
1537 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
1538 struct device_domain_info *device_info;
1539 struct iommu_domain_info *info;
1540 unsigned long flags;
1541 unsigned long i;
1542
1543 xa_for_each(&s1_domain->iommu_array, i, info)
1544 __iommu_flush_iotlb_psi(iommu: info->iommu, did: info->did,
1545 pfn, pages, ih);
1546
1547 if (!s1_domain->has_iotlb_device)
1548 continue;
1549
1550 spin_lock_irqsave(&s1_domain->lock, flags);
1551 list_for_each_entry(device_info, &s1_domain->devices, link)
1552 /*
1553 * Address translation cache in device side caches the
1554 * result of nested translation. There is no easy way
1555 * to identify the exact set of nested translations
1556 * affected by a change in S2. So just flush the entire
1557 * device cache.
1558 */
1559 __iommu_flush_dev_iotlb(info: device_info, addr: 0,
1560 MAX_AGAW_PFN_WIDTH);
1561 spin_unlock_irqrestore(lock: &s1_domain->lock, flags);
1562 }
1563 spin_unlock(lock: &domain->s1_lock);
1564}
1565
1566static void intel_flush_iotlb_all(struct iommu_domain *domain)
1567{
1568 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
1569 struct iommu_domain_info *info;
1570 unsigned long idx;
1571
1572 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1573 struct intel_iommu *iommu = info->iommu;
1574 u16 did = domain_id_iommu(domain: dmar_domain, iommu);
1575
1576 if (dmar_domain->use_first_level)
1577 domain_flush_pasid_iotlb(iommu, domain: dmar_domain, addr: 0, npages: -1, ih: 0);
1578 else
1579 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1580 DMA_TLB_DSI_FLUSH);
1581
1582 if (!cap_caching_mode(iommu->cap))
1583 iommu_flush_dev_iotlb(domain: dmar_domain, addr: 0, MAX_AGAW_PFN_WIDTH);
1584 }
1585
1586 if (dmar_domain->nested_parent)
1587 parent_domain_flush(domain: dmar_domain, pfn: 0, pages: -1, ih: 0);
1588}
1589
1590static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1591{
1592 u32 pmen;
1593 unsigned long flags;
1594
1595 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1596 return;
1597
1598 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1599 pmen = readl(addr: iommu->reg + DMAR_PMEN_REG);
1600 pmen &= ~DMA_PMEN_EPM;
1601 writel(val: pmen, addr: iommu->reg + DMAR_PMEN_REG);
1602
1603 /* wait for the protected region status bit to clear */
1604 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1605 readl, !(pmen & DMA_PMEN_PRS), pmen);
1606
1607 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1608}
1609
1610static void iommu_enable_translation(struct intel_iommu *iommu)
1611{
1612 u32 sts;
1613 unsigned long flags;
1614
1615 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1616 iommu->gcmd |= DMA_GCMD_TE;
1617 writel(val: iommu->gcmd, addr: iommu->reg + DMAR_GCMD_REG);
1618
1619 /* Make sure hardware complete it */
1620 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1621 readl, (sts & DMA_GSTS_TES), sts);
1622
1623 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1624}
1625
1626static void iommu_disable_translation(struct intel_iommu *iommu)
1627{
1628 u32 sts;
1629 unsigned long flag;
1630
1631 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1632 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1633 return;
1634
1635 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1636 iommu->gcmd &= ~DMA_GCMD_TE;
1637 writel(val: iommu->gcmd, addr: iommu->reg + DMAR_GCMD_REG);
1638
1639 /* Make sure hardware complete it */
1640 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1641 readl, (!(sts & DMA_GSTS_TES)), sts);
1642
1643 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1644}
1645
1646static int iommu_init_domains(struct intel_iommu *iommu)
1647{
1648 u32 ndomains;
1649
1650 ndomains = cap_ndoms(iommu->cap);
1651 pr_debug("%s: Number of Domains supported <%d>\n",
1652 iommu->name, ndomains);
1653
1654 spin_lock_init(&iommu->lock);
1655
1656 iommu->domain_ids = bitmap_zalloc(nbits: ndomains, GFP_KERNEL);
1657 if (!iommu->domain_ids)
1658 return -ENOMEM;
1659
1660 /*
1661 * If Caching mode is set, then invalid translations are tagged
1662 * with domain-id 0, hence we need to pre-allocate it. We also
1663 * use domain-id 0 as a marker for non-allocated domain-id, so
1664 * make sure it is not used for a real domain.
1665 */
1666 set_bit(nr: 0, addr: iommu->domain_ids);
1667
1668 /*
1669 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1670 * entry for first-level or pass-through translation modes should
1671 * be programmed with a domain id different from those used for
1672 * second-level or nested translation. We reserve a domain id for
1673 * this purpose.
1674 */
1675 if (sm_supported(iommu))
1676 set_bit(FLPT_DEFAULT_DID, addr: iommu->domain_ids);
1677
1678 return 0;
1679}
1680
1681static void disable_dmar_iommu(struct intel_iommu *iommu)
1682{
1683 if (!iommu->domain_ids)
1684 return;
1685
1686 /*
1687 * All iommu domains must have been detached from the devices,
1688 * hence there should be no domain IDs in use.
1689 */
1690 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1691 > NUM_RESERVED_DID))
1692 return;
1693
1694 if (iommu->gcmd & DMA_GCMD_TE)
1695 iommu_disable_translation(iommu);
1696}
1697
1698static void free_dmar_iommu(struct intel_iommu *iommu)
1699{
1700 if (iommu->domain_ids) {
1701 bitmap_free(bitmap: iommu->domain_ids);
1702 iommu->domain_ids = NULL;
1703 }
1704
1705 if (iommu->copied_tables) {
1706 bitmap_free(bitmap: iommu->copied_tables);
1707 iommu->copied_tables = NULL;
1708 }
1709
1710 /* free context mapping */
1711 free_context_table(iommu);
1712
1713#ifdef CONFIG_INTEL_IOMMU_SVM
1714 if (pasid_supported(iommu)) {
1715 if (ecap_prs(iommu->ecap))
1716 intel_svm_finish_prq(iommu);
1717 }
1718#endif
1719}
1720
1721/*
1722 * Check and return whether first level is used by default for
1723 * DMA translation.
1724 */
1725static bool first_level_by_default(unsigned int type)
1726{
1727 /* Only SL is available in legacy mode */
1728 if (!scalable_mode_support())
1729 return false;
1730
	/* Only one level (either FL or SL) is available, just use it */
1732 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1733 return intel_cap_flts_sanity();
1734
1735 /* Both levels are available, decide it based on domain type */
1736 return type != IOMMU_DOMAIN_UNMANAGED;
1737}
1738
1739static struct dmar_domain *alloc_domain(unsigned int type)
1740{
1741 struct dmar_domain *domain;
1742
1743 domain = kzalloc(size: sizeof(*domain), GFP_KERNEL);
1744 if (!domain)
1745 return NULL;
1746
1747 domain->nid = NUMA_NO_NODE;
1748 if (first_level_by_default(type))
1749 domain->use_first_level = true;
1750 domain->has_iotlb_device = false;
1751 INIT_LIST_HEAD(list: &domain->devices);
1752 INIT_LIST_HEAD(list: &domain->dev_pasids);
1753 spin_lock_init(&domain->lock);
1754 xa_init(xa: &domain->iommu_array);
1755
1756 return domain;
1757}
1758
1759int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1760{
1761 struct iommu_domain_info *info, *curr;
1762 unsigned long ndomains;
1763 int num, ret = -ENOSPC;
1764
1765 info = kzalloc(size: sizeof(*info), GFP_KERNEL);
1766 if (!info)
1767 return -ENOMEM;
1768
1769 spin_lock(lock: &iommu->lock);
1770 curr = xa_load(&domain->iommu_array, index: iommu->seq_id);
1771 if (curr) {
1772 curr->refcnt++;
1773 spin_unlock(lock: &iommu->lock);
1774 kfree(objp: info);
1775 return 0;
1776 }
1777
1778 ndomains = cap_ndoms(iommu->cap);
1779 num = find_first_zero_bit(addr: iommu->domain_ids, size: ndomains);
1780 if (num >= ndomains) {
1781 pr_err("%s: No free domain ids\n", iommu->name);
1782 goto err_unlock;
1783 }
1784
1785 set_bit(nr: num, addr: iommu->domain_ids);
1786 info->refcnt = 1;
1787 info->did = num;
1788 info->iommu = iommu;
1789 curr = xa_cmpxchg(xa: &domain->iommu_array, index: iommu->seq_id,
1790 NULL, entry: info, GFP_ATOMIC);
1791 if (curr) {
1792 ret = xa_err(entry: curr) ? : -EBUSY;
1793 goto err_clear;
1794 }
1795 domain_update_iommu_cap(domain);
1796
1797 spin_unlock(lock: &iommu->lock);
1798 return 0;
1799
1800err_clear:
1801 clear_bit(nr: info->did, addr: iommu->domain_ids);
1802err_unlock:
1803 spin_unlock(lock: &iommu->lock);
1804 kfree(objp: info);
1805 return ret;
1806}
1807
1808void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1809{
1810 struct iommu_domain_info *info;
1811
1812 spin_lock(lock: &iommu->lock);
1813 info = xa_load(&domain->iommu_array, index: iommu->seq_id);
1814 if (--info->refcnt == 0) {
1815 clear_bit(nr: info->did, addr: iommu->domain_ids);
1816 xa_erase(&domain->iommu_array, index: iommu->seq_id);
1817 domain->nid = NUMA_NO_NODE;
1818 domain_update_iommu_cap(domain);
1819 kfree(objp: info);
1820 }
1821 spin_unlock(lock: &iommu->lock);
1822}
1823
1824static int guestwidth_to_adjustwidth(int gaw)
1825{
1826 int agaw;
1827 int r = (gaw - 12) % 9;
1828
1829 if (r == 0)
1830 agaw = gaw;
1831 else
1832 agaw = gaw + 9 - r;
1833 if (agaw > 64)
1834 agaw = 64;
1835 return agaw;
1836}
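/*
 * Worked examples (illustrative): gaw = 39 or 48 already sits on a
 * 12 + 9*n boundary and is returned unchanged; gaw = 40 gives
 * r = (40 - 12) % 9 = 1 and rounds up to 40 + 9 - 1 = 48; any result
 * above 64 is clamped to 64.
 */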
1837
1838static void domain_exit(struct dmar_domain *domain)
1839{
1840 if (domain->pgd) {
1841 LIST_HEAD(freelist);
1842
1843 domain_unmap(domain, start_pfn: 0, DOMAIN_MAX_PFN(domain->gaw), freelist: &freelist);
1844 put_pages_list(pages: &freelist);
1845 }
1846
1847 if (WARN_ON(!list_empty(&domain->devices)))
1848 return;
1849
1850 kfree(objp: domain);
1851}
1852
1853static int domain_context_mapping_one(struct dmar_domain *domain,
1854 struct intel_iommu *iommu,
1855 u8 bus, u8 devfn)
1856{
1857 struct device_domain_info *info =
1858 domain_lookup_dev_info(domain, iommu, bus, devfn);
1859 u16 did = domain_id_iommu(domain, iommu);
1860 int translation = CONTEXT_TT_MULTI_LEVEL;
1861 struct dma_pte *pgd = domain->pgd;
1862 struct context_entry *context;
1863 int agaw, ret;
1864
1865 if (hw_pass_through && domain_type_is_si(domain))
1866 translation = CONTEXT_TT_PASS_THROUGH;
1867
1868 pr_debug("Set context mapping for %02x:%02x.%d\n",
1869 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1870
1871 spin_lock(lock: &iommu->lock);
1872 ret = -ENOMEM;
1873 context = iommu_context_addr(iommu, bus, devfn, alloc: 1);
1874 if (!context)
1875 goto out_unlock;
1876
1877 ret = 0;
1878 if (context_present(context) && !context_copied(iommu, bus, devfn))
1879 goto out_unlock;
1880
1881 /*
1882 * For kdump cases, old valid entries may be cached due to the
1883 * in-flight DMA and copied pgtable, but there is no unmapping
1884 * behaviour for them, thus we need an explicit cache flush for
1885 * the newly-mapped device. For kdump, at this point, the device
1886	 * is supposed to have finished its reset at driver probe time, so
1887	 * no in-flight DMA will exist, and we don't need to worry about
1888	 * it hereafter.
1889 */
1890 if (context_copied(iommu, bus, devfn)) {
1891 u16 did_old = context_domain_id(c: context);
1892
1893 if (did_old < cap_ndoms(iommu->cap)) {
1894 iommu->flush.flush_context(iommu, did_old,
1895 (((u16)bus) << 8) | devfn,
1896 DMA_CCMD_MASK_NOBIT,
1897 DMA_CCMD_DEVICE_INVL);
1898 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1899 DMA_TLB_DSI_FLUSH);
1900 }
1901
1902 clear_context_copied(iommu, bus, devfn);
1903 }
1904
1905 context_clear_entry(context);
1906 context_set_domain_id(context, value: did);
1907
1908 if (translation != CONTEXT_TT_PASS_THROUGH) {
1909 /*
1910	 * Skip the top levels of the page tables for an IOMMU which
1911	 * has a smaller agaw than the domain's. Unnecessary for PT mode.
1912 */
1913 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1914 ret = -ENOMEM;
1915 pgd = phys_to_virt(address: dma_pte_addr(pte: pgd));
1916 if (!dma_pte_present(pte: pgd))
1917 goto out_unlock;
1918 }
1919
1920 if (info && info->ats_supported)
1921 translation = CONTEXT_TT_DEV_IOTLB;
1922 else
1923 translation = CONTEXT_TT_MULTI_LEVEL;
1924
1925 context_set_address_root(context, virt_to_phys(address: pgd));
1926 context_set_address_width(context, value: agaw);
1927 } else {
1928 /*
1929 * In pass through mode, AW must be programmed to
1930 * indicate the largest AGAW value supported by
1931 * hardware. And ASR is ignored by hardware.
1932 */
1933 context_set_address_width(context, value: iommu->msagaw);
1934 }
1935
1936 context_set_translation_type(context, value: translation);
1937 context_set_fault_enable(context);
1938 context_set_present(context);
1939 if (!ecap_coherent(iommu->ecap))
1940 clflush_cache_range(addr: context, size: sizeof(*context));
1941
1942 /*
1943 * It's a non-present to present mapping. If hardware doesn't cache
1944	 * non-present entries, we only need to flush the write-buffer. If it
1945	 * _does_ cache non-present entries, then it does so in the special
1946 * domain #0, which we have to flush:
1947 */
1948 if (cap_caching_mode(iommu->cap)) {
1949 iommu->flush.flush_context(iommu, 0,
1950 (((u16)bus) << 8) | devfn,
1951 DMA_CCMD_MASK_NOBIT,
1952 DMA_CCMD_DEVICE_INVL);
1953 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1954 } else {
1955 iommu_flush_write_buffer(iommu);
1956 }
1957
1958 ret = 0;
1959
1960out_unlock:
1961 spin_unlock(lock: &iommu->lock);
1962
1963 return ret;
1964}
1965
1966static int domain_context_mapping_cb(struct pci_dev *pdev,
1967 u16 alias, void *opaque)
1968{
1969 struct device_domain_info *info = dev_iommu_priv_get(dev: &pdev->dev);
1970 struct intel_iommu *iommu = info->iommu;
1971 struct dmar_domain *domain = opaque;
1972
1973 return domain_context_mapping_one(domain, iommu,
1974 PCI_BUS_NUM(alias), devfn: alias & 0xff);
1975}
1976
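/*
 * Install context entries for @dev and all of its DMA aliases in @domain.
 */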
1977static int
1978domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1979{
1980 struct device_domain_info *info = dev_iommu_priv_get(dev);
1981 struct intel_iommu *iommu = info->iommu;
1982 u8 bus = info->bus, devfn = info->devfn;
1983
1984 if (!dev_is_pci(dev))
1985 return domain_context_mapping_one(domain, iommu, bus, devfn);
1986
1987 return pci_for_each_dma_alias(to_pci_dev(dev),
1988 fn: domain_context_mapping_cb, data: domain);
1989}
1990
1991/* Returns a number of VTD pages, but aligned to MM page size */
1992static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1993{
1994 host_addr &= ~PAGE_MASK;
1995 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1996}
1997
1998/* Return largest possible superpage level for a given mapping */
1999static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
2000 unsigned long phy_pfn, unsigned long pages)
2001{
2002 int support, level = 1;
2003 unsigned long pfnmerge;
2004
2005 support = domain->iommu_superpage;
2006
2007 /* To use a large page, the virtual *and* physical addresses
2008 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2009 of them will mean we have to use smaller pages. So just
2010 merge them and check both at once. */
2011 pfnmerge = iov_pfn | phy_pfn;
2012
2013 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2014 pages >>= VTD_STRIDE_SHIFT;
2015 if (!pages)
2016 break;
2017 pfnmerge >>= VTD_STRIDE_SHIFT;
2018 level++;
2019 support--;
2020 }
2021 return level;
2022}
2023
2024/*
2025 * Ensure that old small page tables are removed to make room for superpage(s).
2026 * We're going to add new large pages, so make sure we don't remove their parent
2027 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2028 */
2029static void switch_to_super_page(struct dmar_domain *domain,
2030 unsigned long start_pfn,
2031 unsigned long end_pfn, int level)
2032{
2033 unsigned long lvl_pages = lvl_to_nr_pages(lvl: level);
2034 struct iommu_domain_info *info;
2035 struct dma_pte *pte = NULL;
2036 unsigned long i;
2037
2038 while (start_pfn <= end_pfn) {
2039 if (!pte)
2040 pte = pfn_to_dma_pte(domain, pfn: start_pfn, target_level: &level,
2041 GFP_ATOMIC);
2042
2043 if (dma_pte_present(pte)) {
2044 dma_pte_free_pagetable(domain, start_pfn,
2045 last_pfn: start_pfn + lvl_pages - 1,
2046 retain_level: level + 1);
2047
2048 xa_for_each(&domain->iommu_array, i, info)
2049 iommu_flush_iotlb_psi(iommu: info->iommu, domain,
2050 pfn: start_pfn, pages: lvl_pages,
2051 ih: 0, map: 0);
2052 if (domain->nested_parent)
2053 parent_domain_flush(domain, pfn: start_pfn,
2054 pages: lvl_pages, ih: 0);
2055 }
2056
2057 pte++;
2058 start_pfn += lvl_pages;
2059 if (first_pte_in_page(pte))
2060 pte = NULL;
2061 }
2062}
2063
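/*
 * Map nr_pages pages starting at phys_pfn to iov_pfn in @domain's page
 * table, using superpages whenever the alignment and the remaining size
 * allow it.
 */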
2064static int
2065__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2066 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2067 gfp_t gfp)
2068{
2069 struct dma_pte *first_pte = NULL, *pte = NULL;
2070 unsigned int largepage_lvl = 0;
2071 unsigned long lvl_pages = 0;
2072 phys_addr_t pteval;
2073 u64 attr;
2074
2075 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2076 return -EINVAL;
2077
2078 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2079 return -EINVAL;
2080
2081 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2082 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2083 return -EINVAL;
2084 }
2085
2086 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2087 attr |= DMA_FL_PTE_PRESENT;
2088 if (domain->use_first_level) {
2089 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2090 if (prot & DMA_PTE_WRITE)
2091 attr |= DMA_FL_PTE_DIRTY;
2092 }
2093
2094 domain->has_mappings = true;
2095
2096 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2097
2098 while (nr_pages > 0) {
2099 uint64_t tmp;
2100
2101 if (!pte) {
2102 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2103 phy_pfn: phys_pfn, pages: nr_pages);
2104
2105 pte = pfn_to_dma_pte(domain, pfn: iov_pfn, target_level: &largepage_lvl,
2106 gfp);
2107 if (!pte)
2108 return -ENOMEM;
2109 first_pte = pte;
2110
2111 lvl_pages = lvl_to_nr_pages(lvl: largepage_lvl);
2112
2113			/* It is a large page */
2114 if (largepage_lvl > 1) {
2115 unsigned long end_pfn;
2116 unsigned long pages_to_remove;
2117
2118 pteval |= DMA_PTE_LARGE_PAGE;
2119 pages_to_remove = min_t(unsigned long, nr_pages,
2120 nr_pte_to_next_page(pte) * lvl_pages);
2121 end_pfn = iov_pfn + pages_to_remove - 1;
2122 switch_to_super_page(domain, start_pfn: iov_pfn, end_pfn, level: largepage_lvl);
2123 } else {
2124 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2125 }
2126
2127 }
2128		/* We don't need a lock here; nobody else
2129		 * touches this IOVA range.
2130 */
2131 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2132 if (tmp) {
2133 static int dumps = 5;
2134 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2135 iov_pfn, tmp, (unsigned long long)pteval);
2136 if (dumps) {
2137 dumps--;
2138 debug_dma_dump_mappings(NULL);
2139 }
2140 WARN_ON(1);
2141 }
2142
2143 nr_pages -= lvl_pages;
2144 iov_pfn += lvl_pages;
2145 phys_pfn += lvl_pages;
2146 pteval += lvl_pages * VTD_PAGE_SIZE;
2147
2148 /* If the next PTE would be the first in a new page, then we
2149 * need to flush the cache on the entries we've just written.
2150 * And then we'll need to recalculate 'pte', so clear it and
2151 * let it get set again in the if (!pte) block above.
2152 *
2153 * If we're done (!nr_pages) we need to flush the cache too.
2154 *
2155 * Also if we've been setting superpages, we may need to
2156 * recalculate 'pte' and switch back to smaller pages for the
2157 * end of the mapping, if the trailing size is not enough to
2158 * use another superpage (i.e. nr_pages < lvl_pages).
2159 */
2160 pte++;
2161 if (!nr_pages || first_pte_in_page(pte) ||
2162 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2163 domain_flush_cache(domain, addr: first_pte,
2164 size: (void *)pte - (void *)first_pte);
2165 pte = NULL;
2166 }
2167 }
2168
2169 return 0;
2170}
2171
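/*
 * Clear the context entry for (bus, devfn) and invalidate the context,
 * IOTLB and device-TLB caches for the old domain ID.
 */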
2172static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2173{
2174 struct intel_iommu *iommu = info->iommu;
2175 struct context_entry *context;
2176 u16 did_old;
2177
2178 spin_lock(lock: &iommu->lock);
2179 context = iommu_context_addr(iommu, bus, devfn, alloc: 0);
2180 if (!context) {
2181 spin_unlock(lock: &iommu->lock);
2182 return;
2183 }
2184
2185 did_old = context_domain_id(c: context);
2186
2187 context_clear_entry(context);
2188 __iommu_flush_cache(iommu, addr: context, size: sizeof(*context));
2189 spin_unlock(lock: &iommu->lock);
2190 iommu->flush.flush_context(iommu,
2191 did_old,
2192 (((u16)bus) << 8) | devfn,
2193 DMA_CCMD_MASK_NOBIT,
2194 DMA_CCMD_DEVICE_INVL);
2195
2196 iommu->flush.flush_iotlb(iommu,
2197 did_old,
2198 0,
2199 0,
2200 DMA_TLB_DSI_FLUSH);
2201
2202 __iommu_flush_dev_iotlb(info, addr: 0, MAX_AGAW_PFN_WIDTH);
2203}
2204
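/*
 * Set up a first-level (scalable-mode) PASID table entry for @dev,
 * skipping page-table levels if the IOMMU supports a smaller agaw than
 * the domain, and selecting 4- or 5-level paging from the result.
 */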
2205static int domain_setup_first_level(struct intel_iommu *iommu,
2206 struct dmar_domain *domain,
2207 struct device *dev,
2208 u32 pasid)
2209{
2210 struct dma_pte *pgd = domain->pgd;
2211 int agaw, level;
2212 int flags = 0;
2213
2214 /*
2215	 * Skip the top levels of the page tables for an IOMMU which
2216	 * has a smaller agaw than the domain's. Unnecessary for PT mode.
2217 */
2218 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2219 pgd = phys_to_virt(address: dma_pte_addr(pte: pgd));
2220 if (!dma_pte_present(pte: pgd))
2221 return -ENOMEM;
2222 }
2223
2224 level = agaw_to_level(agaw);
2225 if (level != 4 && level != 5)
2226 return -EINVAL;
2227
2228 if (level == 5)
2229 flags |= PASID_FLAG_FL5LP;
2230
2231 if (domain->force_snooping)
2232 flags |= PASID_FLAG_PAGE_SNOOP;
2233
2234 return intel_pasid_setup_first_level(iommu, dev, pgd: (pgd_t *)pgd, pasid,
2235 did: domain_id_iommu(domain, iommu),
2236 flags);
2237}
2238
2239static bool dev_is_real_dma_subdevice(struct device *dev)
2240{
2241 return dev && dev_is_pci(dev) &&
2242 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2243}
2244
2245static int iommu_domain_identity_map(struct dmar_domain *domain,
2246 unsigned long first_vpfn,
2247 unsigned long last_vpfn)
2248{
2249 /*
2250	 * The RMRR range might overlap with a physical memory range,
2251	 * so clear it first.
2252 */
2253 dma_pte_clear_range(domain, start_pfn: first_vpfn, last_pfn: last_vpfn);
2254
2255 return __domain_mapping(domain, iov_pfn: first_vpfn,
2256 phys_pfn: first_vpfn, nr_pages: last_vpfn - first_vpfn + 1,
2257 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2258}
2259
2260static int md_domain_init(struct dmar_domain *domain, int guest_width);
2261
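/*
 * Initialize the static identity domain. Unless hardware pass-through is
 * used, identity-map all usable system memory and all RMRR ranges so that
 * devices with RMRRs can share si_domain.
 */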
2262static int __init si_domain_init(int hw)
2263{
2264 struct dmar_rmrr_unit *rmrr;
2265 struct device *dev;
2266 int i, nid, ret;
2267
2268 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2269 if (!si_domain)
2270 return -EFAULT;
2271
2272 if (md_domain_init(domain: si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2273 domain_exit(domain: si_domain);
2274 si_domain = NULL;
2275 return -EFAULT;
2276 }
2277
2278 if (hw)
2279 return 0;
2280
2281 for_each_online_node(nid) {
2282 unsigned long start_pfn, end_pfn;
2283 int i;
2284
2285 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2286 ret = iommu_domain_identity_map(domain: si_domain,
2287 first_vpfn: mm_to_dma_pfn_start(mm_pfn: start_pfn),
2288 last_vpfn: mm_to_dma_pfn_end(mm_pfn: end_pfn));
2289 if (ret)
2290 return ret;
2291 }
2292 }
2293
2294 /*
2295	 * Identity map the RMRRs so that devices with RMRRs can also use
2296 * the si_domain.
2297 */
2298 for_each_rmrr_units(rmrr) {
2299 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2300 i, dev) {
2301 unsigned long long start = rmrr->base_address;
2302 unsigned long long end = rmrr->end_address;
2303
2304 if (WARN_ON(end < start ||
2305 end >> agaw_to_width(si_domain->agaw)))
2306 continue;
2307
2308 ret = iommu_domain_identity_map(domain: si_domain,
2309 first_vpfn: mm_to_dma_pfn_start(mm_pfn: start >> PAGE_SHIFT),
2310 last_vpfn: mm_to_dma_pfn_end(mm_pfn: end >> PAGE_SHIFT));
2311 if (ret)
2312 return ret;
2313 }
2314 }
2315
2316 return 0;
2317}
2318
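/*
 * Attach @dev to @domain: take a reference on the domain ID for the
 * device's IOMMU, add the device to the domain's device list, and program
 * the context or PASID table entry depending on scalable-mode support.
 */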
2319static int dmar_domain_attach_device(struct dmar_domain *domain,
2320 struct device *dev)
2321{
2322 struct device_domain_info *info = dev_iommu_priv_get(dev);
2323 struct intel_iommu *iommu = info->iommu;
2324 unsigned long flags;
2325 int ret;
2326
2327 ret = domain_attach_iommu(domain, iommu);
2328 if (ret)
2329 return ret;
2330 info->domain = domain;
2331 spin_lock_irqsave(&domain->lock, flags);
2332 list_add(new: &info->link, head: &domain->devices);
2333 spin_unlock_irqrestore(lock: &domain->lock, flags);
2334
2335 if (dev_is_real_dma_subdevice(dev))
2336 return 0;
2337
2338 if (!sm_supported(iommu))
2339 ret = domain_context_mapping(domain, dev);
2340 else if (hw_pass_through && domain_type_is_si(domain))
2341 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
2342 else if (domain->use_first_level)
2343 ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
2344 else
2345 ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
2346
2347 if (ret) {
2348 device_block_translation(dev);
2349 return ret;
2350 }
2351
2352 if (sm_supported(info->iommu) || !domain_type_is_si(domain: info->domain))
2353 iommu_enable_pci_caps(info);
2354
2355 return 0;
2356}
2357
2358/**
2359 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2360 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2361 * @dev: device handle
2362 *
2363 * We assume that PCI USB devices with RMRRs have them largely
2364 * for historical reasons and that the RMRR space is not actively used post
2365 * boot. This exclusion may change if vendors begin to abuse it.
2366 *
2367 * The same exception is made for graphics devices, with the requirement that
2368 * any use of the RMRR regions will be torn down before assigning the device
2369 * to a guest.
2370 *
2371 * Return: true if the RMRR is relaxable, false otherwise
2372 */
2373static bool device_rmrr_is_relaxable(struct device *dev)
2374{
2375 struct pci_dev *pdev;
2376
2377 if (!dev_is_pci(dev))
2378 return false;
2379
2380 pdev = to_pci_dev(dev);
2381 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2382 return true;
2383 else
2384 return false;
2385}
2386
2387/*
2388 * Return the required default domain type for a specific device.
2389 *
2390 * @dev: the device in query
2392 *
2393 * Returns:
2394 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2395 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2396 * - 0: both identity and dynamic domains work for this device
2397 */
2398static int device_def_domain_type(struct device *dev)
2399{
2400 if (dev_is_pci(dev)) {
2401 struct pci_dev *pdev = to_pci_dev(dev);
2402
2403 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2404 return IOMMU_DOMAIN_IDENTITY;
2405
2406 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2407 return IOMMU_DOMAIN_IDENTITY;
2408 }
2409
2410 return 0;
2411}
2412
2413static void intel_iommu_init_qi(struct intel_iommu *iommu)
2414{
2415 /*
2416	 * Start from a sane IOMMU hardware state.
2417	 * If queued invalidation was already initialized by us
2418	 * (for example, while enabling interrupt remapping), then
2419	 * things are already rolling from a sane state.
2420 */
2421 if (!iommu->qi) {
2422 /*
2423 * Clear any previous faults.
2424 */
2425 dmar_fault(irq: -1, dev_id: iommu);
2426 /*
2427 * Disable queued invalidation if supported and already enabled
2428 * before OS handover.
2429 */
2430 dmar_disable_qi(iommu);
2431 }
2432
2433 if (dmar_enable_qi(iommu)) {
2434 /*
2435 * Queued Invalidate not enabled, use Register Based Invalidate
2436 */
2437 iommu->flush.flush_context = __iommu_flush_context;
2438 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2439 pr_info("%s: Using Register based invalidation\n",
2440 iommu->name);
2441 } else {
2442 iommu->flush.flush_context = qi_flush_context;
2443 iommu->flush.flush_iotlb = qi_flush_iotlb;
2444 pr_info("%s: Using Queued invalidation\n", iommu->name);
2445 }
2446}
2447
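/*
 * Copy one bus's context entries from the previous kernel's tables
 * (kdump case), marking each present entry as copied and reserving its
 * domain ID.
 */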
2448static int copy_context_table(struct intel_iommu *iommu,
2449 struct root_entry *old_re,
2450 struct context_entry **tbl,
2451 int bus, bool ext)
2452{
2453 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2454 struct context_entry *new_ce = NULL, ce;
2455 struct context_entry *old_ce = NULL;
2456 struct root_entry re;
2457 phys_addr_t old_ce_phys;
2458
2459 tbl_idx = ext ? bus * 2 : bus;
2460 memcpy(&re, old_re, sizeof(re));
2461
2462 for (devfn = 0; devfn < 256; devfn++) {
2463 /* First calculate the correct index */
2464 idx = (ext ? devfn * 2 : devfn) % 256;
2465
2466 if (idx == 0) {
2467 /* First save what we may have and clean up */
2468 if (new_ce) {
2469 tbl[tbl_idx] = new_ce;
2470 __iommu_flush_cache(iommu, addr: new_ce,
2471 VTD_PAGE_SIZE);
2472 pos = 1;
2473 }
2474
2475 if (old_ce)
2476 memunmap(addr: old_ce);
2477
2478 ret = 0;
2479 if (devfn < 0x80)
2480 old_ce_phys = root_entry_lctp(re: &re);
2481 else
2482 old_ce_phys = root_entry_uctp(re: &re);
2483
2484 if (!old_ce_phys) {
2485 if (ext && devfn == 0) {
2486 /* No LCTP, try UCTP */
2487 devfn = 0x7f;
2488 continue;
2489 } else {
2490 goto out;
2491 }
2492 }
2493
2494 ret = -ENOMEM;
2495 old_ce = memremap(offset: old_ce_phys, PAGE_SIZE,
2496 flags: MEMREMAP_WB);
2497 if (!old_ce)
2498 goto out;
2499
2500 new_ce = alloc_pgtable_page(node: iommu->node, GFP_KERNEL);
2501 if (!new_ce)
2502 goto out_unmap;
2503
2504 ret = 0;
2505 }
2506
2507 /* Now copy the context entry */
2508 memcpy(&ce, old_ce + idx, sizeof(ce));
2509
2510 if (!context_present(context: &ce))
2511 continue;
2512
2513 did = context_domain_id(c: &ce);
2514 if (did >= 0 && did < cap_ndoms(iommu->cap))
2515 set_bit(nr: did, addr: iommu->domain_ids);
2516
2517 set_context_copied(iommu, bus, devfn);
2518 new_ce[idx] = ce;
2519 }
2520
2521 tbl[tbl_idx + pos] = new_ce;
2522
2523 __iommu_flush_cache(iommu, addr: new_ce, VTD_PAGE_SIZE);
2524
2525out_unmap:
2526 memunmap(addr: old_ce);
2527
2528out:
2529 return ret;
2530}
2531
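/*
 * In a kdump kernel, copy the root and context tables programmed by the
 * previous kernel so that translations remain valid for any in-flight DMA
 * until the devices are reset by their drivers. Bails out if the old and
 * new tables disagree on scalable (extended) mode.
 */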
2532static int copy_translation_tables(struct intel_iommu *iommu)
2533{
2534 struct context_entry **ctxt_tbls;
2535 struct root_entry *old_rt;
2536 phys_addr_t old_rt_phys;
2537 int ctxt_table_entries;
2538 u64 rtaddr_reg;
2539 int bus, ret;
2540 bool new_ext, ext;
2541
2542 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2543 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2544 new_ext = !!sm_supported(iommu);
2545
2546 /*
2547 * The RTT bit can only be changed when translation is disabled,
2548	 * but disabling translation would open a window for data
2549 * corruption. So bail out and don't copy anything if we would
2550 * have to change the bit.
2551 */
2552 if (new_ext != ext)
2553 return -EINVAL;
2554
2555 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2556 if (!iommu->copied_tables)
2557 return -ENOMEM;
2558
2559 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2560 if (!old_rt_phys)
2561 return -EINVAL;
2562
2563 old_rt = memremap(offset: old_rt_phys, PAGE_SIZE, flags: MEMREMAP_WB);
2564 if (!old_rt)
2565 return -ENOMEM;
2566
2567 /* This is too big for the stack - allocate it from slab */
2568 ctxt_table_entries = ext ? 512 : 256;
2569 ret = -ENOMEM;
2570 ctxt_tbls = kcalloc(n: ctxt_table_entries, size: sizeof(void *), GFP_KERNEL);
2571 if (!ctxt_tbls)
2572 goto out_unmap;
2573
2574 for (bus = 0; bus < 256; bus++) {
2575 ret = copy_context_table(iommu, old_re: &old_rt[bus],
2576 tbl: ctxt_tbls, bus, ext);
2577 if (ret) {
2578 pr_err("%s: Failed to copy context table for bus %d\n",
2579 iommu->name, bus);
2580 continue;
2581 }
2582 }
2583
2584 spin_lock(lock: &iommu->lock);
2585
2586 /* Context tables are copied, now write them to the root_entry table */
2587 for (bus = 0; bus < 256; bus++) {
2588 int idx = ext ? bus * 2 : bus;
2589 u64 val;
2590
2591 if (ctxt_tbls[idx]) {
2592 val = virt_to_phys(address: ctxt_tbls[idx]) | 1;
2593 iommu->root_entry[bus].lo = val;
2594 }
2595
2596 if (!ext || !ctxt_tbls[idx + 1])
2597 continue;
2598
2599 val = virt_to_phys(address: ctxt_tbls[idx + 1]) | 1;
2600 iommu->root_entry[bus].hi = val;
2601 }
2602
2603 spin_unlock(lock: &iommu->lock);
2604
2605 kfree(objp: ctxt_tbls);
2606
2607 __iommu_flush_cache(iommu, addr: iommu->root_entry, PAGE_SIZE);
2608
2609 ret = 0;
2610
2611out_unmap:
2612 memunmap(addr: old_rt);
2613
2614 return ret;
2615}
2616
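/*
 * One-time initialization of all DMAR units: audit capabilities, set up
 * queued invalidation, domain IDs and root entries per IOMMU, optionally
 * copy translation tables from the previous kernel (kdump), build the
 * identity domain and enable fault/page-request interrupts.
 */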
2617static int __init init_dmars(void)
2618{
2619 struct dmar_drhd_unit *drhd;
2620 struct intel_iommu *iommu;
2621 int ret;
2622
2623 ret = intel_cap_audit(type: CAP_AUDIT_STATIC_DMAR, NULL);
2624 if (ret)
2625 goto free_iommu;
2626
2627 for_each_iommu(iommu, drhd) {
2628 if (drhd->ignored) {
2629 iommu_disable_translation(iommu);
2630 continue;
2631 }
2632
2633 /*
2634		 * Find the max PASID size of all IOMMUs in the system.
2635		 * We need to ensure the system PASID table is no bigger
2636		 * than the smallest supported size.
2637 */
2638 if (pasid_supported(iommu)) {
2639 u32 temp = 2 << ecap_pss(iommu->ecap);
2640
2641 intel_pasid_max_id = min_t(u32, temp,
2642 intel_pasid_max_id);
2643 }
2644
2645 intel_iommu_init_qi(iommu);
2646
2647 ret = iommu_init_domains(iommu);
2648 if (ret)
2649 goto free_iommu;
2650
2651 init_translation_status(iommu);
2652
2653 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2654 iommu_disable_translation(iommu);
2655 clear_translation_pre_enabled(iommu);
2656 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2657 iommu->name);
2658 }
2659
2660 /*
2661 * TBD:
2662		 * we could share the same root & context tables
2663		 * among all IOMMUs. This needs to be split out later.
2664 */
2665 ret = iommu_alloc_root_entry(iommu);
2666 if (ret)
2667 goto free_iommu;
2668
2669 if (translation_pre_enabled(iommu)) {
2670 pr_info("Translation already enabled - trying to copy translation structures\n");
2671
2672 ret = copy_translation_tables(iommu);
2673 if (ret) {
2674 /*
2675 * We found the IOMMU with translation
2676 * enabled - but failed to copy over the
2677 * old root-entry table. Try to proceed
2678 * by disabling translation now and
2679 * allocating a clean root-entry table.
2680 * This might cause DMAR faults, but
2681 * probably the dump will still succeed.
2682 */
2683 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2684 iommu->name);
2685 iommu_disable_translation(iommu);
2686 clear_translation_pre_enabled(iommu);
2687 } else {
2688 pr_info("Copied translation tables from previous kernel for %s\n",
2689 iommu->name);
2690 }
2691 }
2692
2693 if (!ecap_pass_through(iommu->ecap))
2694 hw_pass_through = 0;
2695 intel_svm_check(iommu);
2696 }
2697
2698 /*
2699 * Now that qi is enabled on all iommus, set the root entry and flush
2700 * caches. This is required on some Intel X58 chipsets, otherwise the
2701 * flush_context function will loop forever and the boot hangs.
2702 */
2703 for_each_active_iommu(iommu, drhd) {
2704 iommu_flush_write_buffer(iommu);
2705 iommu_set_root_entry(iommu);
2706 }
2707
2708 if (!dmar_map_gfx)
2709 iommu_identity_mapping |= IDENTMAP_GFX;
2710
2711 check_tylersburg_isoch();
2712
2713 ret = si_domain_init(hw: hw_pass_through);
2714 if (ret)
2715 goto free_iommu;
2716
2717 /*
2718 * for each drhd
2719 * enable fault log
2720 * global invalidate context cache
2721 * global invalidate iotlb
2722 * enable translation
2723 */
2724 for_each_iommu(iommu, drhd) {
2725 if (drhd->ignored) {
2726 /*
2727 * we always have to disable PMRs or DMA may fail on
2728 * this device
2729 */
2730 if (force_on)
2731 iommu_disable_protect_mem_regions(iommu);
2732 continue;
2733 }
2734
2735 iommu_flush_write_buffer(iommu);
2736
2737#ifdef CONFIG_INTEL_IOMMU_SVM
2738 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2739 /*
2740			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2741			 * could cause a lock race condition, so drop the lock here.
2742 */
2743 up_write(sem: &dmar_global_lock);
2744 ret = intel_svm_enable_prq(iommu);
2745 down_write(sem: &dmar_global_lock);
2746 if (ret)
2747 goto free_iommu;
2748 }
2749#endif
2750 ret = dmar_set_interrupt(iommu);
2751 if (ret)
2752 goto free_iommu;
2753 }
2754
2755 return 0;
2756
2757free_iommu:
2758 for_each_active_iommu(iommu, drhd) {
2759 disable_dmar_iommu(iommu);
2760 free_dmar_iommu(iommu);
2761 }
2762 if (si_domain) {
2763 domain_exit(domain: si_domain);
2764 si_domain = NULL;
2765 }
2766
2767 return ret;
2768}
2769
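/*
 * Mark DMAR units as ignored when they have no devices in scope, or when
 * they cover only graphics devices and dmar_map_gfx is disabled.
 */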
2770static void __init init_no_remapping_devices(void)
2771{
2772 struct dmar_drhd_unit *drhd;
2773 struct device *dev;
2774 int i;
2775
2776 for_each_drhd_unit(drhd) {
2777 if (!drhd->include_all) {
2778 for_each_active_dev_scope(drhd->devices,
2779 drhd->devices_cnt, i, dev)
2780 break;
2781 /* ignore DMAR unit if no devices exist */
2782 if (i == drhd->devices_cnt)
2783 drhd->ignored = 1;
2784 }
2785 }
2786
2787 for_each_active_drhd_unit(drhd) {
2788 if (drhd->include_all)
2789 continue;
2790
2791 for_each_active_dev_scope(drhd->devices,
2792 drhd->devices_cnt, i, dev)
2793 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2794 break;
2795 if (i < drhd->devices_cnt)
2796 continue;
2797
2798 /* This IOMMU has *only* gfx devices. Either bypass it or
2799		   set the gfx_dedicated flag, as appropriate */
2800 drhd->gfx_dedicated = 1;
2801 if (!dmar_map_gfx)
2802 drhd->ignored = 1;
2803 }
2804}
2805
2806#ifdef CONFIG_SUSPEND
2807static int init_iommu_hw(void)
2808{
2809 struct dmar_drhd_unit *drhd;
2810 struct intel_iommu *iommu = NULL;
2811 int ret;
2812
2813 for_each_active_iommu(iommu, drhd) {
2814 if (iommu->qi) {
2815 ret = dmar_reenable_qi(iommu);
2816 if (ret)
2817 return ret;
2818 }
2819 }
2820
2821 for_each_iommu(iommu, drhd) {
2822 if (drhd->ignored) {
2823 /*
2824 * we always have to disable PMRs or DMA may fail on
2825 * this device
2826 */
2827 if (force_on)
2828 iommu_disable_protect_mem_regions(iommu);
2829 continue;
2830 }
2831
2832 iommu_flush_write_buffer(iommu);
2833 iommu_set_root_entry(iommu);
2834 iommu_enable_translation(iommu);
2835 iommu_disable_protect_mem_regions(iommu);
2836 }
2837
2838 return 0;
2839}
2840
2841static void iommu_flush_all(void)
2842{
2843 struct dmar_drhd_unit *drhd;
2844 struct intel_iommu *iommu;
2845
2846 for_each_active_iommu(iommu, drhd) {
2847 iommu->flush.flush_context(iommu, 0, 0, 0,
2848 DMA_CCMD_GLOBAL_INVL);
2849 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2850 DMA_TLB_GLOBAL_FLUSH);
2851 }
2852}
2853
2854static int iommu_suspend(void)
2855{
2856 struct dmar_drhd_unit *drhd;
2857 struct intel_iommu *iommu = NULL;
2858 unsigned long flag;
2859
2860 iommu_flush_all();
2861
2862 for_each_active_iommu(iommu, drhd) {
2863 iommu_disable_translation(iommu);
2864
2865 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2866
2867 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2868 readl(addr: iommu->reg + DMAR_FECTL_REG);
2869 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2870 readl(addr: iommu->reg + DMAR_FEDATA_REG);
2871 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2872 readl(addr: iommu->reg + DMAR_FEADDR_REG);
2873 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2874 readl(addr: iommu->reg + DMAR_FEUADDR_REG);
2875
2876 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2877 }
2878 return 0;
2879}
2880
2881static void iommu_resume(void)
2882{
2883 struct dmar_drhd_unit *drhd;
2884 struct intel_iommu *iommu = NULL;
2885 unsigned long flag;
2886
2887 if (init_iommu_hw()) {
2888 if (force_on)
2889 panic(fmt: "tboot: IOMMU setup failed, DMAR can not resume!\n");
2890 else
2891 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2892 return;
2893 }
2894
2895 for_each_active_iommu(iommu, drhd) {
2896
2897 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2898
2899 writel(val: iommu->iommu_state[SR_DMAR_FECTL_REG],
2900 addr: iommu->reg + DMAR_FECTL_REG);
2901 writel(val: iommu->iommu_state[SR_DMAR_FEDATA_REG],
2902 addr: iommu->reg + DMAR_FEDATA_REG);
2903 writel(val: iommu->iommu_state[SR_DMAR_FEADDR_REG],
2904 addr: iommu->reg + DMAR_FEADDR_REG);
2905 writel(val: iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2906 addr: iommu->reg + DMAR_FEUADDR_REG);
2907
2908 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2909 }
2910}
2911
2912static struct syscore_ops iommu_syscore_ops = {
2913 .resume = iommu_resume,
2914 .suspend = iommu_suspend,
2915};
2916
2917static void __init init_iommu_pm_ops(void)
2918{
2919 register_syscore_ops(ops: &iommu_syscore_ops);
2920}
2921
2922#else
2923static inline void init_iommu_pm_ops(void) {}
2924#endif /* CONFIG_SUSPEND */
2925
2926static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2927{
2928 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2929 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2930 rmrr->end_address <= rmrr->base_address ||
2931 arch_rmrr_sanity_check(rmrr))
2932 return -EINVAL;
2933
2934 return 0;
2935}
2936
2937int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2938{
2939 struct acpi_dmar_reserved_memory *rmrr;
2940 struct dmar_rmrr_unit *rmrru;
2941
2942 rmrr = (struct acpi_dmar_reserved_memory *)header;
2943 if (rmrr_sanity_check(rmrr)) {
2944 pr_warn(FW_BUG
2945 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2946 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2947 rmrr->base_address, rmrr->end_address,
2948 dmi_get_system_info(DMI_BIOS_VENDOR),
2949 dmi_get_system_info(DMI_BIOS_VERSION),
2950 dmi_get_system_info(DMI_PRODUCT_VERSION));
2951 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2952 }
2953
2954 rmrru = kzalloc(size: sizeof(*rmrru), GFP_KERNEL);
2955 if (!rmrru)
2956 goto out;
2957
2958 rmrru->hdr = header;
2959
2960 rmrru->base_address = rmrr->base_address;
2961 rmrru->end_address = rmrr->end_address;
2962
2963 rmrru->devices = dmar_alloc_dev_scope(start: (void *)(rmrr + 1),
2964 end: ((void *)rmrr) + rmrr->header.length,
2965 cnt: &rmrru->devices_cnt);
2966 if (rmrru->devices_cnt && rmrru->devices == NULL)
2967 goto free_rmrru;
2968
2969 list_add(new: &rmrru->list, head: &dmar_rmrr_units);
2970
2971 return 0;
2972free_rmrru:
2973 kfree(objp: rmrru);
2974out:
2975 return -ENOMEM;
2976}
2977
2978static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2979{
2980 struct dmar_atsr_unit *atsru;
2981 struct acpi_dmar_atsr *tmp;
2982
2983 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2984 dmar_rcu_check()) {
2985 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2986 if (atsr->segment != tmp->segment)
2987 continue;
2988 if (atsr->header.length != tmp->header.length)
2989 continue;
2990 if (memcmp(p: atsr, q: tmp, size: atsr->header.length) == 0)
2991 return atsru;
2992 }
2993
2994 return NULL;
2995}
2996
2997int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2998{
2999 struct acpi_dmar_atsr *atsr;
3000 struct dmar_atsr_unit *atsru;
3001
3002 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3003 return 0;
3004
3005 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3006 atsru = dmar_find_atsr(atsr);
3007 if (atsru)
3008 return 0;
3009
3010 atsru = kzalloc(size: sizeof(*atsru) + hdr->length, GFP_KERNEL);
3011 if (!atsru)
3012 return -ENOMEM;
3013
3014 /*
3015 * If memory is allocated from slab by ACPI _DSM method, we need to
3016 * copy the memory content because the memory buffer will be freed
3017 * on return.
3018 */
3019 atsru->hdr = (void *)(atsru + 1);
3020 memcpy(atsru->hdr, hdr, hdr->length);
3021 atsru->include_all = atsr->flags & 0x1;
3022 if (!atsru->include_all) {
3023 atsru->devices = dmar_alloc_dev_scope(start: (void *)(atsr + 1),
3024 end: (void *)atsr + atsr->header.length,
3025 cnt: &atsru->devices_cnt);
3026 if (atsru->devices_cnt && atsru->devices == NULL) {
3027 kfree(objp: atsru);
3028 return -ENOMEM;
3029 }
3030 }
3031
3032 list_add_rcu(new: &atsru->list, head: &dmar_atsr_units);
3033
3034 return 0;
3035}
3036
3037static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3038{
3039 dmar_free_dev_scope(devices: &atsru->devices, cnt: &atsru->devices_cnt);
3040 kfree(objp: atsru);
3041}
3042
3043int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3044{
3045 struct acpi_dmar_atsr *atsr;
3046 struct dmar_atsr_unit *atsru;
3047
3048 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3049 atsru = dmar_find_atsr(atsr);
3050 if (atsru) {
3051 list_del_rcu(entry: &atsru->list);
3052 synchronize_rcu();
3053 intel_iommu_free_atsr(atsru);
3054 }
3055
3056 return 0;
3057}
3058
3059int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3060{
3061 int i;
3062 struct device *dev;
3063 struct acpi_dmar_atsr *atsr;
3064 struct dmar_atsr_unit *atsru;
3065
3066 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3067 atsru = dmar_find_atsr(atsr);
3068 if (!atsru)
3069 return 0;
3070
3071 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3072 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3073 i, dev)
3074 return -EBUSY;
3075 }
3076
3077 return 0;
3078}
3079
3080static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3081{
3082 struct dmar_satc_unit *satcu;
3083 struct acpi_dmar_satc *tmp;
3084
3085 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3086 dmar_rcu_check()) {
3087 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3088 if (satc->segment != tmp->segment)
3089 continue;
3090 if (satc->header.length != tmp->header.length)
3091 continue;
3092 if (memcmp(p: satc, q: tmp, size: satc->header.length) == 0)
3093 return satcu;
3094 }
3095
3096 return NULL;
3097}
3098
3099int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3100{
3101 struct acpi_dmar_satc *satc;
3102 struct dmar_satc_unit *satcu;
3103
3104 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3105 return 0;
3106
3107 satc = container_of(hdr, struct acpi_dmar_satc, header);
3108 satcu = dmar_find_satc(satc);
3109 if (satcu)
3110 return 0;
3111
3112 satcu = kzalloc(size: sizeof(*satcu) + hdr->length, GFP_KERNEL);
3113 if (!satcu)
3114 return -ENOMEM;
3115
3116 satcu->hdr = (void *)(satcu + 1);
3117 memcpy(satcu->hdr, hdr, hdr->length);
3118 satcu->atc_required = satc->flags & 0x1;
3119 satcu->devices = dmar_alloc_dev_scope(start: (void *)(satc + 1),
3120 end: (void *)satc + satc->header.length,
3121 cnt: &satcu->devices_cnt);
3122 if (satcu->devices_cnt && !satcu->devices) {
3123 kfree(objp: satcu);
3124 return -ENOMEM;
3125 }
3126 list_add_rcu(new: &satcu->list, head: &dmar_satc_units);
3127
3128 return 0;
3129}
3130
3131static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3132{
3133 int sp, ret;
3134 struct intel_iommu *iommu = dmaru->iommu;
3135
3136 ret = intel_cap_audit(type: CAP_AUDIT_HOTPLUG_DMAR, iommu);
3137 if (ret)
3138 goto out;
3139
3140 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3141 pr_warn("%s: Doesn't support hardware pass through.\n",
3142 iommu->name);
3143 return -ENXIO;
3144 }
3145
3146 sp = domain_update_iommu_superpage(NULL, skip: iommu) - 1;
3147 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3148 pr_warn("%s: Doesn't support large page.\n",
3149 iommu->name);
3150 return -ENXIO;
3151 }
3152
3153 /*
3154 * Disable translation if already enabled prior to OS handover.
3155 */
3156 if (iommu->gcmd & DMA_GCMD_TE)
3157 iommu_disable_translation(iommu);
3158
3159 ret = iommu_init_domains(iommu);
3160 if (ret == 0)
3161 ret = iommu_alloc_root_entry(iommu);
3162 if (ret)
3163 goto out;
3164
3165 intel_svm_check(iommu);
3166
3167 if (dmaru->ignored) {
3168 /*
3169 * we always have to disable PMRs or DMA may fail on this device
3170 */
3171 if (force_on)
3172 iommu_disable_protect_mem_regions(iommu);
3173 return 0;
3174 }
3175
3176 intel_iommu_init_qi(iommu);
3177 iommu_flush_write_buffer(iommu);
3178
3179#ifdef CONFIG_INTEL_IOMMU_SVM
3180 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3181 ret = intel_svm_enable_prq(iommu);
3182 if (ret)
3183 goto disable_iommu;
3184 }
3185#endif
3186 ret = dmar_set_interrupt(iommu);
3187 if (ret)
3188 goto disable_iommu;
3189
3190 iommu_set_root_entry(iommu);
3191 iommu_enable_translation(iommu);
3192
3193 iommu_disable_protect_mem_regions(iommu);
3194 return 0;
3195
3196disable_iommu:
3197 disable_dmar_iommu(iommu);
3198out:
3199 free_dmar_iommu(iommu);
3200 return ret;
3201}
3202
3203int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3204{
3205 int ret = 0;
3206 struct intel_iommu *iommu = dmaru->iommu;
3207
3208 if (!intel_iommu_enabled)
3209 return 0;
3210 if (iommu == NULL)
3211 return -EINVAL;
3212
3213 if (insert) {
3214 ret = intel_iommu_add(dmaru);
3215 } else {
3216 disable_dmar_iommu(iommu);
3217 free_dmar_iommu(iommu);
3218 }
3219
3220 return ret;
3221}
3222
3223static void intel_iommu_free_dmars(void)
3224{
3225 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3226 struct dmar_atsr_unit *atsru, *atsr_n;
3227 struct dmar_satc_unit *satcu, *satc_n;
3228
3229 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3230 list_del(entry: &rmrru->list);
3231 dmar_free_dev_scope(devices: &rmrru->devices, cnt: &rmrru->devices_cnt);
3232 kfree(objp: rmrru);
3233 }
3234
3235 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3236 list_del(entry: &atsru->list);
3237 intel_iommu_free_atsr(atsru);
3238 }
3239 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3240 list_del(entry: &satcu->list);
3241 dmar_free_dev_scope(devices: &satcu->devices, cnt: &satcu->devices_cnt);
3242 kfree(objp: satcu);
3243 }
3244}
3245
3246static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3247{
3248 struct dmar_satc_unit *satcu;
3249 struct acpi_dmar_satc *satc;
3250 struct device *tmp;
3251 int i;
3252
3253 dev = pci_physfn(dev);
3254 rcu_read_lock();
3255
3256 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3257 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3258 if (satc->segment != pci_domain_nr(bus: dev->bus))
3259 continue;
3260 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3261 if (to_pci_dev(tmp) == dev)
3262 goto out;
3263 }
3264 satcu = NULL;
3265out:
3266 rcu_read_unlock();
3267 return satcu;
3268}
3269
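/*
 * Decide whether ATS may be enabled for @dev: consult the SATC table
 * first, otherwise walk up to the root port and match it against the
 * ATSR units of the device's segment.
 */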
3270static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3271{
3272 int i, ret = 1;
3273 struct pci_bus *bus;
3274 struct pci_dev *bridge = NULL;
3275 struct device *tmp;
3276 struct acpi_dmar_atsr *atsr;
3277 struct dmar_atsr_unit *atsru;
3278 struct dmar_satc_unit *satcu;
3279
3280 dev = pci_physfn(dev);
3281 satcu = dmar_find_matched_satc_unit(dev);
3282 if (satcu)
3283 /*
3284		 * This device supports ATS because it is listed in the SATC table.
3285		 * When the IOMMU is in legacy mode, the hardware enables ATS
3286		 * automatically for a device that requires it, so the OS should
3287		 * not enable ATS on such a device, to avoid duplicated TLB
3288		 * invalidations.
3289 */
3290 return !(satcu->atc_required && !sm_supported(iommu));
3291
3292 for (bus = dev->bus; bus; bus = bus->parent) {
3293 bridge = bus->self;
3294 /* If it's an integrated device, allow ATS */
3295 if (!bridge)
3296 return 1;
3297 /* Connected via non-PCIe: no ATS */
3298 if (!pci_is_pcie(dev: bridge) ||
3299 pci_pcie_type(dev: bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3300 return 0;
3301 /* If we found the root port, look it up in the ATSR */
3302 if (pci_pcie_type(dev: bridge) == PCI_EXP_TYPE_ROOT_PORT)
3303 break;
3304 }
3305
3306 rcu_read_lock();
3307 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3308 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3309 if (atsr->segment != pci_domain_nr(bus: dev->bus))
3310 continue;
3311
3312 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3313 if (tmp == &bridge->dev)
3314 goto out;
3315
3316 if (atsru->include_all)
3317 goto out;
3318 }
3319 ret = 0;
3320out:
3321 rcu_read_unlock();
3322
3323 return ret;
3324}
3325
3326int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3327{
3328 int ret;
3329 struct dmar_rmrr_unit *rmrru;
3330 struct dmar_atsr_unit *atsru;
3331 struct dmar_satc_unit *satcu;
3332 struct acpi_dmar_atsr *atsr;
3333 struct acpi_dmar_reserved_memory *rmrr;
3334 struct acpi_dmar_satc *satc;
3335
3336 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3337 return 0;
3338
3339 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3340 rmrr = container_of(rmrru->hdr,
3341 struct acpi_dmar_reserved_memory, header);
3342 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3343 ret = dmar_insert_dev_scope(info, start: (void *)(rmrr + 1),
3344 end: ((void *)rmrr) + rmrr->header.length,
3345 segment: rmrr->segment, devices: rmrru->devices,
3346 devices_cnt: rmrru->devices_cnt);
3347 if (ret < 0)
3348 return ret;
3349 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3350 dmar_remove_dev_scope(info, segment: rmrr->segment,
3351 devices: rmrru->devices, count: rmrru->devices_cnt);
3352 }
3353 }
3354
3355 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3356 if (atsru->include_all)
3357 continue;
3358
3359 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3360 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3361 ret = dmar_insert_dev_scope(info, start: (void *)(atsr + 1),
3362 end: (void *)atsr + atsr->header.length,
3363 segment: atsr->segment, devices: atsru->devices,
3364 devices_cnt: atsru->devices_cnt);
3365 if (ret > 0)
3366 break;
3367 else if (ret < 0)
3368 return ret;
3369 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3370 if (dmar_remove_dev_scope(info, segment: atsr->segment,
3371 devices: atsru->devices, count: atsru->devices_cnt))
3372 break;
3373 }
3374 }
3375 list_for_each_entry(satcu, &dmar_satc_units, list) {
3376 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3377 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3378 ret = dmar_insert_dev_scope(info, start: (void *)(satc + 1),
3379 end: (void *)satc + satc->header.length,
3380 segment: satc->segment, devices: satcu->devices,
3381 devices_cnt: satcu->devices_cnt);
3382 if (ret > 0)
3383 break;
3384 else if (ret < 0)
3385 return ret;
3386 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3387 if (dmar_remove_dev_scope(info, segment: satc->segment,
3388 devices: satcu->devices, count: satcu->devices_cnt))
3389 break;
3390 }
3391 }
3392
3393 return 0;
3394}
3395
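/*
 * Keep the identity map in sync with memory hotplug: map ranges that are
 * going online into si_domain, and unmap (and flush) ranges that have
 * gone offline.
 */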
3396static int intel_iommu_memory_notifier(struct notifier_block *nb,
3397 unsigned long val, void *v)
3398{
3399 struct memory_notify *mhp = v;
3400 unsigned long start_vpfn = mm_to_dma_pfn_start(mm_pfn: mhp->start_pfn);
3401 unsigned long last_vpfn = mm_to_dma_pfn_end(mm_pfn: mhp->start_pfn +
3402 mhp->nr_pages - 1);
3403
3404 switch (val) {
3405 case MEM_GOING_ONLINE:
3406 if (iommu_domain_identity_map(domain: si_domain,
3407 first_vpfn: start_vpfn, last_vpfn)) {
3408 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3409 start_vpfn, last_vpfn);
3410 return NOTIFY_BAD;
3411 }
3412 break;
3413
3414 case MEM_OFFLINE:
3415 case MEM_CANCEL_ONLINE:
3416 {
3417 struct dmar_drhd_unit *drhd;
3418 struct intel_iommu *iommu;
3419 LIST_HEAD(freelist);
3420
3421 domain_unmap(domain: si_domain, start_pfn: start_vpfn, last_pfn: last_vpfn, freelist: &freelist);
3422
3423 rcu_read_lock();
3424 for_each_active_iommu(iommu, drhd)
3425 iommu_flush_iotlb_psi(iommu, domain: si_domain,
3426 pfn: start_vpfn, pages: mhp->nr_pages,
3427 ih: list_empty(head: &freelist), map: 0);
3428 rcu_read_unlock();
3429 put_pages_list(pages: &freelist);
3430 }
3431 break;
3432 }
3433
3434 return NOTIFY_OK;
3435}
3436
3437static struct notifier_block intel_iommu_memory_nb = {
3438 .notifier_call = intel_iommu_memory_notifier,
3439 .priority = 0
3440};
3441
3442static void intel_disable_iommus(void)
3443{
3444 struct intel_iommu *iommu = NULL;
3445 struct dmar_drhd_unit *drhd;
3446
3447 for_each_iommu(iommu, drhd)
3448 iommu_disable_translation(iommu);
3449}
3450
3451void intel_iommu_shutdown(void)
3452{
3453 struct dmar_drhd_unit *drhd;
3454 struct intel_iommu *iommu = NULL;
3455
3456 if (no_iommu || dmar_disabled)
3457 return;
3458
3459 down_write(sem: &dmar_global_lock);
3460
3461 /* Disable PMRs explicitly here. */
3462 for_each_iommu(iommu, drhd)
3463 iommu_disable_protect_mem_regions(iommu);
3464
3465 /* Make sure the IOMMUs are switched off */
3466 intel_disable_iommus();
3467
3468 up_write(sem: &dmar_global_lock);
3469}
3470
3471static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3472{
3473 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3474
3475 return container_of(iommu_dev, struct intel_iommu, iommu);
3476}
3477
3478static ssize_t version_show(struct device *dev,
3479 struct device_attribute *attr, char *buf)
3480{
3481 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3482 u32 ver = readl(addr: iommu->reg + DMAR_VER_REG);
3483 return sysfs_emit(buf, fmt: "%d:%d\n",
3484 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3485}
3486static DEVICE_ATTR_RO(version);
3487
3488static ssize_t address_show(struct device *dev,
3489 struct device_attribute *attr, char *buf)
3490{
3491 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3492 return sysfs_emit(buf, fmt: "%llx\n", iommu->reg_phys);
3493}
3494static DEVICE_ATTR_RO(address);
3495
3496static ssize_t cap_show(struct device *dev,
3497 struct device_attribute *attr, char *buf)
3498{
3499 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3500 return sysfs_emit(buf, fmt: "%llx\n", iommu->cap);
3501}
3502static DEVICE_ATTR_RO(cap);
3503
3504static ssize_t ecap_show(struct device *dev,
3505 struct device_attribute *attr, char *buf)
3506{
3507 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3508 return sysfs_emit(buf, fmt: "%llx\n", iommu->ecap);
3509}
3510static DEVICE_ATTR_RO(ecap);
3511
3512static ssize_t domains_supported_show(struct device *dev,
3513 struct device_attribute *attr, char *buf)
3514{
3515 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3516 return sysfs_emit(buf, fmt: "%ld\n", cap_ndoms(iommu->cap));
3517}
3518static DEVICE_ATTR_RO(domains_supported);
3519
3520static ssize_t domains_used_show(struct device *dev,
3521 struct device_attribute *attr, char *buf)
3522{
3523 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3524 return sysfs_emit(buf, fmt: "%d\n",
3525 bitmap_weight(src: iommu->domain_ids,
3526 cap_ndoms(iommu->cap)));
3527}
3528static DEVICE_ATTR_RO(domains_used);
3529
3530static struct attribute *intel_iommu_attrs[] = {
3531 &dev_attr_version.attr,
3532 &dev_attr_address.attr,
3533 &dev_attr_cap.attr,
3534 &dev_attr_ecap.attr,
3535 &dev_attr_domains_supported.attr,
3536 &dev_attr_domains_used.attr,
3537 NULL,
3538};
3539
3540static struct attribute_group intel_iommu_group = {
3541 .name = "intel-iommu",
3542 .attrs = intel_iommu_attrs,
3543};
3544
3545const struct attribute_group *intel_iommu_groups[] = {
3546 &intel_iommu_group,
3547 NULL,
3548};
3549
3550static bool has_external_pci(void)
3551{
3552 struct pci_dev *pdev = NULL;
3553
3554 for_each_pci_dev(pdev)
3555 if (pdev->external_facing) {
3556 pci_dev_put(dev: pdev);
3557 return true;
3558 }
3559
3560 return false;
3561}
3562
3563static int __init platform_optin_force_iommu(void)
3564{
3565 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3566 return 0;
3567
3568 if (no_iommu || dmar_disabled)
3569 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3570
3571 /*
3572	 * If Intel-IOMMU is disabled by default, apply the identity
3573	 * map to all devices except those marked as untrusted.
3574 */
3575 if (dmar_disabled)
3576 iommu_set_default_passthrough(cmd_line: false);
3577
3578 dmar_disabled = 0;
3579 no_iommu = 0;
3580
3581 return 1;
3582}
3583
3584static int __init probe_acpi_namespace_devices(void)
3585{
3586 struct dmar_drhd_unit *drhd;
3587 /* To avoid a -Wunused-but-set-variable warning. */
3588 struct intel_iommu *iommu __maybe_unused;
3589 struct device *dev;
3590 int i, ret = 0;
3591
3592 for_each_active_iommu(iommu, drhd) {
3593 for_each_active_dev_scope(drhd->devices,
3594 drhd->devices_cnt, i, dev) {
3595 struct acpi_device_physical_node *pn;
3596 struct acpi_device *adev;
3597
3598 if (dev->bus != &acpi_bus_type)
3599 continue;
3600
3601 adev = to_acpi_device(dev);
3602 mutex_lock(&adev->physical_node_lock);
3603 list_for_each_entry(pn,
3604 &adev->physical_node_list, node) {
3605 ret = iommu_probe_device(dev: pn->dev);
3606 if (ret)
3607 break;
3608 }
3609 mutex_unlock(lock: &adev->physical_node_lock);
3610
3611 if (ret)
3612 return ret;
3613 }
3614 }
3615
3616 return 0;
3617}
3618
3619static __init int tboot_force_iommu(void)
3620{
3621 if (!tboot_enabled())
3622 return 0;
3623
3624 if (no_iommu || dmar_disabled)
3625 pr_warn("Forcing Intel-IOMMU to enabled\n");
3626
3627 dmar_disabled = 0;
3628 no_iommu = 0;
3629
3630 return 1;
3631}
3632
3633int __init intel_iommu_init(void)
3634{
3635 int ret = -ENODEV;
3636 struct dmar_drhd_unit *drhd;
3637 struct intel_iommu *iommu;
3638
3639 /*
3640 * Intel IOMMU is required for a TXT/tboot launch or platform
3641 * opt in, so enforce that.
3642 */
3643 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3644 platform_optin_force_iommu();
3645
3646 down_write(sem: &dmar_global_lock);
3647 if (dmar_table_init()) {
3648 if (force_on)
3649 panic(fmt: "tboot: Failed to initialize DMAR table\n");
3650 goto out_free_dmar;
3651 }
3652
3653 if (dmar_dev_scope_init() < 0) {
3654 if (force_on)
3655 panic(fmt: "tboot: Failed to initialize DMAR device scope\n");
3656 goto out_free_dmar;
3657 }
3658
3659 up_write(sem: &dmar_global_lock);
3660
3661 /*
3662 * The bus notifier takes the dmar_global_lock, so lockdep will
3663 * complain later when we register it under the lock.
3664 */
3665 dmar_register_bus_notifier();
3666
3667 down_write(sem: &dmar_global_lock);
3668
3669 if (!no_iommu)
3670 intel_iommu_debugfs_init();
3671
3672 if (no_iommu || dmar_disabled) {
3673 /*
3674 * We exit the function here to ensure IOMMU's remapping and
3675 * mempool aren't setup, which means that the IOMMU's PMRs
3676 * won't be disabled via the call to init_dmars(). So disable
3677 * it explicitly here. The PMRs were setup by tboot prior to
3678 * calling SENTER, but the kernel is expected to reset/tear
3679 * down the PMRs.
3680 */
3681 if (intel_iommu_tboot_noforce) {
3682 for_each_iommu(iommu, drhd)
3683 iommu_disable_protect_mem_regions(iommu);
3684 }
3685
3686 /*
3687 * Make sure the IOMMUs are switched off, even when we
3688 * boot into a kexec kernel and the previous kernel left
3689 * them enabled
3690 */
3691 intel_disable_iommus();
3692 goto out_free_dmar;
3693 }
3694
3695 if (list_empty(head: &dmar_rmrr_units))
3696 pr_info("No RMRR found\n");
3697
3698 if (list_empty(head: &dmar_atsr_units))
3699 pr_info("No ATSR found\n");
3700
3701 if (list_empty(head: &dmar_satc_units))
3702 pr_info("No SATC found\n");
3703
3704 init_no_remapping_devices();
3705
3706 ret = init_dmars();
3707 if (ret) {
3708 if (force_on)
3709 panic(fmt: "tboot: Failed to initialize DMARs\n");
3710 pr_err("Initialization failed\n");
3711 goto out_free_dmar;
3712 }
3713 up_write(sem: &dmar_global_lock);
3714
3715 init_iommu_pm_ops();
3716
3717 down_read(sem: &dmar_global_lock);
3718 for_each_active_iommu(iommu, drhd) {
3719 /*
3720 * The flush queue implementation does not perform
3721 * page-selective invalidations that are required for efficient
3722 * TLB flushes in virtual environments. The benefit of batching
3723 * is likely to be much lower than the overhead of synchronizing
3724 * the virtual and physical IOMMU page-tables.
3725 */
3726 if (cap_caching_mode(iommu->cap) &&
3727 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3728 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3729 iommu_set_dma_strict();
3730 }
3731 iommu_device_sysfs_add(iommu: &iommu->iommu, NULL,
3732 groups: intel_iommu_groups,
3733 fmt: "%s", iommu->name);
3734 iommu_device_register(iommu: &iommu->iommu, ops: &intel_iommu_ops, NULL);
3735
3736 iommu_pmu_register(iommu);
3737 }
3738 up_read(sem: &dmar_global_lock);
3739
3740 if (si_domain && !hw_pass_through)
3741 register_memory_notifier(nb: &intel_iommu_memory_nb);
3742
3743 down_read(sem: &dmar_global_lock);
3744 if (probe_acpi_namespace_devices())
3745 pr_warn("ACPI name space devices didn't probe correctly\n");
3746
3747 /* Finally, we enable the DMA remapping hardware. */
3748 for_each_iommu(iommu, drhd) {
3749 if (!drhd->ignored && !translation_pre_enabled(iommu))
3750 iommu_enable_translation(iommu);
3751
3752 iommu_disable_protect_mem_regions(iommu);
3753 }
3754 up_read(sem: &dmar_global_lock);
3755
3756 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3757
3758 intel_iommu_enabled = 1;
3759
3760 return 0;
3761
3762out_free_dmar:
3763 intel_iommu_free_dmars();
3764 up_write(sem: &dmar_global_lock);
3765 return ret;
3766}
3767
3768static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3769{
3770 struct device_domain_info *info = opaque;
3771
3772 domain_context_clear_one(info, PCI_BUS_NUM(alias), devfn: alias & 0xff);
3773 return 0;
3774}
3775
3776/*
3777 * NB - intel-iommu lacks any sort of reference counting for the users of
3778 * dependent devices. If multiple endpoints have intersecting dependent
3779 * devices, unbinding the driver from any one of them will possibly leave
3780 * the others unable to operate.
3781 */
3782static void domain_context_clear(struct device_domain_info *info)
3783{
3784	if (!dev_is_pci(info->dev)) {
3785		domain_context_clear_one(info, bus: info->bus, devfn: info->devfn);
		return;
	}
3786
3787 pci_for_each_dma_alias(to_pci_dev(info->dev),
3788 fn: &domain_context_clear_one_cb, data: info);
3789}
3790
3791/*
3792 * Clear the page table pointer in context or pasid table entries so that
3793 * all DMA requests without PASID from the device are blocked. If the page
3794 * table has been set, clean up the data structures.
3795 */
3796void device_block_translation(struct device *dev)
3797{
3798 struct device_domain_info *info = dev_iommu_priv_get(dev);
3799 struct intel_iommu *iommu = info->iommu;
3800 unsigned long flags;
3801
3802 iommu_disable_pci_caps(info);
3803 if (!dev_is_real_dma_subdevice(dev)) {
3804 if (sm_supported(iommu))
3805 intel_pasid_tear_down_entry(iommu, dev,
3806 IOMMU_NO_PASID, fault_ignore: false);
3807 else
3808 domain_context_clear(info);
3809 }
3810
3811 if (!info->domain)
3812 return;
3813
3814 spin_lock_irqsave(&info->domain->lock, flags);
3815 list_del(entry: &info->link);
3816 spin_unlock_irqrestore(lock: &info->domain->lock, flags);
3817
3818 domain_detach_iommu(domain: info->domain, iommu);
3819 info->domain = NULL;
3820}
3821
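/*
 * Initialize the address widths and the top-level page table of a newly
 * allocated dmar_domain.
 */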
3822static int md_domain_init(struct dmar_domain *domain, int guest_width)
3823{
3824 int adjust_width;
3825
3826 /* calculate AGAW */
3827 domain->gaw = guest_width;
3828 adjust_width = guestwidth_to_adjustwidth(gaw: guest_width);
3829 domain->agaw = width_to_agaw(width: adjust_width);
3830
3831 domain->iommu_coherency = false;
3832 domain->iommu_superpage = 0;
3833 domain->max_addr = 0;
3834
3835 /* always allocate the top pgd */
3836 domain->pgd = alloc_pgtable_page(node: domain->nid, GFP_ATOMIC);
3837 if (!domain->pgd)
3838 return -ENOMEM;
3839 domain_flush_cache(domain, addr: domain->pgd, PAGE_SIZE);
3840 return 0;
3841}
3842
3843static int blocking_domain_attach_dev(struct iommu_domain *domain,
3844 struct device *dev)
3845{
3846 device_block_translation(dev);
3847 return 0;
3848}
3849
3850static struct iommu_domain blocking_domain = {
3851 .type = IOMMU_DOMAIN_BLOCKED,
3852 .ops = &(const struct iommu_domain_ops) {
3853 .attach_dev = blocking_domain_attach_dev,
3854 }
3855};
3856
3857static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3858{
3859 struct dmar_domain *dmar_domain;
3860 struct iommu_domain *domain;
3861
3862 switch (type) {
3863 case IOMMU_DOMAIN_DMA:
3864 case IOMMU_DOMAIN_UNMANAGED:
3865 dmar_domain = alloc_domain(type);
3866 if (!dmar_domain) {
3867 pr_err("Can't allocate dmar_domain\n");
3868 return NULL;
3869 }
3870 if (md_domain_init(domain: dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3871 pr_err("Domain initialization failed\n");
3872 domain_exit(domain: dmar_domain);
3873 return NULL;
3874 }
3875
3876 domain = &dmar_domain->domain;
3877 domain->geometry.aperture_start = 0;
3878 domain->geometry.aperture_end =
3879 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3880 domain->geometry.force_aperture = true;
3881
3882 return domain;
3883 case IOMMU_DOMAIN_IDENTITY:
3884 return &si_domain->domain;
3885 case IOMMU_DOMAIN_SVA:
3886 return intel_svm_domain_alloc();
3887 default:
3888 return NULL;
3889 }
3890
3891 return NULL;
3892}
3893
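/*
 * Allocation path used by iommufd. A request with a parent must be for a
 * nested (stage-1) domain and is handed to intel_nested_domain_alloc();
 * otherwise the flags are checked against the hardware capabilities
 * (nesting parent, dirty tracking) before an ordinary paging domain is
 * allocated and, if requested, wired up with the dirty tracking ops.
 */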
3894static struct iommu_domain *
3895intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3896 struct iommu_domain *parent,
3897 const struct iommu_user_data *user_data)
3898{
3899 struct device_domain_info *info = dev_iommu_priv_get(dev);
3900 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3901 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3902 struct intel_iommu *iommu = info->iommu;
3903 struct dmar_domain *dmar_domain;
3904 struct iommu_domain *domain;
3905
3906 /* Must be NESTING domain */
3907 if (parent) {
3908 if (!nested_supported(iommu) || flags)
3909 return ERR_PTR(error: -EOPNOTSUPP);
3910 return intel_nested_domain_alloc(parent, user_data);
3911 }
3912
3913 if (flags &
3914 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3915 return ERR_PTR(error: -EOPNOTSUPP);
3916 if (nested_parent && !nested_supported(iommu))
3917 return ERR_PTR(error: -EOPNOTSUPP);
3918 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3919 return ERR_PTR(error: -EOPNOTSUPP);
3920
	/*
	 * The domain_alloc_user op must return a fully initialized domain,
	 * so use iommu_domain_alloc() here for simplicity.
	 */
3925 domain = iommu_domain_alloc(bus: dev->bus);
3926 if (!domain)
3927 return ERR_PTR(error: -ENOMEM);
3928
3929 dmar_domain = to_dmar_domain(dom: domain);
3930
3931 if (nested_parent) {
3932 dmar_domain->nested_parent = true;
3933 INIT_LIST_HEAD(list: &dmar_domain->s1_domains);
3934 spin_lock_init(&dmar_domain->s1_lock);
3935 }
3936
3937 if (dirty_tracking) {
3938 if (dmar_domain->use_first_level) {
3939 iommu_domain_free(domain);
3940 return ERR_PTR(error: -EOPNOTSUPP);
3941 }
3942 domain->dirty_ops = &intel_dirty_ops;
3943 }
3944
3945 return domain;
3946}
3947
3948static void intel_iommu_domain_free(struct iommu_domain *domain)
3949{
3950 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
3951
3952 WARN_ON(dmar_domain->nested_parent &&
3953 !list_empty(&dmar_domain->s1_domains));
3954 if (domain != &si_domain->domain)
3955 domain_exit(domain: dmar_domain);
3956}
3957
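/*
 * Validate that @dev's IOMMU can host @domain: it must support forced
 * snooping and dirty tracking if the domain requires them, and its address
 * width must cover the domain's highest mapped address. Extra page-table
 * levels are trimmed so the domain AGAW never exceeds what the IOMMU can
 * walk.
 */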
3958int prepare_domain_attach_device(struct iommu_domain *domain,
3959 struct device *dev)
3960{
3961 struct device_domain_info *info = dev_iommu_priv_get(dev);
3962 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
3963 struct intel_iommu *iommu = info->iommu;
3964 int addr_width;
3965
3966 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3967 return -EINVAL;
3968
3969 if (domain->dirty_ops && !ssads_supported(iommu))
3970 return -EINVAL;
3971
3972 /* check if this iommu agaw is sufficient for max mapped address */
3973 addr_width = agaw_to_width(agaw: iommu->agaw);
3974 if (addr_width > cap_mgaw(iommu->cap))
3975 addr_width = cap_mgaw(iommu->cap);
3976
3977 if (dmar_domain->max_addr > (1LL << addr_width))
3978 return -EINVAL;
3979 dmar_domain->gaw = addr_width;
3980
3981 /*
3982 * Knock out extra levels of page tables if necessary
3983 */
3984 while (iommu->agaw < dmar_domain->agaw) {
3985 struct dma_pte *pte;
3986
3987 pte = dmar_domain->pgd;
3988 if (dma_pte_present(pte)) {
3989 dmar_domain->pgd = phys_to_virt(address: dma_pte_addr(pte));
3990 free_pgtable_page(vaddr: pte);
3991 }
3992 dmar_domain->agaw--;
3993 }
3994
3995 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3996 context_copied(iommu, bus: info->bus, devfn: info->devfn))
3997 return intel_pasid_setup_sm_context(dev);
3998
3999 return 0;
4000}
4001
4002static int intel_iommu_attach_device(struct iommu_domain *domain,
4003 struct device *dev)
4004{
4005 struct device_domain_info *info = dev_iommu_priv_get(dev);
4006 int ret;
4007
4008 if (info->domain)
4009 device_block_translation(dev);
4010
4011 ret = prepare_domain_attach_device(domain, dev);
4012 if (ret)
4013 return ret;
4014
4015 return dmar_domain_attach_device(domain: to_dmar_domain(dom: domain), dev);
4016}
4017
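/*
 * Map a physically contiguous range at @iova: translate IOMMU_READ/WRITE
 * into DMA PTE bits (plus the snoop bit when the domain enforces snooping),
 * reject addresses beyond the domain's address width, and let
 * __domain_mapping() install the PTEs.
 */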
4018static int intel_iommu_map(struct iommu_domain *domain,
4019 unsigned long iova, phys_addr_t hpa,
4020 size_t size, int iommu_prot, gfp_t gfp)
4021{
4022 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4023 u64 max_addr;
4024 int prot = 0;
4025
4026 if (iommu_prot & IOMMU_READ)
4027 prot |= DMA_PTE_READ;
4028 if (iommu_prot & IOMMU_WRITE)
4029 prot |= DMA_PTE_WRITE;
4030 if (dmar_domain->set_pte_snp)
4031 prot |= DMA_PTE_SNP;
4032
4033 max_addr = iova + size;
4034 if (dmar_domain->max_addr < max_addr) {
4035 u64 end;
4036
4037 /* check if minimum agaw is sufficient for mapped address */
4038 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4039 if (end < max_addr) {
4040 pr_err("%s: iommu width (%d) is not "
4041 "sufficient for the mapped address (%llx)\n",
4042 __func__, dmar_domain->gaw, max_addr);
4043 return -EFAULT;
4044 }
4045 dmar_domain->max_addr = max_addr;
4046 }
4047 /* Round up size to next multiple of PAGE_SIZE, if it and
4048 the low bits of hpa would take us onto the next page */
4049 size = aligned_nrpages(host_addr: hpa, size);
4050 return __domain_mapping(domain: dmar_domain, iov_pfn: iova >> VTD_PAGE_SHIFT,
4051 phys_pfn: hpa >> VTD_PAGE_SHIFT, nr_pages: size, prot, gfp);
4052}
4053
4054static int intel_iommu_map_pages(struct iommu_domain *domain,
4055 unsigned long iova, phys_addr_t paddr,
4056 size_t pgsize, size_t pgcount,
4057 int prot, gfp_t gfp, size_t *mapped)
4058{
4059 unsigned long pgshift = __ffs(pgsize);
4060 size_t size = pgcount << pgshift;
4061 int ret;
4062
4063 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4064 return -EINVAL;
4065
4066 if (!IS_ALIGNED(iova | paddr, pgsize))
4067 return -EINVAL;
4068
4069 ret = intel_iommu_map(domain, iova, hpa: paddr, size, iommu_prot: prot, gfp);
4070 if (!ret && mapped)
4071 *mapped = size;
4072
4073 return ret;
4074}
4075
4076static size_t intel_iommu_unmap(struct iommu_domain *domain,
4077 unsigned long iova, size_t size,
4078 struct iommu_iotlb_gather *gather)
4079{
4080 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4081 unsigned long start_pfn, last_pfn;
4082 int level = 0;
4083
4084 /* Cope with horrid API which requires us to unmap more than the
4085 size argument if it happens to be a large-page mapping. */
4086 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4087 &level, GFP_ATOMIC)))
4088 return 0;
4089
4090 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4091 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4092
4093 start_pfn = iova >> VTD_PAGE_SHIFT;
4094 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4095
4096 domain_unmap(domain: dmar_domain, start_pfn, last_pfn, freelist: &gather->freelist);
4097
4098 if (dmar_domain->max_addr == iova + size)
4099 dmar_domain->max_addr = iova;
4100
4101 /*
4102 * We do not use page-selective IOTLB invalidation in flush queue,
4103 * so there is no need to track page and sync iotlb.
4104 */
4105 if (!iommu_iotlb_gather_queued(gather))
4106 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4107
4108 return size;
4109}
4110
4111static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4112 unsigned long iova,
4113 size_t pgsize, size_t pgcount,
4114 struct iommu_iotlb_gather *gather)
4115{
4116 unsigned long pgshift = __ffs(pgsize);
4117 size_t size = pgcount << pgshift;
4118
4119 return intel_iommu_unmap(domain, iova, size, gather);
4120}
4121
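/*
 * Flush the IOTLB for the range collected in the gather structure on every
 * IOMMU this domain is attached to, propagate the invalidation to any
 * stage-1 domains nested on top of it, and finally free the page-table
 * pages queued on the gather freelist.
 */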
4122static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4123 struct iommu_iotlb_gather *gather)
4124{
4125 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4126 unsigned long iova_pfn = IOVA_PFN(gather->start);
4127 size_t size = gather->end - gather->start;
4128 struct iommu_domain_info *info;
4129 unsigned long start_pfn;
4130 unsigned long nrpages;
4131 unsigned long i;
4132
4133 nrpages = aligned_nrpages(host_addr: gather->start, size);
4134 start_pfn = mm_to_dma_pfn_start(mm_pfn: iova_pfn);
4135
4136 xa_for_each(&dmar_domain->iommu_array, i, info)
4137 iommu_flush_iotlb_psi(iommu: info->iommu, domain: dmar_domain,
4138 pfn: start_pfn, pages: nrpages,
4139 ih: list_empty(head: &gather->freelist), map: 0);
4140
4141 if (dmar_domain->nested_parent)
4142 parent_domain_flush(domain: dmar_domain, pfn: start_pfn, pages: nrpages,
4143 ih: list_empty(head: &gather->freelist));
4144 put_pages_list(pages: &gather->freelist);
4145}
4146
4147static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4148 dma_addr_t iova)
4149{
4150 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4151 struct dma_pte *pte;
4152 int level = 0;
4153 u64 phys = 0;
4154
4155 pte = pfn_to_dma_pte(domain: dmar_domain, pfn: iova >> VTD_PAGE_SHIFT, target_level: &level,
4156 GFP_ATOMIC);
4157 if (pte && dma_pte_present(pte))
4158 phys = dma_pte_addr(pte) +
4159 (iova & (BIT_MASK(level_to_offset_bits(level) +
4160 VTD_PAGE_SHIFT) - 1));
4161
4162 return phys;
4163}
4164
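/*
 * Enforcing snooping is only possible if every IOMMU serving a device in
 * this domain supports snoop control (the ecap SC bit).
 */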
4165static bool domain_support_force_snooping(struct dmar_domain *domain)
4166{
4167 struct device_domain_info *info;
4168 bool support = true;
4169
4170 assert_spin_locked(&domain->lock);
4171 list_for_each_entry(info, &domain->devices, link) {
4172 if (!ecap_sc_support(info->iommu->ecap)) {
4173 support = false;
4174 break;
4175 }
4176 }
4177
4178 return support;
4179}
4180
4181static void domain_set_force_snooping(struct dmar_domain *domain)
4182{
4183 struct device_domain_info *info;
4184
4185 assert_spin_locked(&domain->lock);
4186 /*
4187 * Second level page table supports per-PTE snoop control. The
4188 * iommu_map() interface will handle this by setting SNP bit.
4189 */
4190 if (!domain->use_first_level) {
4191 domain->set_pte_snp = true;
4192 return;
4193 }
4194
4195 list_for_each_entry(info, &domain->devices, link)
4196 intel_pasid_setup_page_snoop_control(iommu: info->iommu, dev: info->dev,
4197 IOMMU_NO_PASID);
4198}
4199
4200static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4201{
4202 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4203 unsigned long flags;
4204
4205 if (dmar_domain->force_snooping)
4206 return true;
4207
4208 spin_lock_irqsave(&dmar_domain->lock, flags);
4209 if (!domain_support_force_snooping(domain: dmar_domain) ||
4210 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4211 spin_unlock_irqrestore(lock: &dmar_domain->lock, flags);
4212 return false;
4213 }
4214
4215 domain_set_force_snooping(domain: dmar_domain);
4216 dmar_domain->force_snooping = true;
4217 spin_unlock_irqrestore(lock: &dmar_domain->lock, flags);
4218
4219 return true;
4220}
4221
4222static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4223{
4224 struct device_domain_info *info = dev_iommu_priv_get(dev);
4225
4226 switch (cap) {
4227 case IOMMU_CAP_CACHE_COHERENCY:
4228 case IOMMU_CAP_DEFERRED_FLUSH:
4229 return true;
4230 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4231 return dmar_platform_optin();
4232 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4233 return ecap_sc_support(info->iommu->ecap);
4234 case IOMMU_CAP_DIRTY_TRACKING:
4235 return ssads_supported(info->iommu);
4236 default:
4237 return false;
4238 }
4239}
4240
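/*
 * Per-device probe: look up the IOMMU serving @dev, record its bus, devfn
 * and segment, probe the optional PCI capabilities (ATS, PASID, PRI)
 * against what the IOMMU supports, insert ATS-capable devices into the
 * per-IOMMU RID rbtree, and allocate the PASID table when scalable mode is
 * in use.
 */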
4241static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4242{
4243 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4244 struct device_domain_info *info;
4245 struct intel_iommu *iommu;
4246 u8 bus, devfn;
4247 int ret;
4248
4249 iommu = device_lookup_iommu(dev, bus: &bus, devfn: &devfn);
4250 if (!iommu || !iommu->iommu.ops)
4251 return ERR_PTR(error: -ENODEV);
4252
4253 info = kzalloc(size: sizeof(*info), GFP_KERNEL);
4254 if (!info)
4255 return ERR_PTR(error: -ENOMEM);
4256
4257 if (dev_is_real_dma_subdevice(dev)) {
4258 info->bus = pdev->bus->number;
4259 info->devfn = pdev->devfn;
4260 info->segment = pci_domain_nr(bus: pdev->bus);
4261 } else {
4262 info->bus = bus;
4263 info->devfn = devfn;
4264 info->segment = iommu->segment;
4265 }
4266
4267 info->dev = dev;
4268 info->iommu = iommu;
4269 if (dev_is_pci(dev)) {
4270 if (ecap_dev_iotlb_support(iommu->ecap) &&
4271 pci_ats_supported(dev: pdev) &&
4272 dmar_ats_supported(dev: pdev, iommu)) {
4273 info->ats_supported = 1;
4274 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4275
4276 /*
4277 * For IOMMU that supports device IOTLB throttling
4278 * (DIT), we assign PFSID to the invalidation desc
4279 * of a VF such that IOMMU HW can gauge queue depth
4280 * at PF level. If DIT is not set, PFSID will be
4281 * treated as reserved, which should be set to 0.
4282 */
4283 if (ecap_dit(iommu->ecap))
4284 info->pfsid = pci_dev_id(dev: pci_physfn(dev: pdev));
4285 info->ats_qdep = pci_ats_queue_depth(dev: pdev);
4286 }
4287 if (sm_supported(iommu)) {
4288 if (pasid_supported(iommu)) {
4289 int features = pci_pasid_features(pdev);
4290
4291 if (features >= 0)
4292 info->pasid_supported = features | 1;
4293 }
4294
4295 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4296 pci_pri_supported(pdev))
4297 info->pri_supported = 1;
4298 }
4299 }
4300
4301 dev_iommu_priv_set(dev, priv: info);
4302 if (pdev && pci_ats_supported(dev: pdev)) {
4303 ret = device_rbtree_insert(iommu, info);
4304 if (ret)
4305 goto free;
4306 }
4307
4308 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4309 ret = intel_pasid_alloc_table(dev);
4310 if (ret) {
4311 dev_err(dev, "PASID table allocation failed\n");
4312 goto clear_rbtree;
4313 }
4314
4315 if (!context_copied(iommu, bus: info->bus, devfn: info->devfn)) {
4316 ret = intel_pasid_setup_sm_context(dev);
4317 if (ret)
4318 goto free_table;
4319 }
4320 }
4321
4322 intel_iommu_debugfs_create_dev(info);
4323
4324 return &iommu->iommu;
4325free_table:
4326 intel_pasid_free_table(dev);
4327clear_rbtree:
4328 device_rbtree_remove(info);
4329free:
4330 kfree(objp: info);
4331
4332 return ERR_PTR(error: ret);
4333}
4334
4335static void intel_iommu_release_device(struct device *dev)
4336{
4337 struct device_domain_info *info = dev_iommu_priv_get(dev);
4338 struct intel_iommu *iommu = info->iommu;
4339
4340 mutex_lock(&iommu->iopf_lock);
4341 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
4342 device_rbtree_remove(info);
4343 mutex_unlock(lock: &iommu->iopf_lock);
4344
4345 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
4346 !context_copied(iommu, bus: info->bus, devfn: info->devfn))
4347 intel_pasid_teardown_sm_context(dev);
4348
4349 intel_pasid_free_table(dev);
4350 intel_iommu_debugfs_remove_dev(info);
4351 kfree(objp: info);
4352 set_dma_ops(dev, NULL);
4353}
4354
4355static void intel_iommu_probe_finalize(struct device *dev)
4356{
4357 set_dma_ops(dev, NULL);
4358 iommu_setup_dma_ops(dev, dma_base: 0, U64_MAX);
4359}
4360
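/*
 * Report reserved regions: every RMRR whose device scope covers this
 * device (directly or via an upstream bridge) becomes a direct-mapped
 * region, ISA bridges optionally get a relaxable 16MB direct map for the
 * legacy floppy workaround, and the IOAPIC MMIO window is reported as an
 * MSI region.
 */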
4361static void intel_iommu_get_resv_regions(struct device *device,
4362 struct list_head *head)
4363{
4364 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4365 struct iommu_resv_region *reg;
4366 struct dmar_rmrr_unit *rmrr;
4367 struct device *i_dev;
4368 int i;
4369
4370 rcu_read_lock();
4371 for_each_rmrr_units(rmrr) {
4372 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4373 i, i_dev) {
4374 struct iommu_resv_region *resv;
4375 enum iommu_resv_type type;
4376 size_t length;
4377
4378 if (i_dev != device &&
4379 !is_downstream_to_pci_bridge(dev: device, bridge: i_dev))
4380 continue;
4381
4382 length = rmrr->end_address - rmrr->base_address + 1;
4383
4384 type = device_rmrr_is_relaxable(dev: device) ?
4385 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4386
4387 resv = iommu_alloc_resv_region(start: rmrr->base_address,
4388 length, prot, type,
4389 GFP_ATOMIC);
4390 if (!resv)
4391 break;
4392
4393 list_add_tail(new: &resv->list, head);
4394 }
4395 }
4396 rcu_read_unlock();
4397
4398#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4399 if (dev_is_pci(device)) {
4400 struct pci_dev *pdev = to_pci_dev(device);
4401
4402 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4403 reg = iommu_alloc_resv_region(start: 0, length: 1UL << 24, prot,
4404 type: IOMMU_RESV_DIRECT_RELAXABLE,
4405 GFP_KERNEL);
4406 if (reg)
4407 list_add_tail(new: &reg->list, head);
4408 }
4409 }
4410#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4411
4412 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4413 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4414 prot: 0, type: IOMMU_RESV_MSI, GFP_KERNEL);
4415 if (!reg)
4416 return;
4417 list_add_tail(new: &reg->list, head);
4418}
4419
4420static struct iommu_group *intel_iommu_device_group(struct device *dev)
4421{
4422 if (dev_is_pci(dev))
4423 return pci_device_group(dev);
4424 return generic_device_group(dev);
4425}
4426
4427static int intel_iommu_enable_sva(struct device *dev)
4428{
4429 struct device_domain_info *info = dev_iommu_priv_get(dev);
4430 struct intel_iommu *iommu;
4431
4432 if (!info || dmar_disabled)
4433 return -EINVAL;
4434
4435 iommu = info->iommu;
4436 if (!iommu)
4437 return -EINVAL;
4438
4439 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4440 return -ENODEV;
4441
4442 if (!info->pasid_enabled || !info->ats_enabled)
4443 return -EINVAL;
4444
	/*
	 * Devices that implement device-specific I/O fault handling should
	 * not advertise PCI/PRI. The IOMMU has no way to probe for
	 * device-specific IOPF support, so it can only assume that a driver
	 * enabling SVA on a non-PRI device handles I/O page faults in its
	 * own way.
	 */
4452 if (!info->pri_supported)
4453 return 0;
4454
4455 /* Devices supporting PRI should have it enabled. */
4456 if (!info->pri_enabled)
4457 return -EINVAL;
4458
4459 return 0;
4460}
4461
4462static int intel_iommu_enable_iopf(struct device *dev)
4463{
4464 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4465 struct device_domain_info *info = dev_iommu_priv_get(dev);
4466 struct intel_iommu *iommu;
4467 int ret;
4468
4469 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4470 return -ENODEV;
4471
4472 if (info->pri_enabled)
4473 return -EBUSY;
4474
4475 iommu = info->iommu;
4476 if (!iommu)
4477 return -EINVAL;
4478
4479 /* PASID is required in PRG Response Message. */
4480 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4481 return -EINVAL;
4482
4483 ret = pci_reset_pri(pdev);
4484 if (ret)
4485 return ret;
4486
4487 ret = iopf_queue_add_device(queue: iommu->iopf_queue, dev);
4488 if (ret)
4489 return ret;
4490
4491 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4492 if (ret) {
4493 iopf_queue_remove_device(queue: iommu->iopf_queue, dev);
4494 return ret;
4495 }
4496
4497 info->pri_enabled = 1;
4498
4499 return 0;
4500}
4501
4502static int intel_iommu_disable_iopf(struct device *dev)
4503{
4504 struct device_domain_info *info = dev_iommu_priv_get(dev);
4505 struct intel_iommu *iommu = info->iommu;
4506
4507 if (!info->pri_enabled)
4508 return -EINVAL;
4509
	/*
	 * The PCIe spec states that clearing the PRI enable bit stops the
	 * Page Request Interface from issuing new page requests, but
	 * requests already transmitted or queued for transmission remain
	 * outstanding. This is therefore expected to be called only after
	 * the device driver has stopped DMA, all PASIDs have been unbound
	 * and the outstanding PRQs have been drained.
	 */
4518 pci_disable_pri(to_pci_dev(dev));
4519 info->pri_enabled = 0;
4520 iopf_queue_remove_device(queue: iommu->iopf_queue, dev);
4521
4522 return 0;
4523}
4524
4525static int
4526intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4527{
4528 switch (feat) {
4529 case IOMMU_DEV_FEAT_IOPF:
4530 return intel_iommu_enable_iopf(dev);
4531
4532 case IOMMU_DEV_FEAT_SVA:
4533 return intel_iommu_enable_sva(dev);
4534
4535 default:
4536 return -ENODEV;
4537 }
4538}
4539
4540static int
4541intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4542{
4543 switch (feat) {
4544 case IOMMU_DEV_FEAT_IOPF:
4545 return intel_iommu_disable_iopf(dev);
4546
4547 case IOMMU_DEV_FEAT_SVA:
4548 return 0;
4549
4550 default:
4551 return -ENODEV;
4552 }
4553}
4554
4555static bool intel_iommu_is_attach_deferred(struct device *dev)
4556{
4557 struct device_domain_info *info = dev_iommu_priv_get(dev);
4558
4559 return translation_pre_enabled(iommu: info->iommu) && !info->domain;
4560}
4561
4562/*
4563 * Check that the device does not live on an external facing PCI port that is
4564 * marked as untrusted. Such devices should not be able to apply quirks and
4565 * thus not be able to bypass the IOMMU restrictions.
4566 */
4567static bool risky_device(struct pci_dev *pdev)
4568{
4569 if (pdev->untrusted) {
4570 pci_info(pdev,
4571 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4572 pdev->vendor, pdev->device);
4573 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4574 return true;
4575 }
4576 return false;
4577}
4578
4579static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4580 unsigned long iova, size_t size)
4581{
4582 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4583 unsigned long pages = aligned_nrpages(host_addr: iova, size);
4584 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4585 struct iommu_domain_info *info;
4586 unsigned long i;
4587
4588 xa_for_each(&dmar_domain->iommu_array, i, info)
4589 __mapping_notify_one(iommu: info->iommu, domain: dmar_domain, pfn, pages);
4590 return 0;
4591}
4592
4593static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4594{
4595 struct device_domain_info *info = dev_iommu_priv_get(dev);
4596 struct dev_pasid_info *curr, *dev_pasid = NULL;
4597 struct intel_iommu *iommu = info->iommu;
4598 struct dmar_domain *dmar_domain;
4599 struct iommu_domain *domain;
4600 unsigned long flags;
4601
4602 domain = iommu_get_domain_for_dev_pasid(dev, pasid, type: 0);
4603 if (WARN_ON_ONCE(!domain))
4604 goto out_tear_down;
4605
	/*
	 * The SVA implementation needs to handle its own bookkeeping, such
	 * as mm notifications. Until that code is consolidated into the
	 * iommu core, let the Intel SVA code handle it.
	 */
4611 if (domain->type == IOMMU_DOMAIN_SVA) {
4612 intel_svm_remove_dev_pasid(dev, pasid);
4613 goto out_tear_down;
4614 }
4615
4616 dmar_domain = to_dmar_domain(dom: domain);
4617 spin_lock_irqsave(&dmar_domain->lock, flags);
4618 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4619 if (curr->dev == dev && curr->pasid == pasid) {
4620 list_del(entry: &curr->link_domain);
4621 dev_pasid = curr;
4622 break;
4623 }
4624 }
4625 WARN_ON_ONCE(!dev_pasid);
4626 spin_unlock_irqrestore(lock: &dmar_domain->lock, flags);
4627
4628 domain_detach_iommu(domain: dmar_domain, iommu);
4629 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4630 kfree(objp: dev_pasid);
4631out_tear_down:
4632 intel_pasid_tear_down_entry(iommu, dev, pasid, fault_ignore: false);
4633 intel_drain_pasid_prq(dev, pasid);
4634}
4635
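/*
 * Attach @domain to a specific PASID of @dev: depending on the domain type
 * the PASID entry is programmed for pass-through, first-level or
 * second-level translation, and the (device, PASID) pair is added to the
 * domain's dev_pasids list.
 */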
4636static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4637 struct device *dev, ioasid_t pasid)
4638{
4639 struct device_domain_info *info = dev_iommu_priv_get(dev);
4640 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4641 struct intel_iommu *iommu = info->iommu;
4642 struct dev_pasid_info *dev_pasid;
4643 unsigned long flags;
4644 int ret;
4645
4646 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4647 return -EOPNOTSUPP;
4648
4649 if (domain->dirty_ops)
4650 return -EINVAL;
4651
4652 if (context_copied(iommu, bus: info->bus, devfn: info->devfn))
4653 return -EBUSY;
4654
4655 ret = prepare_domain_attach_device(domain, dev);
4656 if (ret)
4657 return ret;
4658
4659 dev_pasid = kzalloc(size: sizeof(*dev_pasid), GFP_KERNEL);
4660 if (!dev_pasid)
4661 return -ENOMEM;
4662
4663 ret = domain_attach_iommu(domain: dmar_domain, iommu);
4664 if (ret)
4665 goto out_free;
4666
4667 if (domain_type_is_si(domain: dmar_domain))
4668 ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4669 else if (dmar_domain->use_first_level)
4670 ret = domain_setup_first_level(iommu, domain: dmar_domain,
4671 dev, pasid);
4672 else
4673 ret = intel_pasid_setup_second_level(iommu, domain: dmar_domain,
4674 dev, pasid);
4675 if (ret)
4676 goto out_detach_iommu;
4677
4678 dev_pasid->dev = dev;
4679 dev_pasid->pasid = pasid;
4680 spin_lock_irqsave(&dmar_domain->lock, flags);
4681 list_add(new: &dev_pasid->link_domain, head: &dmar_domain->dev_pasids);
4682 spin_unlock_irqrestore(lock: &dmar_domain->lock, flags);
4683
4684 if (domain->type & __IOMMU_DOMAIN_PAGING)
4685 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4686
4687 return 0;
4688out_detach_iommu:
4689 domain_detach_iommu(domain: dmar_domain, iommu);
4690out_free:
4691 kfree(objp: dev_pasid);
4692 return ret;
4693}
4694
4695static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4696{
4697 struct device_domain_info *info = dev_iommu_priv_get(dev);
4698 struct intel_iommu *iommu = info->iommu;
4699 struct iommu_hw_info_vtd *vtd;
4700
4701 vtd = kzalloc(size: sizeof(*vtd), GFP_KERNEL);
4702 if (!vtd)
4703 return ERR_PTR(error: -ENOMEM);
4704
4705 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4706 vtd->cap_reg = iommu->cap;
4707 vtd->ecap_reg = iommu->ecap;
4708 *length = sizeof(*vtd);
4709 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4710 return vtd;
4711}
4712
4713/*
4714 * Set dirty tracking for the device list of a domain. The caller must
4715 * hold the domain->lock when calling it.
4716 */
4717static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4718{
4719 struct device_domain_info *info;
4720 int ret = 0;
4721
4722 list_for_each_entry(info, devices, link) {
4723 ret = intel_pasid_setup_dirty_tracking(iommu: info->iommu, dev: info->dev,
4724 IOMMU_NO_PASID, enabled: enable);
4725 if (ret)
4726 break;
4727 }
4728
4729 return ret;
4730}
4731
4732static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4733 bool enable)
4734{
4735 struct dmar_domain *s1_domain;
4736 unsigned long flags;
4737 int ret;
4738
4739 spin_lock(lock: &domain->s1_lock);
4740 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4741 spin_lock_irqsave(&s1_domain->lock, flags);
4742 ret = device_set_dirty_tracking(devices: &s1_domain->devices, enable);
4743 spin_unlock_irqrestore(lock: &s1_domain->lock, flags);
4744 if (ret)
4745 goto err_unwind;
4746 }
4747 spin_unlock(lock: &domain->s1_lock);
4748 return 0;
4749
4750err_unwind:
4751 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4752 spin_lock_irqsave(&s1_domain->lock, flags);
4753 device_set_dirty_tracking(devices: &s1_domain->devices,
4754 enable: domain->dirty_tracking);
4755 spin_unlock_irqrestore(lock: &s1_domain->lock, flags);
4756 }
4757 spin_unlock(lock: &domain->s1_lock);
4758 return ret;
4759}
4760
4761static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4762 bool enable)
4763{
4764 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4765 int ret;
4766
4767 spin_lock(lock: &dmar_domain->lock);
4768 if (dmar_domain->dirty_tracking == enable)
4769 goto out_unlock;
4770
4771 ret = device_set_dirty_tracking(devices: &dmar_domain->devices, enable);
4772 if (ret)
4773 goto err_unwind;
4774
4775 if (dmar_domain->nested_parent) {
4776 ret = parent_domain_set_dirty_tracking(domain: dmar_domain, enable);
4777 if (ret)
4778 goto err_unwind;
4779 }
4780
4781 dmar_domain->dirty_tracking = enable;
4782out_unlock:
4783 spin_unlock(lock: &dmar_domain->lock);
4784
4785 return 0;
4786
4787err_unwind:
4788 device_set_dirty_tracking(devices: &dmar_domain->devices,
4789 enable: dmar_domain->dirty_tracking);
4790 spin_unlock(lock: &dmar_domain->lock);
4791 return ret;
4792}
4793
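/*
 * Walk the second-level page table over the requested range, test and
 * clear (honoring the caller's flags) the dirty bit of each present leaf
 * PTE, and record dirty ranges in the caller-supplied bitmap.
 */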
4794static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4795 unsigned long iova, size_t size,
4796 unsigned long flags,
4797 struct iommu_dirty_bitmap *dirty)
4798{
4799 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
4800 unsigned long end = iova + size - 1;
4801 unsigned long pgsize;
4802
	/*
	 * The IOMMUFD core calls into a domain with dirty tracking disabled,
	 * and without an IOVA bitmap set, in order to clear any dirty bits
	 * that may have been set in the PTEs while dirty tracking was
	 * stopped. This ensures that we never inherit dirtied bits from a
	 * previous cycle.
	 */
4809 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4810 return -EINVAL;
4811
4812 do {
4813 struct dma_pte *pte;
4814 int lvl = 0;
4815
4816 pte = pfn_to_dma_pte(domain: dmar_domain, pfn: iova >> VTD_PAGE_SHIFT, target_level: &lvl,
4817 GFP_ATOMIC);
4818 pgsize = level_size(level: lvl) << VTD_PAGE_SHIFT;
4819 if (!pte || !dma_pte_present(pte)) {
4820 iova += pgsize;
4821 continue;
4822 }
4823
4824 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4825 iommu_dirty_bitmap_record(dirty, iova, length: pgsize);
4826 iova += pgsize;
4827 } while (iova < end);
4828
4829 return 0;
4830}
4831
4832static const struct iommu_dirty_ops intel_dirty_ops = {
4833 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4834 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4835};
4836
4837const struct iommu_ops intel_iommu_ops = {
4838 .blocked_domain = &blocking_domain,
4839 .release_domain = &blocking_domain,
4840 .capable = intel_iommu_capable,
4841 .hw_info = intel_iommu_hw_info,
4842 .domain_alloc = intel_iommu_domain_alloc,
4843 .domain_alloc_user = intel_iommu_domain_alloc_user,
4844 .probe_device = intel_iommu_probe_device,
4845 .probe_finalize = intel_iommu_probe_finalize,
4846 .release_device = intel_iommu_release_device,
4847 .get_resv_regions = intel_iommu_get_resv_regions,
4848 .device_group = intel_iommu_device_group,
4849 .dev_enable_feat = intel_iommu_dev_enable_feat,
4850 .dev_disable_feat = intel_iommu_dev_disable_feat,
4851 .is_attach_deferred = intel_iommu_is_attach_deferred,
4852 .def_domain_type = device_def_domain_type,
4853 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4854 .pgsize_bitmap = SZ_4K,
4855#ifdef CONFIG_INTEL_IOMMU_SVM
4856 .page_response = intel_svm_page_response,
4857#endif
4858 .default_domain_ops = &(const struct iommu_domain_ops) {
4859 .attach_dev = intel_iommu_attach_device,
4860 .set_dev_pasid = intel_iommu_set_dev_pasid,
4861 .map_pages = intel_iommu_map_pages,
4862 .unmap_pages = intel_iommu_unmap_pages,
4863 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4864 .flush_iotlb_all = intel_flush_iotlb_all,
4865 .iotlb_sync = intel_iommu_tlb_sync,
4866 .iova_to_phys = intel_iommu_iova_to_phys,
4867 .free = intel_iommu_domain_free,
4868 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4869 }
4870};
4871
4872static void quirk_iommu_igfx(struct pci_dev *dev)
4873{
4874 if (risky_device(pdev: dev))
4875 return;
4876
4877 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4878 dmar_map_gfx = 0;
4879}
4880
4881/* G4x/GM45 integrated gfx dmar support is totally busted. */
4882DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4883DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4884DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4885DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4886DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4887DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4888DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4889
4890/* Broadwell igfx malfunctions with dmar */
4891DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4892DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4893DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4894DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4895DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4896DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4897DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4898DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4899DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4900DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4901DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4902DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4903DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4904DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4905DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4906DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4907DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4908DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4909DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4910DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4911DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4912DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4913DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4914DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4915
4916static void quirk_iommu_rwbf(struct pci_dev *dev)
4917{
4918 if (risky_device(pdev: dev))
4919 return;
4920
4921 /*
4922 * Mobile 4 Series Chipset neglects to set RWBF capability,
4923 * but needs it. Same seems to hold for the desktop versions.
4924 */
4925 pci_info(dev, "Forcing write-buffer flush capability\n");
4926 rwbf_quirk = 1;
4927}
4928
4929DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4930DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4931DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4932DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4933DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4934DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4935DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4936
4937#define GGC 0x52
4938#define GGC_MEMORY_SIZE_MASK (0xf << 8)
4939#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4940#define GGC_MEMORY_SIZE_1M (0x1 << 8)
4941#define GGC_MEMORY_SIZE_2M (0x3 << 8)
4942#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4943#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4944#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4945#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4946
4947static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4948{
4949 unsigned short ggc;
4950
4951 if (risky_device(pdev: dev))
4952 return;
4953
4954 if (pci_read_config_word(dev, GGC, val: &ggc))
4955 return;
4956
4957 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4958 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4959 dmar_map_gfx = 0;
4960 } else if (dmar_map_gfx) {
4961 /* we have to ensure the gfx device is idle before we flush */
4962 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4963 iommu_set_dma_strict();
4964 }
4965}
4966DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4967DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4968DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4969DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4970
4971static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4972{
4973 unsigned short ver;
4974
4975 if (!IS_GFX_DEVICE(dev))
4976 return;
4977
4978 ver = (dev->device >> 8) & 0xff;
4979 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4980 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4981 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4982 return;
4983
4984 if (risky_device(pdev: dev))
4985 return;
4986
4987 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4988 iommu_skip_te_disable = 1;
4989}
4990DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4991
4992/* On Tylersburg chipsets, some BIOSes have been known to enable the
4993 ISOCH DMAR unit for the Azalia sound device, but not give it any
4994 TLB entries, which causes it to deadlock. Check for that. We do
4995 this in a function called from init_dmars(), instead of in a PCI
4996 quirk, because we don't want to print the obnoxious "BIOS broken"
4997 message if VT-d is actually disabled.
4998*/
4999static void __init check_tylersburg_isoch(void)
5000{
5001 struct pci_dev *pdev;
5002 uint32_t vtisochctrl;
5003
5004 /* If there's no Azalia in the system anyway, forget it. */
5005 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, device: 0x3a3e, NULL);
5006 if (!pdev)
5007 return;
5008
5009 if (risky_device(pdev)) {
5010 pci_dev_put(dev: pdev);
5011 return;
5012 }
5013
5014 pci_dev_put(dev: pdev);
5015
5016 /* System Management Registers. Might be hidden, in which case
5017 we can't do the sanity check. But that's OK, because the
5018 known-broken BIOSes _don't_ actually hide it, so far. */
5019 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, device: 0x342e, NULL);
5020 if (!pdev)
5021 return;
5022
5023 if (risky_device(pdev)) {
5024 pci_dev_put(dev: pdev);
5025 return;
5026 }
5027
5028 if (pci_read_config_dword(dev: pdev, where: 0x188, val: &vtisochctrl)) {
5029 pci_dev_put(dev: pdev);
5030 return;
5031 }
5032
5033 pci_dev_put(dev: pdev);
5034
5035 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5036 if (vtisochctrl & 1)
5037 return;
5038
5039 /* Drop all bits other than the number of TLB entries */
5040 vtisochctrl &= 0x1c;
5041
5042 /* If we have the recommended number of TLB entries (16), fine. */
5043 if (vtisochctrl == 0x10)
5044 return;
5045
5046 /* Zero TLB entries? You get to ride the short bus to school. */
5047 if (!vtisochctrl) {
5048 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5049 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5050 dmi_get_system_info(DMI_BIOS_VENDOR),
5051 dmi_get_system_info(DMI_BIOS_VERSION),
5052 dmi_get_system_info(DMI_PRODUCT_VERSION));
5053 iommu_identity_mapping |= IDENTMAP_AZALIA;
5054 return;
5055 }
5056
5057 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5058 vtisochctrl);
5059}
5060
5061/*
 * Here we deal with a device TLB defect where the device may inadvertently
 * issue an ATS invalidation completion before posted writes that used
 * translations matching the invalidation address range have been committed,
 * violating the invalidation completion ordering.
5066 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
5067 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5068 * under the control of the trusted/privileged host device driver must use this
5069 * quirk.
5070 * Device TLBs are invalidated under the following six conditions:
5071 * 1. Device driver does DMA API unmap IOVA
5072 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5073 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5074 * exit_mmap() due to crash
5075 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5076 * VM has to free pages that were unmapped
5077 * 5. Userspace driver unmaps a DMA buffer
5078 * 6. Cache invalidation in vSVA usage (upcoming)
5079 *
5080 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5081 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5082 * invalidate TLB the same way as normal user unmap which will use this quirk.
5083 * The dTLB invalidation after PASID cache flush does not need this quirk.
5084 *
5085 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5086 */
5087void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5088 unsigned long address, unsigned long mask,
5089 u32 pasid, u16 qdep)
5090{
5091 u16 sid;
5092
5093 if (likely(!info->dtlb_extra_inval))
5094 return;
5095
5096 sid = PCI_DEVID(info->bus, info->devfn);
5097 if (pasid == IOMMU_NO_PASID) {
5098 qi_flush_dev_iotlb(iommu: info->iommu, sid, pfsid: info->pfsid,
5099 qdep, addr: address, mask);
5100 } else {
5101 qi_flush_dev_iotlb_pasid(iommu: info->iommu, sid, pfsid: info->pfsid,
5102 pasid, qdep, addr: address, size_order: mask);
5103 }
5104}
5105
5106#define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
5107
5108/*
5109 * Function to submit a command to the enhanced command interface. The
5110 * valid enhanced command descriptions are defined in Table 47 of the
5111 * VT-d spec. The VT-d hardware implementation may support some but not
5112 * all commands, which can be determined by checking the Enhanced
5113 * Command Capability Register.
5114 *
5115 * Return values:
5116 * - 0: Command successful without any error;
5117 * - Negative: software error value;
5118 * - Nonzero positive: failure status code defined in Table 48.
5119 */
5120int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5121{
5122 unsigned long flags;
5123 u64 res;
5124 int ret;
5125
5126 if (!cap_ecmds(iommu->cap))
5127 return -ENODEV;
5128
5129 raw_spin_lock_irqsave(&iommu->register_lock, flags);
5130
5131 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5132 if (res & DMA_ECMD_ECRSP_IP) {
5133 ret = -EBUSY;
5134 goto err;
5135 }
5136
5137 /*
5138 * Unconditionally write the operand B, because
5139 * - There is no side effect if an ecmd doesn't require an
5140 * operand B, but we set the register to some value.
5141 * - It's not invoked in any critical path. The extra MMIO
5142 * write doesn't bring any performance concerns.
5143 */
5144 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5145 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5146
5147 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5148 !(res & DMA_ECMD_ECRSP_IP), res);
5149
5150 if (res & DMA_ECMD_ECRSP_IP) {
5151 ret = -ETIMEDOUT;
5152 goto err;
5153 }
5154
5155 ret = ecmd_get_status_code(res);
5156err:
5157 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5158
5159 return ret;
5160}
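
/*
 * Illustrative example (hypothetical caller; the command code and operand
 * below are placeholders, not taken from this file): the three-way return
 * convention documented above would typically be handled as
 *
 *	ret = ecmd_submit_sync(iommu, ecmd, oa, 0);
 *	if (ret < 0)		(software error, e.g. -ENODEV or -ETIMEDOUT)
 *		return ret;
 *	else if (ret)		(hardware failure status from Table 48)
 *		return -EIO;
 *
 * with "ecmd" and "oa" being a command code and operand A that the
 * hardware advertises support for in its Enhanced Command Capability
 * Register.
 */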
5161
