1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
4 * Author: Joerg Roedel <jroedel@suse.de>
5 * Leo Duran <leo.duran@amd.com>
6 */
7
8#define pr_fmt(fmt) "AMD-Vi: " fmt
9#define dev_fmt(fmt) pr_fmt(fmt)
10
11#include <linux/ratelimit.h>
12#include <linux/pci.h>
13#include <linux/acpi.h>
14#include <linux/pci-ats.h>
15#include <linux/bitmap.h>
16#include <linux/slab.h>
17#include <linux/debugfs.h>
18#include <linux/scatterlist.h>
19#include <linux/dma-map-ops.h>
20#include <linux/dma-direct.h>
21#include <linux/iommu-helper.h>
22#include <linux/delay.h>
23#include <linux/amd-iommu.h>
24#include <linux/notifier.h>
25#include <linux/export.h>
26#include <linux/irq.h>
27#include <linux/msi.h>
28#include <linux/irqdomain.h>
29#include <linux/percpu.h>
30#include <linux/io-pgtable.h>
31#include <linux/cc_platform.h>
32#include <asm/irq_remapping.h>
33#include <asm/io_apic.h>
34#include <asm/apic.h>
35#include <asm/hw_irq.h>
36#include <asm/proto.h>
37#include <asm/iommu.h>
38#include <asm/gart.h>
39#include <asm/dma.h>
40#include <uapi/linux/iommufd.h>
41
42#include "amd_iommu.h"
43#include "../dma-iommu.h"
44#include "../irq_remapping.h"
45
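/*
 * Each IOMMU command is a 128-bit structure (four 32-bit words, see struct
 * iommu_cmd below). CMD_SET_TYPE() ORs the 4-bit opcode into bits 31:28 of
 * data[1], i.e. bits 63:60 of the command, which is where the AMD IOMMU
 * specification places the command type.
 */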
46#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
47
48/* Reserved IOVA ranges */
49#define MSI_RANGE_START (0xfee00000)
50#define MSI_RANGE_END (0xfeefffff)
51#define HT_RANGE_START (0xfd00000000ULL)
52#define HT_RANGE_END (0xffffffffffULL)
53
54#define DEFAULT_PGTABLE_LEVEL PAGE_MODE_3_LEVEL
55
56static DEFINE_SPINLOCK(pd_bitmap_lock);
57
58LIST_HEAD(ioapic_map);
59LIST_HEAD(hpet_map);
60LIST_HEAD(acpihid_map);
61
62const struct iommu_ops amd_iommu_ops;
63static const struct iommu_dirty_ops amd_dirty_ops;
64
65int amd_iommu_max_glx_val = -1;
66
/*
 * General struct to manage commands sent to an IOMMU.
 */
70struct iommu_cmd {
71 u32 data[4];
72};
73
74struct kmem_cache *amd_iommu_irq_cache;
75
76static void detach_device(struct device *dev);
77
78static void set_dte_entry(struct amd_iommu *iommu,
79 struct iommu_dev_data *dev_data);
80
81/****************************************************************************
82 *
83 * Helper functions
84 *
85 ****************************************************************************/
86
87static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom)
88{
89 return (pdom && (pdom->pd_mode == PD_MODE_V2));
90}
91
92static inline int get_acpihid_device_id(struct device *dev,
93 struct acpihid_map_entry **entry)
94{
95 struct acpi_device *adev = ACPI_COMPANION(dev);
96 struct acpihid_map_entry *p;
97
98 if (!adev)
99 return -ENODEV;
100
101 list_for_each_entry(p, &acpihid_map, list) {
102 if (acpi_dev_hid_uid_match(adev, p->hid,
103 p->uid[0] ? p->uid : NULL)) {
104 if (entry)
105 *entry = p;
106 return p->devid;
107 }
108 }
109 return -EINVAL;
110}
111
112static inline int get_device_sbdf_id(struct device *dev)
113{
114 int sbdf;
115
116 if (dev_is_pci(dev))
117 sbdf = get_pci_sbdf_id(to_pci_dev(dev));
118 else
119 sbdf = get_acpihid_device_id(dev, NULL);
120
121 return sbdf;
122}
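
/*
 * Note (assuming the encoding used by get_pci_sbdf_id() and the
 * PCI_SBDF_TO_* helpers): the sbdf value packs the PCI segment into bits
 * 31:16 and bus/device/function into bits 15:0, so e.g. 0x0001c200 means
 * segment 0x0001, bus 0xc2, devfn 0x00.
 */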
123
124struct dev_table_entry *get_dev_table(struct amd_iommu *iommu)
125{
126 struct dev_table_entry *dev_table;
127 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
128
129 BUG_ON(pci_seg == NULL);
130 dev_table = pci_seg->dev_table;
131 BUG_ON(dev_table == NULL);
132
133 return dev_table;
134}
135
static inline u16 get_device_segment(struct device *dev)
{
	u16 seg;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		seg = pci_domain_nr(pdev->bus);
	} else {
		u32 devid = get_acpihid_device_id(dev, NULL);

		seg = PCI_SBDF_TO_SEGID(devid);
	}

	return seg;
}
152
153/* Writes the specific IOMMU for a device into the PCI segment rlookup table */
154void amd_iommu_set_rlookup_table(struct amd_iommu *iommu, u16 devid)
155{
156 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
157
158 pci_seg->rlookup_table[devid] = iommu;
159}
160
161static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid)
162{
163 struct amd_iommu_pci_seg *pci_seg;
164
165 for_each_pci_segment(pci_seg) {
166 if (pci_seg->id == seg)
167 return pci_seg->rlookup_table[devid];
168 }
169 return NULL;
170}
171
172static struct amd_iommu *rlookup_amd_iommu(struct device *dev)
173{
174 u16 seg = get_device_segment(dev);
175 int devid = get_device_sbdf_id(dev);
176
177 if (devid < 0)
178 return NULL;
179 return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid));
180}
181
182static struct protection_domain *to_pdomain(struct iommu_domain *dom)
183{
184 return container_of(dom, struct protection_domain, domain);
185}
186
static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid)
{
	struct iommu_dev_data *dev_data;
	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;

	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
	if (!dev_data)
		return NULL;

	spin_lock_init(&dev_data->lock);
	dev_data->devid = devid;
	ratelimit_default_init(&dev_data->rs);

	llist_add(&dev_data->dev_data_list, &pci_seg->dev_data_list);
	return dev_data;
}
203
204static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid)
205{
206 struct iommu_dev_data *dev_data;
207 struct llist_node *node;
208 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
209
210 if (llist_empty(head: &pci_seg->dev_data_list))
211 return NULL;
212
213 node = pci_seg->dev_data_list.first;
214 llist_for_each_entry(dev_data, node, dev_data_list) {
215 if (dev_data->devid == devid)
216 return dev_data;
217 }
218
219 return NULL;
220}
221
static int clone_alias(struct pci_dev *pdev, u16 alias, void *data)
{
	struct amd_iommu *iommu;
	struct dev_table_entry *dev_table;
	u16 devid = pci_dev_id(pdev);

	if (devid == alias)
		return 0;

	iommu = rlookup_amd_iommu(&pdev->dev);
	if (!iommu)
		return 0;

	amd_iommu_set_rlookup_table(iommu, alias);
	dev_table = get_dev_table(iommu);
	memcpy(dev_table[alias].data,
	       dev_table[devid].data,
	       sizeof(dev_table[alias].data));

	return 0;
}
243
static void clone_aliases(struct amd_iommu *iommu, struct device *dev)
{
	struct pci_dev *pdev;

	if (!dev_is_pci(dev))
		return;
	pdev = to_pci_dev(dev);

	/*
	 * The IVRS alias stored in the alias table may not be
	 * part of the PCI DMA aliases if its bus differs
	 * from that of the original device.
	 */
	clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], NULL);

	pci_for_each_dma_alias(pdev, clone_alias, NULL);
}
261
static void setup_aliases(struct amd_iommu *iommu, struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
	u16 ivrs_alias;

	/* For ACPI HID devices, there are no aliases */
	if (!dev_is_pci(dev))
		return;

	/*
	 * Add the IVRS alias to the pci aliases if it is on the same
	 * bus. The IVRS table may know about a quirk that we don't.
	 */
	ivrs_alias = pci_seg->alias_table[pci_dev_id(pdev)];
	if (ivrs_alias != pci_dev_id(pdev) &&
	    PCI_BUS_NUM(ivrs_alias) == pdev->bus->number)
		pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1);

	clone_aliases(iommu, dev);
}
283
284static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid)
285{
286 struct iommu_dev_data *dev_data;
287
288 dev_data = search_dev_data(iommu, devid);
289
290 if (dev_data == NULL) {
291 dev_data = alloc_dev_data(iommu, devid);
292 if (!dev_data)
293 return NULL;
294
295 if (translation_pre_enabled(iommu))
296 dev_data->defer_attach = true;
297 }
298
299 return dev_data;
300}
301
/*
 * Find or create an IOMMU group for an acpihid device.
 */
305static struct iommu_group *acpihid_device_group(struct device *dev)
306{
307 struct acpihid_map_entry *p, *entry = NULL;
308 int devid;
309
310 devid = get_acpihid_device_id(dev, entry: &entry);
311 if (devid < 0)
312 return ERR_PTR(error: devid);
313
314 list_for_each_entry(p, &acpihid_map, list) {
315 if ((devid == p->devid) && p->group)
316 entry->group = p->group;
317 }
318
319 if (!entry->group)
320 entry->group = generic_device_group(dev);
321 else
322 iommu_group_ref_get(group: entry->group);
323
324 return entry->group;
325}
326
327static inline bool pdev_pasid_supported(struct iommu_dev_data *dev_data)
328{
329 return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP);
330}
331
332static u32 pdev_get_caps(struct pci_dev *pdev)
333{
334 int features;
335 u32 flags = 0;
336
337 if (pci_ats_supported(dev: pdev))
338 flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
339
340 if (pci_pri_supported(pdev))
341 flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
342
343 features = pci_pasid_features(pdev);
344 if (features >= 0) {
345 flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
346
347 if (features & PCI_PASID_CAP_EXEC)
348 flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
349
350 if (features & PCI_PASID_CAP_PRIV)
351 flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
352 }
353
354 return flags;
355}
356
357static inline int pdev_enable_cap_ats(struct pci_dev *pdev)
358{
359 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev: &pdev->dev);
360 int ret = -EINVAL;
361
362 if (dev_data->ats_enabled)
363 return 0;
364
365 if (amd_iommu_iotlb_sup &&
366 (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP)) {
367 ret = pci_enable_ats(dev: pdev, PAGE_SHIFT);
368 if (!ret) {
369 dev_data->ats_enabled = 1;
370 dev_data->ats_qdep = pci_ats_queue_depth(dev: pdev);
371 }
372 }
373
374 return ret;
375}
376
377static inline void pdev_disable_cap_ats(struct pci_dev *pdev)
378{
379 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev: &pdev->dev);
380
381 if (dev_data->ats_enabled) {
382 pci_disable_ats(dev: pdev);
383 dev_data->ats_enabled = 0;
384 }
385}
386
387int amd_iommu_pdev_enable_cap_pri(struct pci_dev *pdev)
388{
389 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev: &pdev->dev);
390 int ret = -EINVAL;
391
392 if (dev_data->pri_enabled)
393 return 0;
394
395 if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) {
396 /*
397 * First reset the PRI state of the device.
398 * FIXME: Hardcode number of outstanding requests for now
399 */
400 if (!pci_reset_pri(pdev) && !pci_enable_pri(pdev, reqs: 32)) {
401 dev_data->pri_enabled = 1;
402 dev_data->pri_tlp = pci_prg_resp_pasid_required(pdev);
403
404 ret = 0;
405 }
406 }
407
408 return ret;
409}
410
411void amd_iommu_pdev_disable_cap_pri(struct pci_dev *pdev)
412{
413 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev: &pdev->dev);
414
415 if (dev_data->pri_enabled) {
416 pci_disable_pri(pdev);
417 dev_data->pri_enabled = 0;
418 }
419}
420
421static inline int pdev_enable_cap_pasid(struct pci_dev *pdev)
422{
423 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev: &pdev->dev);
424 int ret = -EINVAL;
425
426 if (dev_data->pasid_enabled)
427 return 0;
428
429 if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) {
430 /* Only allow access to user-accessible pages */
431 ret = pci_enable_pasid(pdev, features: 0);
432 if (!ret)
433 dev_data->pasid_enabled = 1;
434 }
435
436 return ret;
437}
438
439static inline void pdev_disable_cap_pasid(struct pci_dev *pdev)
440{
441 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev: &pdev->dev);
442
443 if (dev_data->pasid_enabled) {
444 pci_disable_pasid(pdev);
445 dev_data->pasid_enabled = 0;
446 }
447}
448
449static void pdev_enable_caps(struct pci_dev *pdev)
450{
451 pdev_enable_cap_ats(pdev);
452 pdev_enable_cap_pasid(pdev);
453 amd_iommu_pdev_enable_cap_pri(pdev);
454
455}
456
457static void pdev_disable_caps(struct pci_dev *pdev)
458{
459 pdev_disable_cap_ats(pdev);
460 pdev_disable_cap_pasid(pdev);
461 amd_iommu_pdev_disable_cap_pri(pdev);
462}
463
464/*
465 * This function checks if the driver got a valid device from the caller to
466 * avoid dereferencing invalid pointers.
467 */
468static bool check_device(struct device *dev)
469{
470 struct amd_iommu_pci_seg *pci_seg;
471 struct amd_iommu *iommu;
472 int devid, sbdf;
473
474 if (!dev)
475 return false;
476
477 sbdf = get_device_sbdf_id(dev);
478 if (sbdf < 0)
479 return false;
480 devid = PCI_SBDF_TO_DEVID(sbdf);
481
482 iommu = rlookup_amd_iommu(dev);
483 if (!iommu)
484 return false;
485
486 /* Out of our scope? */
487 pci_seg = iommu->pci_seg;
488 if (devid > pci_seg->last_bdf)
489 return false;
490
491 return true;
492}
493
494static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
495{
496 struct iommu_dev_data *dev_data;
497 int devid, sbdf;
498
499 if (dev_iommu_priv_get(dev))
500 return 0;
501
502 sbdf = get_device_sbdf_id(dev);
503 if (sbdf < 0)
504 return sbdf;
505
506 devid = PCI_SBDF_TO_DEVID(sbdf);
507 dev_data = find_dev_data(iommu, devid);
508 if (!dev_data)
509 return -ENOMEM;
510
511 dev_data->dev = dev;
512 setup_aliases(iommu, dev);
513
	/*
	 * By default we use passthrough mode for IOMMUv2-capable devices.
	 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to an
	 * invalid address), we ignore the capability for the device so it
	 * is forced into translation mode.
	 */
	if ((iommu_default_passthrough() || !amd_iommu_force_isolation) &&
	    dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) {
		dev_data->flags = pdev_get_caps(to_pci_dev(dev));
	}

	dev_iommu_priv_set(dev, dev_data);
526
527 return 0;
528}
529
530static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev)
531{
532 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
533 struct dev_table_entry *dev_table = get_dev_table(iommu);
534 int devid, sbdf;
535
536 sbdf = get_device_sbdf_id(dev);
537 if (sbdf < 0)
538 return;
539
540 devid = PCI_SBDF_TO_DEVID(sbdf);
541 pci_seg->rlookup_table[devid] = NULL;
542 memset(&dev_table[devid], 0, sizeof(struct dev_table_entry));
543
544 setup_aliases(iommu, dev);
545}
546
547static void amd_iommu_uninit_device(struct device *dev)
548{
549 struct iommu_dev_data *dev_data;
550
551 dev_data = dev_iommu_priv_get(dev);
552 if (!dev_data)
553 return;
554
555 if (dev_data->domain)
556 detach_device(dev);
557
558 /*
559 * We keep dev_data around for unplugged devices and reuse it when the
560 * device is re-plugged - not doing so would introduce a ton of races.
561 */
562}
563
564/****************************************************************************
565 *
566 * Interrupt handling functions
567 *
568 ****************************************************************************/
569
570static void dump_dte_entry(struct amd_iommu *iommu, u16 devid)
571{
572 int i;
573 struct dev_table_entry *dev_table = get_dev_table(iommu);
574
575 for (i = 0; i < 4; ++i)
576 pr_err("DTE[%d]: %016llx\n", i, dev_table[devid].data[i]);
577}
578
579static void dump_command(unsigned long phys_addr)
580{
581 struct iommu_cmd *cmd = iommu_phys_to_virt(paddr: phys_addr);
582 int i;
583
584 for (i = 0; i < 4; ++i)
585 pr_err("CMD[%d]: %08x\n", i, cmd->data[i]);
586}
587
588static void amd_iommu_report_rmp_hw_error(struct amd_iommu *iommu, volatile u32 *event)
589{
590 struct iommu_dev_data *dev_data = NULL;
591 int devid, vmg_tag, flags;
592 struct pci_dev *pdev;
593 u64 spa;
594
595 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
596 vmg_tag = (event[1]) & 0xFFFF;
597 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
598 spa = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8);
599
600 pdev = pci_get_domain_bus_and_slot(domain: iommu->pci_seg->id, PCI_BUS_NUM(devid),
601 devfn: devid & 0xff);
602 if (pdev)
603 dev_data = dev_iommu_priv_get(dev: &pdev->dev);
604
605 if (dev_data) {
606 if (__ratelimit(&dev_data->rs)) {
607 pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
608 vmg_tag, spa, flags);
609 }
610 } else {
611 pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
612 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
613 vmg_tag, spa, flags);
614 }
615
616 if (pdev)
617 pci_dev_put(dev: pdev);
618}
619
620static void amd_iommu_report_rmp_fault(struct amd_iommu *iommu, volatile u32 *event)
621{
622 struct iommu_dev_data *dev_data = NULL;
623 int devid, flags_rmp, vmg_tag, flags;
624 struct pci_dev *pdev;
625 u64 gpa;
626
627 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
628 flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF;
629 vmg_tag = (event[1]) & 0xFFFF;
630 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
631 gpa = ((u64)event[3] << 32) | event[2];
632
633 pdev = pci_get_domain_bus_and_slot(domain: iommu->pci_seg->id, PCI_BUS_NUM(devid),
634 devfn: devid & 0xff);
635 if (pdev)
636 dev_data = dev_iommu_priv_get(dev: &pdev->dev);
637
638 if (dev_data) {
639 if (__ratelimit(&dev_data->rs)) {
640 pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
641 vmg_tag, gpa, flags_rmp, flags);
642 }
643 } else {
644 pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
645 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
646 vmg_tag, gpa, flags_rmp, flags);
647 }
648
649 if (pdev)
650 pci_dev_put(dev: pdev);
651}
652
653#define IS_IOMMU_MEM_TRANSACTION(flags) \
654 (((flags) & EVENT_FLAG_I) == 0)
655
656#define IS_WRITE_REQUEST(flags) \
657 ((flags) & EVENT_FLAG_RW)
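
/*
 * These flags come straight from the event log entry: EVENT_FLAG_I marks an
 * interrupt-request translation (so a clear bit means an ordinary memory
 * transaction) and EVENT_FLAG_RW marks a write access.
 */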
658
659static void amd_iommu_report_page_fault(struct amd_iommu *iommu,
660 u16 devid, u16 domain_id,
661 u64 address, int flags)
662{
663 struct iommu_dev_data *dev_data = NULL;
664 struct pci_dev *pdev;
665
666 pdev = pci_get_domain_bus_and_slot(domain: iommu->pci_seg->id, PCI_BUS_NUM(devid),
667 devfn: devid & 0xff);
668 if (pdev)
669 dev_data = dev_iommu_priv_get(dev: &pdev->dev);
670
671 if (dev_data) {
672 /*
673 * If this is a DMA fault (for which the I(nterrupt)
674 * bit will be unset), allow report_iommu_fault() to
675 * prevent logging it.
676 */
677 if (IS_IOMMU_MEM_TRANSACTION(flags)) {
678 /* Device not attached to domain properly */
679 if (dev_data->domain == NULL) {
680 pr_err_ratelimited("Event logged [Device not attached to domain properly]\n");
681 pr_err_ratelimited(" device=%04x:%02x:%02x.%x domain=0x%04x\n",
682 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
683 PCI_FUNC(devid), domain_id);
684 goto out;
685 }
686
687 if (!report_iommu_fault(domain: &dev_data->domain->domain,
688 dev: &pdev->dev, iova: address,
689 IS_WRITE_REQUEST(flags) ?
690 IOMMU_FAULT_WRITE :
691 IOMMU_FAULT_READ))
692 goto out;
693 }
694
695 if (__ratelimit(&dev_data->rs)) {
696 pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n",
697 domain_id, address, flags);
698 }
699 } else {
700 pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%04x:%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
701 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
702 domain_id, address, flags);
703 }
704
705out:
706 if (pdev)
707 pci_dev_put(dev: pdev);
708}
709
710static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
711{
712 struct device *dev = iommu->iommu.dev;
713 int type, devid, flags, tag;
714 volatile u32 *event = __evt;
715 int count = 0;
716 u64 address;
717 u32 pasid;
718
719retry:
720 type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
721 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
722 pasid = (event[0] & EVENT_DOMID_MASK_HI) |
723 (event[1] & EVENT_DOMID_MASK_LO);
724 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
725 address = (u64)(((u64)event[3]) << 32) | event[2];
726
727 if (type == 0) {
728 /* Did we hit the erratum? */
729 if (++count == LOOP_TIMEOUT) {
730 pr_err("No event written to event log\n");
731 return;
732 }
733 udelay(1);
734 goto retry;
735 }
736
737 if (type == EVENT_TYPE_IO_FAULT) {
738 amd_iommu_report_page_fault(iommu, devid, domain_id: pasid, address, flags);
739 return;
740 }
741
742 switch (type) {
743 case EVENT_TYPE_ILL_DEV:
744 dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
745 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
746 pasid, address, flags);
747 dump_dte_entry(iommu, devid);
748 break;
749 case EVENT_TYPE_DEV_TAB_ERR:
750 dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x "
751 "address=0x%llx flags=0x%04x]\n",
752 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
753 address, flags);
754 break;
755 case EVENT_TYPE_PAGE_TAB_ERR:
756 dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n",
757 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
758 pasid, address, flags);
759 break;
760 case EVENT_TYPE_ILL_CMD:
761 dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address);
762 dump_command(phys_addr: address);
763 break;
764 case EVENT_TYPE_CMD_HARD_ERR:
765 dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n",
766 address, flags);
767 break;
768 case EVENT_TYPE_IOTLB_INV_TO:
769 dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%04x:%02x:%02x.%x address=0x%llx]\n",
770 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
771 address);
772 break;
773 case EVENT_TYPE_INV_DEV_REQ:
774 dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
775 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
776 pasid, address, flags);
777 break;
778 case EVENT_TYPE_RMP_FAULT:
779 amd_iommu_report_rmp_fault(iommu, event);
780 break;
781 case EVENT_TYPE_RMP_HW_ERR:
782 amd_iommu_report_rmp_hw_error(iommu, event);
783 break;
784 case EVENT_TYPE_INV_PPR_REQ:
785 pasid = PPR_PASID(*((u64 *)__evt));
786 tag = event[1] & 0x03FF;
787 dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n",
788 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
789 pasid, address, flags, tag);
790 break;
791 default:
792 dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n",
793 event[0], event[1], event[2], event[3]);
794 }
795
	/*
	 * To detect hardware erratum 732 we need to clear the entry back
	 * to zero. This issue does not exist on SNP-enabled systems, and
	 * the event buffer is not writable there anyway.
	 */
	if (!amd_iommu_snp_en)
		memset(__evt, 0, 4 * sizeof(u32));
804}
805
static void iommu_poll_events(struct amd_iommu *iommu)
{
	u32 head, tail;

	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);

	while (head != tail) {
		iommu_print_event(iommu, iommu->evt_buf + head);
		head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
	}

	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
}
820
821static void iommu_poll_ppr_log(struct amd_iommu *iommu)
822{
823 u32 head, tail;
824
825 if (iommu->ppr_log == NULL)
826 return;
827
828 head = readl(addr: iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
829 tail = readl(addr: iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
830
831 while (head != tail) {
832 volatile u64 *raw;
833 u64 entry[2];
834 int i;
835
836 raw = (u64 *)(iommu->ppr_log + head);
837
838 /*
839 * Hardware bug: Interrupt may arrive before the entry is
840 * written to memory. If this happens we need to wait for the
841 * entry to arrive.
842 */
843 for (i = 0; i < LOOP_TIMEOUT; ++i) {
844 if (PPR_REQ_TYPE(raw[0]) != 0)
845 break;
846 udelay(1);
847 }
848
849 /* Avoid memcpy function-call overhead */
850 entry[0] = raw[0];
851 entry[1] = raw[1];
852
		/*
		 * To detect hardware erratum 733 we need to clear the entry
		 * back to zero. This issue does not exist on SNP-enabled
		 * systems, and the PPR log is not writable there anyway.
		 */
		if (!amd_iommu_snp_en)
			raw[0] = raw[1] = 0UL;
861
862 /* Update head pointer of hardware ring-buffer */
863 head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
864 writel(val: head, addr: iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
865
866 /* TODO: PPR Handler will be added when we add IOPF support */
867
868 /* Refresh ring-buffer information */
869 head = readl(addr: iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
870 tail = readl(addr: iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
871 }
872}
873
874#ifdef CONFIG_IRQ_REMAP
875static int (*iommu_ga_log_notifier)(u32);
876
877int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
878{
879 iommu_ga_log_notifier = notifier;
880
881 return 0;
882}
883EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
884
885static void iommu_poll_ga_log(struct amd_iommu *iommu)
886{
887 u32 head, tail;
888
889 if (iommu->ga_log == NULL)
890 return;
891
892 head = readl(addr: iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
893 tail = readl(addr: iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
894
895 while (head != tail) {
896 volatile u64 *raw;
897 u64 log_entry;
898
899 raw = (u64 *)(iommu->ga_log + head);
900
901 /* Avoid memcpy function-call overhead */
902 log_entry = *raw;
903
904 /* Update head pointer of hardware ring-buffer */
905 head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
906 writel(val: head, addr: iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
907
908 /* Handle GA entry */
909 switch (GA_REQ_TYPE(log_entry)) {
910 case GA_GUEST_NR:
911 if (!iommu_ga_log_notifier)
912 break;
913
914 pr_debug("%s: devid=%#x, ga_tag=%#x\n",
915 __func__, GA_DEVID(log_entry),
916 GA_TAG(log_entry));
917
918 if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
919 pr_err("GA log notifier failed.\n");
920 break;
921 default:
922 break;
923 }
924 }
925}
926
927static void
928amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu)
929{
930 if (!irq_remapping_enabled || !dev_is_pci(dev) ||
931 !pci_dev_has_default_msi_parent_domain(to_pci_dev(dev)))
932 return;
933
934 dev_set_msi_domain(dev, d: iommu->ir_domain);
935}
936
937#else /* CONFIG_IRQ_REMAP */
938static inline void
939amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { }
940#endif /* !CONFIG_IRQ_REMAP */
941
942static void amd_iommu_handle_irq(void *data, const char *evt_type,
943 u32 int_mask, u32 overflow_mask,
944 void (*int_handler)(struct amd_iommu *),
945 void (*overflow_handler)(struct amd_iommu *))
946{
947 struct amd_iommu *iommu = (struct amd_iommu *) data;
948 u32 status = readl(addr: iommu->mmio_base + MMIO_STATUS_OFFSET);
949 u32 mask = int_mask | overflow_mask;
950
951 while (status & mask) {
952 /* Enable interrupt sources again */
953 writel(val: mask, addr: iommu->mmio_base + MMIO_STATUS_OFFSET);
954
955 if (int_handler) {
956 pr_devel("Processing IOMMU (ivhd%d) %s Log\n",
957 iommu->index, evt_type);
958 int_handler(iommu);
959 }
960
961 if ((status & overflow_mask) && overflow_handler)
962 overflow_handler(iommu);
963
		/*
		 * Hardware bug: ERBT1312
		 * When re-enabling the interrupt (by writing 1 to clear the
		 * bit), the hardware might also set the interrupt bit in the
		 * status register again. If that happens the bit stays set
		 * and masks subsequent interrupts.
		 *
		 * Workaround: read back the status register and check whether
		 * the interrupt bits are really cleared. If not, go through
		 * the handler again and re-clear the bits.
		 */
		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
978 }
979}
980
981irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data)
982{
983 amd_iommu_handle_irq(data, evt_type: "Evt", MMIO_STATUS_EVT_INT_MASK,
984 MMIO_STATUS_EVT_OVERFLOW_MASK,
985 int_handler: iommu_poll_events, overflow_handler: amd_iommu_restart_event_logging);
986
987 return IRQ_HANDLED;
988}
989
990irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data)
991{
992 amd_iommu_handle_irq(data, evt_type: "PPR", MMIO_STATUS_PPR_INT_MASK,
993 MMIO_STATUS_PPR_OVERFLOW_MASK,
994 int_handler: iommu_poll_ppr_log, overflow_handler: amd_iommu_restart_ppr_log);
995
996 return IRQ_HANDLED;
997}
998
999irqreturn_t amd_iommu_int_thread_galog(int irq, void *data)
1000{
1001#ifdef CONFIG_IRQ_REMAP
1002 amd_iommu_handle_irq(data, evt_type: "GA", MMIO_STATUS_GALOG_INT_MASK,
1003 MMIO_STATUS_GALOG_OVERFLOW_MASK,
1004 int_handler: iommu_poll_ga_log, overflow_handler: amd_iommu_restart_ga_log);
1005#endif
1006
1007 return IRQ_HANDLED;
1008}
1009
1010irqreturn_t amd_iommu_int_thread(int irq, void *data)
1011{
1012 amd_iommu_int_thread_evtlog(irq, data);
1013 amd_iommu_int_thread_pprlog(irq, data);
1014 amd_iommu_int_thread_galog(irq, data);
1015
1016 return IRQ_HANDLED;
1017}
1018
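/*
 * Hard interrupt handler: it only wakes the IRQ thread, so all of the MMIO
 * log polling above runs in the threaded handlers rather than in
 * hard-interrupt context.
 */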
1019irqreturn_t amd_iommu_int_handler(int irq, void *data)
1020{
1021 return IRQ_WAKE_THREAD;
1022}
1023
1024/****************************************************************************
1025 *
1026 * IOMMU command queuing functions
1027 *
1028 ****************************************************************************/
1029
1030static int wait_on_sem(struct amd_iommu *iommu, u64 data)
1031{
1032 int i = 0;
1033
1034 while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) {
1035 udelay(1);
1036 i += 1;
1037 }
1038
1039 if (i == LOOP_TIMEOUT) {
1040 pr_alert("Completion-Wait loop timed out\n");
1041 return -EIO;
1042 }
1043
1044 return 0;
1045}
1046
1047static void copy_cmd_to_buffer(struct amd_iommu *iommu,
1048 struct iommu_cmd *cmd)
1049{
1050 u8 *target;
1051 u32 tail;
1052
1053 /* Copy command to buffer */
1054 tail = iommu->cmd_buf_tail;
1055 target = iommu->cmd_buf + tail;
1056 memcpy(target, cmd, sizeof(*cmd));
1057
1058 tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1059 iommu->cmd_buf_tail = tail;
1060
1061 /* Tell the IOMMU about it */
1062 writel(val: tail, addr: iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
1063}
1064
static void build_completion_wait(struct iommu_cmd *cmd,
				  struct amd_iommu *iommu,
				  u64 data)
{
	u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem);

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
	cmd->data[1] = upper_32_bits(paddr);
	cmd->data[2] = lower_32_bits(data);
	cmd->data[3] = upper_32_bits(data);
	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
}
1078
1079static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
1080{
1081 memset(cmd, 0, sizeof(*cmd));
1082 cmd->data[0] = devid;
1083 CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
1084}
1085
/*
 * Builds an invalidation address which is suitable for one page or multiple
 * pages. Sets the size bit (S) if more than one page is flushed.
 */
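/*
 * Worked example (illustrative): for address=0x10000 and size=0x3000 we get
 * end=0x12fff and msb_diff=13, so the helper returns 0x11000 with the size
 * bit set. In the S-bit encoding of the AMD IOMMU spec the lowest clear
 * address bit above bit 11 gives the range, so a clear bit 13 means "16KB
 * aligned at 0x10000", which covers the requested 12KB.
 */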
1090static inline u64 build_inv_address(u64 address, size_t size)
1091{
1092 u64 pages, end, msb_diff;
1093
1094 pages = iommu_num_pages(addr: address, len: size, PAGE_SIZE);
1095
1096 if (pages == 1)
1097 return address & PAGE_MASK;
1098
1099 end = address + size - 1;
1100
1101 /*
1102 * msb_diff would hold the index of the most significant bit that
1103 * flipped between the start and end.
1104 */
1105 msb_diff = fls64(x: end ^ address) - 1;
1106
1107 /*
1108 * Bits 63:52 are sign extended. If for some reason bit 51 is different
1109 * between the start and the end, invalidate everything.
1110 */
1111 if (unlikely(msb_diff > 51)) {
1112 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
1113 } else {
1114 /*
1115 * The msb-bit must be clear on the address. Just set all the
1116 * lower bits.
1117 */
1118 address |= (1ull << msb_diff) - 1;
1119 }
1120
1121 /* Clear bits 11:0 */
1122 address &= PAGE_MASK;
1123
1124 /* Set the size bit - we flush more than one 4kb page */
1125 return address | CMD_INV_IOMMU_PAGES_SIZE_MASK;
1126}
1127
1128static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
1129 size_t size, u16 domid,
1130 ioasid_t pasid, bool gn)
1131{
1132 u64 inv_address = build_inv_address(address, size);
1133
1134 memset(cmd, 0, sizeof(*cmd));
1135
1136 cmd->data[1] |= domid;
1137 cmd->data[2] = lower_32_bits(inv_address);
1138 cmd->data[3] = upper_32_bits(inv_address);
1139 /* PDE bit - we want to flush everything, not only the PTEs */
1140 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
1141 if (gn) {
1142 cmd->data[0] |= pasid;
1143 cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1144 }
1145 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
1146}
1147
1148static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
1149 u64 address, size_t size,
1150 ioasid_t pasid, bool gn)
1151{
1152 u64 inv_address = build_inv_address(address, size);
1153
1154 memset(cmd, 0, sizeof(*cmd));
1155
1156 cmd->data[0] = devid;
1157 cmd->data[0] |= (qdep & 0xff) << 24;
1158 cmd->data[1] = devid;
1159 cmd->data[2] = lower_32_bits(inv_address);
1160 cmd->data[3] = upper_32_bits(inv_address);
1161 if (gn) {
1162 cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
1163 cmd->data[1] |= (pasid & 0xff) << 16;
1164 cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1165 }
1166
1167 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
1168}
1169
1170static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid,
1171 int status, int tag, u8 gn)
1172{
1173 memset(cmd, 0, sizeof(*cmd));
1174
1175 cmd->data[0] = devid;
1176 if (gn) {
1177 cmd->data[1] = pasid;
1178 cmd->data[2] = CMD_INV_IOMMU_PAGES_GN_MASK;
1179 }
1180 cmd->data[3] = tag & 0x1ff;
1181 cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
1182
1183 CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
1184}
1185
1186static void build_inv_all(struct iommu_cmd *cmd)
1187{
1188 memset(cmd, 0, sizeof(*cmd));
1189 CMD_SET_TYPE(cmd, CMD_INV_ALL);
1190}
1191
1192static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
1193{
1194 memset(cmd, 0, sizeof(*cmd));
1195 cmd->data[0] = devid;
1196 CMD_SET_TYPE(cmd, CMD_INV_IRT);
1197}
1198
/*
 * Writes the command to the IOMMU's command buffer and informs the
 * hardware about the new command.
 */
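/*
 * Note on the ring handling below: the driver keeps a software tail
 * (iommu->cmd_buf_tail) and reads the head back from MMIO. A reserve of
 * 0x20 bytes (two 16-byte command slots) is kept free so the tail never
 * quite catches up with the head.
 */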
1203static int __iommu_queue_command_sync(struct amd_iommu *iommu,
1204 struct iommu_cmd *cmd,
1205 bool sync)
1206{
1207 unsigned int count = 0;
1208 u32 left, next_tail;
1209
1210 next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1211again:
1212 left = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
1213
1214 if (left <= 0x20) {
1215 /* Skip udelay() the first time around */
1216 if (count++) {
1217 if (count == LOOP_TIMEOUT) {
1218 pr_err("Command buffer timeout\n");
1219 return -EIO;
1220 }
1221
1222 udelay(1);
1223 }
1224
1225 /* Update head and recheck remaining space */
1226 iommu->cmd_buf_head = readl(addr: iommu->mmio_base +
1227 MMIO_CMD_HEAD_OFFSET);
1228
1229 goto again;
1230 }
1231
1232 copy_cmd_to_buffer(iommu, cmd);
1233
1234 /* Do we need to make sure all commands are processed? */
1235 iommu->need_sync = sync;
1236
1237 return 0;
1238}
1239
1240static int iommu_queue_command_sync(struct amd_iommu *iommu,
1241 struct iommu_cmd *cmd,
1242 bool sync)
1243{
1244 unsigned long flags;
1245 int ret;
1246
1247 raw_spin_lock_irqsave(&iommu->lock, flags);
1248 ret = __iommu_queue_command_sync(iommu, cmd, sync);
1249 raw_spin_unlock_irqrestore(&iommu->lock, flags);
1250
1251 return ret;
1252}
1253
1254static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
1255{
1256 return iommu_queue_command_sync(iommu, cmd, sync: true);
1257}
1258
1259/*
1260 * This function queues a completion wait command into the command
1261 * buffer of an IOMMU
1262 */
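/*
 * How the wait works (see build_completion_wait()/wait_on_sem() above): the
 * COMPLETION_WAIT command carries a caller-chosen 64-bit value and the
 * physical address of iommu->cmd_sem. Once the IOMMU has processed all
 * earlier commands it stores that value to cmd_sem, and wait_on_sem() polls
 * for it. iommu->cmd_sem_val supplies a monotonically increasing value so
 * every wait is matched with its own store.
 */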
1263static int iommu_completion_wait(struct amd_iommu *iommu)
1264{
1265 struct iommu_cmd cmd;
1266 unsigned long flags;
1267 int ret;
1268 u64 data;
1269
1270 if (!iommu->need_sync)
1271 return 0;
1272
1273 data = atomic64_add_return(i: 1, v: &iommu->cmd_sem_val);
1274 build_completion_wait(cmd: &cmd, iommu, data);
1275
1276 raw_spin_lock_irqsave(&iommu->lock, flags);
1277
1278 ret = __iommu_queue_command_sync(iommu, cmd: &cmd, sync: false);
1279 if (ret)
1280 goto out_unlock;
1281
1282 ret = wait_on_sem(iommu, data);
1283
1284out_unlock:
1285 raw_spin_unlock_irqrestore(&iommu->lock, flags);
1286
1287 return ret;
1288}
1289
static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
{
	struct iommu_cmd cmd;

	build_inv_dte(&cmd, devid);

	return iommu_queue_command(iommu, &cmd);
}
1298
1299static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
1300{
1301 u32 devid;
1302 u16 last_bdf = iommu->pci_seg->last_bdf;
1303
1304 for (devid = 0; devid <= last_bdf; ++devid)
1305 iommu_flush_dte(iommu, devid);
1306
1307 iommu_completion_wait(iommu);
1308}
1309
1310/*
1311 * This function uses heavy locking and may disable irqs for some time. But
1312 * this is no issue because it is only called during resume.
1313 */
1314static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu)
1315{
1316 u32 dom_id;
1317 u16 last_bdf = iommu->pci_seg->last_bdf;
1318
1319 for (dom_id = 0; dom_id <= last_bdf; ++dom_id) {
1320 struct iommu_cmd cmd;
1321 build_inv_iommu_pages(cmd: &cmd, address: 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1322 domid: dom_id, IOMMU_NO_PASID, gn: false);
1323 iommu_queue_command(iommu, cmd: &cmd);
1324 }
1325
1326 iommu_completion_wait(iommu);
1327}
1328
1329static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
1330{
1331 struct iommu_cmd cmd;
1332
1333 build_inv_iommu_pages(cmd: &cmd, address: 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1334 domid: dom_id, IOMMU_NO_PASID, gn: false);
1335 iommu_queue_command(iommu, cmd: &cmd);
1336
1337 iommu_completion_wait(iommu);
1338}
1339
1340static void amd_iommu_flush_all(struct amd_iommu *iommu)
1341{
1342 struct iommu_cmd cmd;
1343
1344 build_inv_all(cmd: &cmd);
1345
1346 iommu_queue_command(iommu, cmd: &cmd);
1347 iommu_completion_wait(iommu);
1348}
1349
1350static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
1351{
1352 struct iommu_cmd cmd;
1353
1354 build_inv_irt(cmd: &cmd, devid);
1355
1356 iommu_queue_command(iommu, cmd: &cmd);
1357}
1358
1359static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
1360{
1361 u32 devid;
1362 u16 last_bdf = iommu->pci_seg->last_bdf;
1363
1364 if (iommu->irtcachedis_enabled)
1365 return;
1366
1367 for (devid = 0; devid <= last_bdf; devid++)
1368 iommu_flush_irt(iommu, devid);
1369
1370 iommu_completion_wait(iommu);
1371}
1372
1373void amd_iommu_flush_all_caches(struct amd_iommu *iommu)
1374{
1375 if (check_feature(FEATURE_IA)) {
1376 amd_iommu_flush_all(iommu);
1377 } else {
1378 amd_iommu_flush_dte_all(iommu);
1379 amd_iommu_flush_irt_all(iommu);
1380 amd_iommu_flush_tlb_all(iommu);
1381 }
1382}
1383
1384/*
1385 * Command send function for flushing on-device TLB
1386 */
1387static int device_flush_iotlb(struct iommu_dev_data *dev_data, u64 address,
1388 size_t size, ioasid_t pasid, bool gn)
1389{
1390 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1391 struct iommu_cmd cmd;
1392 int qdep = dev_data->ats_qdep;
1393
1394 build_inv_iotlb_pages(cmd: &cmd, devid: dev_data->devid, qdep, address,
1395 size, pasid, gn);
1396
1397 return iommu_queue_command(iommu, cmd: &cmd);
1398}
1399
1400static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data)
1401{
1402 struct amd_iommu *iommu = data;
1403
1404 return iommu_flush_dte(iommu, devid: alias);
1405}
1406
1407/*
1408 * Command send function for invalidating a device table entry
1409 */
1410static int device_flush_dte(struct iommu_dev_data *dev_data)
1411{
1412 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1413 struct pci_dev *pdev = NULL;
1414 struct amd_iommu_pci_seg *pci_seg;
1415 u16 alias;
1416 int ret;
1417
1418 if (dev_is_pci(dev_data->dev))
1419 pdev = to_pci_dev(dev_data->dev);
1420
1421 if (pdev)
1422 ret = pci_for_each_dma_alias(pdev,
1423 fn: device_flush_dte_alias, data: iommu);
1424 else
1425 ret = iommu_flush_dte(iommu, devid: dev_data->devid);
1426 if (ret)
1427 return ret;
1428
1429 pci_seg = iommu->pci_seg;
1430 alias = pci_seg->alias_table[dev_data->devid];
1431 if (alias != dev_data->devid) {
1432 ret = iommu_flush_dte(iommu, devid: alias);
1433 if (ret)
1434 return ret;
1435 }
1436
1437 if (dev_data->ats_enabled) {
1438 /* Invalidate the entire contents of an IOTLB */
1439 ret = device_flush_iotlb(dev_data, address: 0, size: ~0UL,
1440 IOMMU_NO_PASID, gn: false);
1441 }
1442
1443 return ret;
1444}
1445
1446static int domain_flush_pages_v2(struct protection_domain *pdom,
1447 u64 address, size_t size)
1448{
1449 struct iommu_dev_data *dev_data;
1450 struct iommu_cmd cmd;
1451 int ret = 0;
1452
1453 list_for_each_entry(dev_data, &pdom->dev_list, list) {
1454 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev: dev_data->dev);
1455 u16 domid = dev_data->gcr3_info.domid;
1456
1457 build_inv_iommu_pages(cmd: &cmd, address, size,
1458 domid, IOMMU_NO_PASID, gn: true);
1459
1460 ret |= iommu_queue_command(iommu, cmd: &cmd);
1461 }
1462
1463 return ret;
1464}
1465
1466static int domain_flush_pages_v1(struct protection_domain *pdom,
1467 u64 address, size_t size)
1468{
1469 struct iommu_cmd cmd;
1470 int ret = 0, i;
1471
1472 build_inv_iommu_pages(cmd: &cmd, address, size,
1473 domid: pdom->id, IOMMU_NO_PASID, gn: false);
1474
1475 for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
1476 if (!pdom->dev_iommu[i])
1477 continue;
1478
		/*
		 * Devices of this domain are behind this IOMMU;
		 * we need a TLB flush here.
		 */
		ret |= iommu_queue_command(amd_iommus[i], &cmd);
1484 }
1485
1486 return ret;
1487}
1488
1489/*
1490 * TLB invalidation function which is called from the mapping functions.
1491 * It flushes range of PTEs of the domain.
1492 */
1493static void __domain_flush_pages(struct protection_domain *domain,
1494 u64 address, size_t size)
1495{
1496 struct iommu_dev_data *dev_data;
1497 int ret = 0;
1498 ioasid_t pasid = IOMMU_NO_PASID;
1499 bool gn = false;
1500
1501 if (pdom_is_v2_pgtbl_mode(pdom: domain)) {
1502 gn = true;
1503 ret = domain_flush_pages_v2(pdom: domain, address, size);
1504 } else {
1505 ret = domain_flush_pages_v1(pdom: domain, address, size);
1506 }
1507
1508 list_for_each_entry(dev_data, &domain->dev_list, list) {
1509
1510 if (!dev_data->ats_enabled)
1511 continue;
1512
1513 ret |= device_flush_iotlb(dev_data, address, size, pasid, gn);
1514 }
1515
1516 WARN_ON(ret);
1517}
1518
1519void amd_iommu_domain_flush_pages(struct protection_domain *domain,
1520 u64 address, size_t size)
1521{
1522 if (likely(!amd_iommu_np_cache)) {
1523 __domain_flush_pages(domain, address, size);
1524
1525 /* Wait until IOMMU TLB and all device IOTLB flushes are complete */
1526 amd_iommu_domain_flush_complete(domain);
1527
1528 return;
1529 }
1530
	/*
	 * When NpCache is on, we infer that we run in a VM and use a vIOMMU.
	 * In such setups it is best to avoid flushes of ranges which are not
	 * naturally aligned, since they would lead to flushes of unmodified
	 * PTEs and require the hypervisor to do more work than necessary.
	 * Therefore, perform repeated flushes of naturally aligned ranges
	 * until the whole range is covered. Each iteration flushes the
	 * smaller of the natural alignment of the current address and the
	 * largest naturally aligned region that fits in the remaining range.
	 */
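	/*
	 * Illustrative walk-through: address=0x1000, size=0x7000 is flushed
	 * as three naturally aligned chunks: 4KB at 0x1000, 8KB at 0x2000,
	 * then 16KB at 0x4000 (the min of __ffs(address) and __fls(size)
	 * picks the chunk size each round).
	 */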
1541 while (size != 0) {
1542 int addr_alignment = __ffs(address);
1543 int size_alignment = __fls(word: size);
1544 int min_alignment;
1545 size_t flush_size;
1546
1547 /*
1548 * size is always non-zero, but address might be zero, causing
1549 * addr_alignment to be negative. As the casting of the
1550 * argument in __ffs(address) to long might trim the high bits
1551 * of the address on x86-32, cast to long when doing the check.
1552 */
1553 if (likely((unsigned long)address != 0))
1554 min_alignment = min(addr_alignment, size_alignment);
1555 else
1556 min_alignment = size_alignment;
1557
1558 flush_size = 1ul << min_alignment;
1559
1560 __domain_flush_pages(domain, address, size: flush_size);
1561 address += flush_size;
1562 size -= flush_size;
1563 }
1564
1565 /* Wait until IOMMU TLB and all device IOTLB flushes are complete */
1566 amd_iommu_domain_flush_complete(domain);
1567}
1568
1569/* Flush the whole IO/TLB for a given protection domain - including PDE */
1570static void amd_iommu_domain_flush_all(struct protection_domain *domain)
1571{
1572 amd_iommu_domain_flush_pages(domain, address: 0,
1573 CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
1574}
1575
1576void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
1577 ioasid_t pasid, u64 address, size_t size)
1578{
1579 struct iommu_cmd cmd;
1580 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev: dev_data->dev);
1581
1582 build_inv_iommu_pages(cmd: &cmd, address, size,
1583 domid: dev_data->gcr3_info.domid, pasid, gn: true);
1584 iommu_queue_command(iommu, cmd: &cmd);
1585
1586 if (dev_data->ats_enabled)
1587 device_flush_iotlb(dev_data, address, size, pasid, gn: true);
1588
1589 iommu_completion_wait(iommu);
1590}
1591
void amd_iommu_dev_flush_pasid_all(struct iommu_dev_data *dev_data,
				   ioasid_t pasid)
{
	/* Flush every page mapped for this PASID */
	amd_iommu_dev_flush_pasid_pages(dev_data, pasid, 0,
					CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
}
1598
1599void amd_iommu_domain_flush_complete(struct protection_domain *domain)
1600{
1601 int i;
1602
1603 for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
1604 if (domain && !domain->dev_iommu[i])
1605 continue;
1606
		/*
		 * Devices of this domain are behind this IOMMU;
		 * we need to wait for completion of all commands.
		 */
		iommu_completion_wait(amd_iommus[i]);
1612 }
1613}
1614
1615/* Flush the not present cache if it exists */
1616static void domain_flush_np_cache(struct protection_domain *domain,
1617 dma_addr_t iova, size_t size)
1618{
1619 if (unlikely(amd_iommu_np_cache)) {
1620 unsigned long flags;
1621
1622 spin_lock_irqsave(&domain->lock, flags);
1623 amd_iommu_domain_flush_pages(domain, address: iova, size);
1624 spin_unlock_irqrestore(lock: &domain->lock, flags);
1625 }
1626}
1627
1628
/*
 * This function flushes the DTEs for all devices in the domain.
 */
1632static void domain_flush_devices(struct protection_domain *domain)
1633{
1634 struct iommu_dev_data *dev_data;
1635
1636 list_for_each_entry(dev_data, &domain->dev_list, list)
1637 device_flush_dte(dev_data);
1638}
1639
1640static void update_device_table(struct protection_domain *domain)
1641{
1642 struct iommu_dev_data *dev_data;
1643
1644 list_for_each_entry(dev_data, &domain->dev_list, list) {
1645 struct amd_iommu *iommu = rlookup_amd_iommu(dev: dev_data->dev);
1646
1647 set_dte_entry(iommu, dev_data);
1648 clone_aliases(iommu, dev: dev_data->dev);
1649 }
1650}
1651
1652void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
1653{
1654 update_device_table(domain);
1655 domain_flush_devices(domain);
1656}
1657
1658void amd_iommu_domain_update(struct protection_domain *domain)
1659{
1660 /* Update device table */
1661 amd_iommu_update_and_flush_device_table(domain);
1662
1663 /* Flush domain TLB(s) and wait for completion */
1664 amd_iommu_domain_flush_all(domain);
1665}
1666
1667int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid,
1668 int status, int tag)
1669{
1670 struct iommu_dev_data *dev_data;
1671 struct amd_iommu *iommu;
1672 struct iommu_cmd cmd;
1673
1674 dev_data = dev_iommu_priv_get(dev: &pdev->dev);
1675 iommu = get_amd_iommu_from_dev(dev: &pdev->dev);
1676
1677 build_complete_ppr(cmd: &cmd, devid: dev_data->devid, pasid, status,
1678 tag, gn: dev_data->pri_tlp);
1679
1680 return iommu_queue_command(iommu, cmd: &cmd);
1681}
1682
/****************************************************************************
 *
 * The next functions belong to domain allocation. A domain is allocated
 * for every IOMMU as the default domain. If device isolation is enabled,
 * every device gets its own domain. The most important thing about
 * domains is the page table mapping the DMA address space they contain.
 *
 ****************************************************************************/
1692
1693static u16 domain_id_alloc(void)
1694{
1695 unsigned long flags;
1696 int id;
1697
1698 spin_lock_irqsave(&pd_bitmap_lock, flags);
1699 id = find_first_zero_bit(addr: amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
1700 BUG_ON(id == 0);
1701 if (id > 0 && id < MAX_DOMAIN_ID)
1702 __set_bit(id, amd_iommu_pd_alloc_bitmap);
1703 else
1704 id = 0;
1705 spin_unlock_irqrestore(lock: &pd_bitmap_lock, flags);
1706
1707 return id;
1708}
1709
1710static void domain_id_free(int id)
1711{
1712 unsigned long flags;
1713
1714 spin_lock_irqsave(&pd_bitmap_lock, flags);
1715 if (id > 0 && id < MAX_DOMAIN_ID)
1716 __clear_bit(id, amd_iommu_pd_alloc_bitmap);
1717 spin_unlock_irqrestore(lock: &pd_bitmap_lock, flags);
1718}
1719
1720static void free_gcr3_tbl_level1(u64 *tbl)
1721{
1722 u64 *ptr;
1723 int i;
1724
1725 for (i = 0; i < 512; ++i) {
1726 if (!(tbl[i] & GCR3_VALID))
1727 continue;
1728
1729 ptr = iommu_phys_to_virt(paddr: tbl[i] & PAGE_MASK);
1730
1731 free_page((unsigned long)ptr);
1732 }
1733}
1734
1735static void free_gcr3_tbl_level2(u64 *tbl)
1736{
1737 u64 *ptr;
1738 int i;
1739
1740 for (i = 0; i < 512; ++i) {
1741 if (!(tbl[i] & GCR3_VALID))
1742 continue;
1743
1744 ptr = iommu_phys_to_virt(paddr: tbl[i] & PAGE_MASK);
1745
1746 free_gcr3_tbl_level1(tbl: ptr);
1747 }
1748}
1749
1750static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info)
1751{
1752 if (gcr3_info->glx == 2)
1753 free_gcr3_tbl_level2(tbl: gcr3_info->gcr3_tbl);
1754 else if (gcr3_info->glx == 1)
1755 free_gcr3_tbl_level1(tbl: gcr3_info->gcr3_tbl);
1756 else
1757 WARN_ON_ONCE(gcr3_info->glx != 0);
1758
1759 gcr3_info->glx = 0;
1760
1761 /* Free per device domain ID */
1762 domain_id_free(id: gcr3_info->domid);
1763
1764 free_page((unsigned long)gcr3_info->gcr3_tbl);
1765 gcr3_info->gcr3_tbl = NULL;
1766}
1767
/*
 * Number of GCR3 table levels required. Each level is a 4-Kbyte
 * page and can contain up to 512 entries.
 */
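/*
 * For example (illustrative): 512 PASIDs fit in a single 512-entry table
 * (return value 0), up to 2^18 PASIDs fit in two levels (return value 1),
 * and the full 20-bit PASID space needs three levels (return value 2).
 */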
1772static int get_gcr3_levels(int pasids)
1773{
1774 int levels;
1775
1776 if (pasids == -1)
1777 return amd_iommu_max_glx_val;
1778
1779 levels = get_count_order(count: pasids);
1780
1781 return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels;
1782}
1783
1784static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info,
1785 struct amd_iommu *iommu, int pasids)
1786{
1787 int levels = get_gcr3_levels(pasids);
1788 int nid = iommu ? dev_to_node(dev: &iommu->dev->dev) : NUMA_NO_NODE;
1789
1790 if (levels > amd_iommu_max_glx_val)
1791 return -EINVAL;
1792
1793 if (gcr3_info->gcr3_tbl)
1794 return -EBUSY;
1795
1796 /* Allocate per device domain ID */
1797 gcr3_info->domid = domain_id_alloc();
1798
1799 gcr3_info->gcr3_tbl = alloc_pgtable_page(nid, GFP_ATOMIC);
1800 if (gcr3_info->gcr3_tbl == NULL) {
1801 domain_id_free(id: gcr3_info->domid);
1802 return -ENOMEM;
1803 }
1804
1805 gcr3_info->glx = levels;
1806
1807 return 0;
1808}
1809
1810static u64 *__get_gcr3_pte(struct gcr3_tbl_info *gcr3_info,
1811 ioasid_t pasid, bool alloc)
1812{
1813 int index;
1814 u64 *pte;
1815 u64 *root = gcr3_info->gcr3_tbl;
1816 int level = gcr3_info->glx;
1817
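	/*
	 * Walk the GCR3 table like a page table: each level consumes 9 bits
	 * of the PASID. E.g. (illustrative) with glx=1, PASID 0x12345 uses
	 * index 0x91 (bits 17:9) at the top level and 0x145 (bits 8:0) in
	 * the leaf table.
	 */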
1818 while (true) {
1819
1820 index = (pasid >> (9 * level)) & 0x1ff;
1821 pte = &root[index];
1822
1823 if (level == 0)
1824 break;
1825
1826 if (!(*pte & GCR3_VALID)) {
1827 if (!alloc)
1828 return NULL;
1829
1830 root = (void *)get_zeroed_page(GFP_ATOMIC);
1831 if (root == NULL)
1832 return NULL;
1833
1834 *pte = iommu_virt_to_phys(vaddr: root) | GCR3_VALID;
1835 }
1836
1837 root = iommu_phys_to_virt(paddr: *pte & PAGE_MASK);
1838
1839 level -= 1;
1840 }
1841
1842 return pte;
1843}
1844
1845static int update_gcr3(struct iommu_dev_data *dev_data,
1846 ioasid_t pasid, unsigned long gcr3, bool set)
1847{
1848 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1849 u64 *pte;
1850
1851 pte = __get_gcr3_pte(gcr3_info, pasid, alloc: true);
1852 if (pte == NULL)
1853 return -ENOMEM;
1854
1855 if (set)
1856 *pte = (gcr3 & PAGE_MASK) | GCR3_VALID;
1857 else
1858 *pte = 0;
1859
1860 amd_iommu_dev_flush_pasid_all(dev_data, pasid);
1861 return 0;
1862}
1863
1864int amd_iommu_set_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid,
1865 unsigned long gcr3)
1866{
1867 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1868 int ret;
1869
1870 iommu_group_mutex_assert(dev: dev_data->dev);
1871
1872 ret = update_gcr3(dev_data, pasid, gcr3, set: true);
1873 if (ret)
1874 return ret;
1875
1876 gcr3_info->pasid_cnt++;
1877 return ret;
1878}
1879
1880int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
1881{
1882 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1883 int ret;
1884
1885 iommu_group_mutex_assert(dev: dev_data->dev);
1886
1887 ret = update_gcr3(dev_data, pasid, gcr3: 0, set: false);
1888 if (ret)
1889 return ret;
1890
1891 gcr3_info->pasid_cnt--;
1892 return ret;
1893}
1894
1895static void set_dte_entry(struct amd_iommu *iommu,
1896 struct iommu_dev_data *dev_data)
1897{
1898 u64 pte_root = 0;
1899 u64 flags = 0;
1900 u32 old_domid;
1901 u16 devid = dev_data->devid;
1902 u16 domid;
1903 struct protection_domain *domain = dev_data->domain;
1904 struct dev_table_entry *dev_table = get_dev_table(iommu);
1905 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1906
1907 if (gcr3_info && gcr3_info->gcr3_tbl)
1908 domid = dev_data->gcr3_info.domid;
1909 else
1910 domid = domain->id;
1911
1912 if (domain->iop.mode != PAGE_MODE_NONE)
1913 pte_root = iommu_virt_to_phys(vaddr: domain->iop.root);
1914
1915 pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
1916 << DEV_ENTRY_MODE_SHIFT;
1917
1918 pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
1919
	/*
	 * When SNP is enabled, only set the TV bit when IOMMU
	 * page translation is in use.
	 */
	if (!amd_iommu_snp_en || (domid != 0))
		pte_root |= DTE_FLAG_TV;
1926
1927 flags = dev_table[devid].data[1];
1928
1929 if (dev_data->ats_enabled)
1930 flags |= DTE_FLAG_IOTLB;
1931
1932 if (dev_data->ppr)
1933 pte_root |= 1ULL << DEV_ENTRY_PPR;
1934
1935 if (domain->dirty_tracking)
1936 pte_root |= DTE_FLAG_HAD;
1937
1938 if (gcr3_info && gcr3_info->gcr3_tbl) {
1939 u64 gcr3 = iommu_virt_to_phys(vaddr: gcr3_info->gcr3_tbl);
1940 u64 glx = gcr3_info->glx;
1941 u64 tmp;
1942
1943 pte_root |= DTE_FLAG_GV;
1944 pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
1945
1946 /* First mask out possible old values for GCR3 table */
1947 tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
1948 flags &= ~tmp;
1949
1950 tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
1951 flags &= ~tmp;
1952
1953 /* Encode GCR3 table into DTE */
1954 tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
1955 pte_root |= tmp;
1956
1957 tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
1958 flags |= tmp;
1959
1960 tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
1961 flags |= tmp;
1962
1963 if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
1964 dev_table[devid].data[2] |=
1965 ((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
1966 }
1967
1968 /* GIOV is supported with V2 page table mode only */
1969 if (pdom_is_v2_pgtbl_mode(pdom: domain))
1970 pte_root |= DTE_FLAG_GIOV;
1971 }
1972
1973 flags &= ~DEV_DOMID_MASK;
1974 flags |= domid;
1975
1976 old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
1977 dev_table[devid].data[1] = flags;
1978 dev_table[devid].data[0] = pte_root;
1979
1980 /*
1981 * A kdump kernel might be replacing a domain ID that was copied from
1982 * the previous kernel--if so, it needs to flush the translation cache
1983 * entries for the old domain ID that is being overwritten
1984 */
1985 if (old_domid) {
1986 amd_iommu_flush_tlb_domid(iommu, dom_id: old_domid);
1987 }
1988}
1989
1990static void clear_dte_entry(struct amd_iommu *iommu, u16 devid)
1991{
1992 struct dev_table_entry *dev_table = get_dev_table(iommu);
1993
1994 /* remove entry from the device table seen by the hardware */
1995 dev_table[devid].data[0] = DTE_FLAG_V;
1996
1997 if (!amd_iommu_snp_en)
1998 dev_table[devid].data[0] |= DTE_FLAG_TV;
1999
2000 dev_table[devid].data[1] &= DTE_FLAG_MASK;
2001
2002 amd_iommu_apply_erratum_63(iommu, devid);
2003}
2004
2005static int do_attach(struct iommu_dev_data *dev_data,
2006 struct protection_domain *domain)
2007{
2008 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
2009 int ret = 0;
2010
2011 /* Update data structures */
2012 dev_data->domain = domain;
2013 list_add(new: &dev_data->list, head: &domain->dev_list);
2014
2015 /* Update NUMA Node ID */
2016 if (domain->nid == NUMA_NO_NODE)
2017 domain->nid = dev_to_node(dev: dev_data->dev);
2018
2019 /* Do reference counting */
2020 domain->dev_iommu[iommu->index] += 1;
2021 domain->dev_cnt += 1;
2022
2023 /* Init GCR3 table and update device table */
2024 if (domain->pd_mode == PD_MODE_V2) {
2025 /* By default, setup GCR3 table to support single PASID */
2026 ret = setup_gcr3_table(gcr3_info: &dev_data->gcr3_info, iommu, pasids: 1);
2027 if (ret)
2028 return ret;
2029
2030 ret = update_gcr3(dev_data, pasid: 0,
2031 gcr3: iommu_virt_to_phys(vaddr: domain->iop.pgd), set: true);
2032 if (ret) {
2033 free_gcr3_table(gcr3_info: &dev_data->gcr3_info);
2034 return ret;
2035 }
2036 }
2037
2038 /* Update device table */
2039 set_dte_entry(iommu, dev_data);
2040 clone_aliases(iommu, dev: dev_data->dev);
2041
2042 device_flush_dte(dev_data);
2043
2044 return ret;
2045}
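/*
 * Note on the V2 path above: the GCR3 table is sized for a single PASID
 * by default, and PASID 0 is pointed at the domain's V2 page table root
 * (domain->iop.pgd). Combined with DTE_FLAG_GIOV in set_dte_entry(),
 * this is expected to let ordinary non-PASID DMA be translated through
 * that same guest-style page table.
 */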
2046
2047static void do_detach(struct iommu_dev_data *dev_data)
2048{
2049 struct protection_domain *domain = dev_data->domain;
2050 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
2051
2052 /* Clear GCR3 table */
2053 if (domain->pd_mode == PD_MODE_V2) {
2054 update_gcr3(dev_data, pasid: 0, gcr3: 0, set: false);
2055 free_gcr3_table(gcr3_info: &dev_data->gcr3_info);
2056 }
2057
2058 /* Update data structures */
2059 dev_data->domain = NULL;
2060 list_del(entry: &dev_data->list);
2061 clear_dte_entry(iommu, devid: dev_data->devid);
2062 clone_aliases(iommu, dev: dev_data->dev);
2063
2064 /* Flush the DTE entry */
2065 device_flush_dte(dev_data);
2066
2067 /* Flush IOTLB and wait for the flushes to finish */
2068 amd_iommu_domain_flush_all(domain);
2069
2070 /* decrease reference counters - needs to happen after the flushes */
2071 domain->dev_iommu[iommu->index] -= 1;
2072 domain->dev_cnt -= 1;
2073}
2074
2075/*
2076 * If a device is not yet associated with a domain, this function makes the
2077 * device visible in the domain
2078 */
2079static int attach_device(struct device *dev,
2080 struct protection_domain *domain)
2081{
2082 struct iommu_dev_data *dev_data;
2083 unsigned long flags;
2084 int ret = 0;
2085
2086 spin_lock_irqsave(&domain->lock, flags);
2087
2088 dev_data = dev_iommu_priv_get(dev);
2089
2090 spin_lock(lock: &dev_data->lock);
2091
2092 if (dev_data->domain != NULL) {
2093 ret = -EBUSY;
2094 goto out;
2095 }
2096
2097 if (dev_is_pci(dev))
2098 pdev_enable_caps(to_pci_dev(dev));
2099
2100 ret = do_attach(dev_data, domain);
2101
2102out:
2103 spin_unlock(lock: &dev_data->lock);
2104
2105 spin_unlock_irqrestore(lock: &domain->lock, flags);
2106
2107 return ret;
2108}
2109
2110/*
2111 * Removes a device from a protection domain (domain and per-device locks are taken here)
2112 */
2113static void detach_device(struct device *dev)
2114{
2115 struct protection_domain *domain;
2116 struct iommu_dev_data *dev_data;
2117 unsigned long flags;
2118
2119 dev_data = dev_iommu_priv_get(dev);
2120 domain = dev_data->domain;
2121
2122 spin_lock_irqsave(&domain->lock, flags);
2123
2124 spin_lock(lock: &dev_data->lock);
2125
2126 /*
2127 * First check if the device is still attached. It might already
2128 * be detached from its domain because the generic
2129 * iommu_detach_group code detached it and we try again here in
2130 * our alias handling.
2131 */
2132 if (WARN_ON(!dev_data->domain))
2133 goto out;
2134
2135 do_detach(dev_data);
2136
2137 if (dev_is_pci(dev))
2138 pdev_disable_caps(to_pci_dev(dev));
2139
2140out:
2141 spin_unlock(lock: &dev_data->lock);
2142
2143 spin_unlock_irqrestore(lock: &domain->lock, flags);
2144}
2145
2146static struct iommu_device *amd_iommu_probe_device(struct device *dev)
2147{
2148 struct iommu_device *iommu_dev;
2149 struct amd_iommu *iommu;
2150 int ret;
2151
2152 if (!check_device(dev))
2153 return ERR_PTR(error: -ENODEV);
2154
2155 iommu = rlookup_amd_iommu(dev);
2156 if (!iommu)
2157 return ERR_PTR(error: -ENODEV);
2158
2159 /* Not registered yet? */
2160 if (!iommu->iommu.ops)
2161 return ERR_PTR(error: -ENODEV);
2162
2163 if (dev_iommu_priv_get(dev))
2164 return &iommu->iommu;
2165
2166 ret = iommu_init_device(iommu, dev);
2167 if (ret) {
2168 dev_err(dev, "Failed to initialize - trying to proceed anyway\n");
2169 iommu_dev = ERR_PTR(error: ret);
2170 iommu_ignore_device(iommu, dev);
2171 } else {
2172 amd_iommu_set_pci_msi_domain(dev, iommu);
2173 iommu_dev = &iommu->iommu;
2174 }
2175
2176 iommu_completion_wait(iommu);
2177
2178 return iommu_dev;
2179}
2180
2181static void amd_iommu_probe_finalize(struct device *dev)
2182{
2183 /* Domains are initialized for this device - have a look at what we ended up with */
2184 set_dma_ops(dev, NULL);
2185 iommu_setup_dma_ops(dev, dma_base: 0, U64_MAX);
2186}
2187
2188static void amd_iommu_release_device(struct device *dev)
2189{
2190 struct amd_iommu *iommu;
2191
2192 if (!check_device(dev))
2193 return;
2194
2195 iommu = rlookup_amd_iommu(dev);
2196 if (!iommu)
2197 return;
2198
2199 amd_iommu_uninit_device(dev);
2200 iommu_completion_wait(iommu);
2201}
2202
2203static struct iommu_group *amd_iommu_device_group(struct device *dev)
2204{
2205 if (dev_is_pci(dev))
2206 return pci_device_group(dev);
2207
2208 return acpihid_device_group(dev);
2209}
2210
2211/*****************************************************************************
2212 *
2213 * The following functions belong to the exported interface of AMD IOMMU
2214 *
2215 * This interface allows access to lower level functions of the IOMMU
2216 * like protection domain handling and assignment of devices to domains
2217 * which is not possible with the dma_ops interface.
2218 *
2219 *****************************************************************************/
2220
2221static void cleanup_domain(struct protection_domain *domain)
2222{
2223 struct iommu_dev_data *entry;
2224
2225 lockdep_assert_held(&domain->lock);
2226
2227 if (!domain->dev_cnt)
2228 return;
2229
2230 while (!list_empty(head: &domain->dev_list)) {
2231 entry = list_first_entry(&domain->dev_list,
2232 struct iommu_dev_data, list);
2233 BUG_ON(!entry->domain);
2234 do_detach(dev_data: entry);
2235 }
2236 WARN_ON(domain->dev_cnt != 0);
2237}
2238
2239static void protection_domain_free(struct protection_domain *domain)
2240{
2241 if (!domain)
2242 return;
2243
2244 if (domain->iop.pgtbl_cfg.tlb)
2245 free_io_pgtable_ops(ops: &domain->iop.iop.ops);
2246
2247 if (domain->iop.root)
2248 free_page((unsigned long)domain->iop.root);
2249
2250 if (domain->id)
2251 domain_id_free(id: domain->id);
2252
2253 kfree(objp: domain);
2254}
2255
2256static int protection_domain_init_v1(struct protection_domain *domain, int mode)
2257{
2258 u64 *pt_root = NULL;
2259
2260 BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL);
2261
2262 if (mode != PAGE_MODE_NONE) {
2263 pt_root = (void *)get_zeroed_page(GFP_KERNEL);
2264 if (!pt_root)
2265 return -ENOMEM;
2266 }
2267
2268 domain->pd_mode = PD_MODE_V1;
2269 amd_iommu_domain_set_pgtable(domain, root: pt_root, mode);
2270
2271 return 0;
2272}
2273
2274static int protection_domain_init_v2(struct protection_domain *pdom)
2275{
2276 pdom->pd_mode = PD_MODE_V2;
2277 pdom->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2;
2278
2279 return 0;
2280}
2281
2282static struct protection_domain *protection_domain_alloc(unsigned int type)
2283{
2284 struct io_pgtable_ops *pgtbl_ops;
2285 struct protection_domain *domain;
2286 int pgtable;
2287 int ret;
2288
2289 domain = kzalloc(size: sizeof(*domain), GFP_KERNEL);
2290 if (!domain)
2291 return NULL;
2292
2293 domain->id = domain_id_alloc();
2294 if (!domain->id)
2295 goto out_err;
2296
2297 spin_lock_init(&domain->lock);
2298 INIT_LIST_HEAD(list: &domain->dev_list);
2299 domain->nid = NUMA_NO_NODE;
2300
2301 switch (type) {
2302 /* No need to allocate io pgtable ops in passthrough mode */
2303 case IOMMU_DOMAIN_IDENTITY:
2304 return domain;
2305 case IOMMU_DOMAIN_DMA:
2306 pgtable = amd_iommu_pgtable;
2307 break;
2308 /*
2309 * Force the IOMMU v1 page table when allocating a
2310 * domain for pass-through devices.
2311 */
2312 case IOMMU_DOMAIN_UNMANAGED:
2313 pgtable = AMD_IOMMU_V1;
2314 break;
2315 default:
2316 goto out_err;
2317 }
2318
2319 switch (pgtable) {
2320 case AMD_IOMMU_V1:
2321 ret = protection_domain_init_v1(domain, DEFAULT_PGTABLE_LEVEL);
2322 break;
2323 case AMD_IOMMU_V2:
2324 ret = protection_domain_init_v2(pdom: domain);
2325 break;
2326 default:
2327 ret = -EINVAL;
2328 break;
2329 }
2330
2331 if (ret)
2332 goto out_err;
2333
2334 pgtbl_ops = alloc_io_pgtable_ops(fmt: pgtable, cfg: &domain->iop.pgtbl_cfg, cookie: domain);
2335 if (!pgtbl_ops)
2336 goto out_err;
2337
2338 return domain;
2339out_err:
2340 protection_domain_free(domain);
2341 return NULL;
2342}
2343
2344static inline u64 dma_max_address(void)
2345{
2346 if (amd_iommu_pgtable == AMD_IOMMU_V1)
2347 return ~0ULL;
2348
2349 /* V2 with 4/5 level page table */
2350 return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
2351}
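/*
 * Worked example (assuming PM_LEVEL_SHIFT(lvl) expands to 12 + 9 * lvl,
 * as in the driver headers): a 4-level guest page table
 * (amd_iommu_gpt_level == PAGE_MODE_4_LEVEL) yields an aperture end of
 * (1ULL << 48) - 1, i.e. a 48-bit IOVA space, while a 5-level table
 * extends this to (1ULL << 57) - 1.
 */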
2352
2353static bool amd_iommu_hd_support(struct amd_iommu *iommu)
2354{
2355 return iommu && (iommu->features & FEATURE_HDSUP);
2356}
2357
2358static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
2359 struct device *dev, u32 flags)
2360{
2361 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
2362 struct protection_domain *domain;
2363 struct amd_iommu *iommu = NULL;
2364
2365 if (dev)
2366 iommu = get_amd_iommu_from_dev(dev);
2367
2368 /*
2369 * Since DTE[Mode]=0 is prohibited on SNP-enabled systems,
2370 * default to using IOMMU_DOMAIN_DMA[_FQ].
2371 */
2372 if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
2373 return ERR_PTR(error: -EINVAL);
2374
2375 if (dirty_tracking && !amd_iommu_hd_support(iommu))
2376 return ERR_PTR(error: -EOPNOTSUPP);
2377
2378 domain = protection_domain_alloc(type);
2379 if (!domain)
2380 return ERR_PTR(error: -ENOMEM);
2381
2382 domain->domain.geometry.aperture_start = 0;
2383 domain->domain.geometry.aperture_end = dma_max_address();
2384 domain->domain.geometry.force_aperture = true;
2385
2386 if (iommu) {
2387 domain->domain.type = type;
2388 domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
2389 domain->domain.ops = iommu->iommu.ops->default_domain_ops;
2390
2391 if (dirty_tracking)
2392 domain->domain.dirty_ops = &amd_dirty_ops;
2393 }
2394
2395 return &domain->domain;
2396}
2397
2398static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
2399{
2400 struct iommu_domain *domain;
2401
2402 domain = do_iommu_domain_alloc(type, NULL, flags: 0);
2403 if (IS_ERR(ptr: domain))
2404 return NULL;
2405
2406 return domain;
2407}
2408
2409static struct iommu_domain *
2410amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
2411 struct iommu_domain *parent,
2412 const struct iommu_user_data *user_data)
2414{
2415 unsigned int type = IOMMU_DOMAIN_UNMANAGED;
2416
2417 if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
2418 return ERR_PTR(error: -EOPNOTSUPP);
2419
2420 return do_iommu_domain_alloc(type, dev, flags);
2421}
2422
2423static void amd_iommu_domain_free(struct iommu_domain *dom)
2424{
2425 struct protection_domain *domain;
2426 unsigned long flags;
2427
2428 if (!dom)
2429 return;
2430
2431 domain = to_pdomain(dom);
2432
2433 spin_lock_irqsave(&domain->lock, flags);
2434
2435 cleanup_domain(domain);
2436
2437 spin_unlock_irqrestore(lock: &domain->lock, flags);
2438
2439 protection_domain_free(domain);
2440}
2441
2442static int amd_iommu_attach_device(struct iommu_domain *dom,
2443 struct device *dev)
2444{
2445 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2446 struct protection_domain *domain = to_pdomain(dom);
2447 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2448 int ret;
2449
2450 /*
2451 * Skip attaching the device to the domain if the new domain
2452 * is the same as the device's current domain.
2453 */
2454 if (dev_data->domain == domain)
2455 return 0;
2456
2457 dev_data->defer_attach = false;
2458
2459 /*
2460 * Restrict to devices with compatible IOMMU hardware support
2461 * when enforcement of dirty tracking is enabled.
2462 */
2463 if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
2464 return -EINVAL;
2465
2466 if (dev_data->domain)
2467 detach_device(dev);
2468
2469 ret = attach_device(dev, domain);
2470
2471#ifdef CONFIG_IRQ_REMAP
2472 if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
2473 if (dom->type == IOMMU_DOMAIN_UNMANAGED)
2474 dev_data->use_vapic = 1;
2475 else
2476 dev_data->use_vapic = 0;
2477 }
2478#endif
2479
2480 iommu_completion_wait(iommu);
2481
2482 return ret;
2483}
2484
2485static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
2486 unsigned long iova, size_t size)
2487{
2488 struct protection_domain *domain = to_pdomain(dom);
2489 struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2490
2491 if (ops->map_pages)
2492 domain_flush_np_cache(domain, iova, size);
2493 return 0;
2494}
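/*
 * The sync_map hook above only does real work when amd_iommu_np_cache is
 * set, i.e. when the (typically emulated) IOMMU may cache non-present
 * page table entries; otherwise domain_flush_np_cache() is expected to
 * be a no-op.
 */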
2495
2496static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
2497 phys_addr_t paddr, size_t pgsize, size_t pgcount,
2498 int iommu_prot, gfp_t gfp, size_t *mapped)
2499{
2500 struct protection_domain *domain = to_pdomain(dom);
2501 struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2502 int prot = 0;
2503 int ret = -EINVAL;
2504
2505 if ((domain->pd_mode == PD_MODE_V1) &&
2506 (domain->iop.mode == PAGE_MODE_NONE))
2507 return -EINVAL;
2508
2509 if (iommu_prot & IOMMU_READ)
2510 prot |= IOMMU_PROT_IR;
2511 if (iommu_prot & IOMMU_WRITE)
2512 prot |= IOMMU_PROT_IW;
2513
2514 if (ops->map_pages) {
2515 ret = ops->map_pages(ops, iova, paddr, pgsize,
2516 pgcount, prot, gfp, mapped);
2517 }
2518
2519 return ret;
2520}
2521
2522static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
2523 struct iommu_iotlb_gather *gather,
2524 unsigned long iova, size_t size)
2525{
2526 /*
2527 * AMD's IOMMU can flush as many pages as necessary in a single flush.
2528 * Unless we run in a virtual machine, which can be inferred from
2529 * whether the "non-present cache" is on, it is probably best to prefer
2530 * (potentially) too extensive TLB flushing (i.e., more misses) over
2531 * multiple TLB flushes (i.e., more flushes). For virtual machines the
2532 * hypervisor needs to synchronize the host IOMMU PTEs with those of
2533 * the guest, and the trade-off is different: unnecessary TLB flushes
2534 * should be avoided.
2535 */
2536 if (amd_iommu_np_cache &&
2537 iommu_iotlb_gather_is_disjoint(gather, iova, size))
2538 iommu_iotlb_sync(domain, iotlb_gather: gather);
2539
2540 iommu_iotlb_gather_add_range(gather, iova, size);
2541}
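/*
 * Example of the policy above: on bare metal (amd_iommu_np_cache clear),
 * gathering 0x1000-0x1fff and then 0x100000-0x100fff simply widens the
 * pending flush range to cover both. In a VM, where every flush is
 * trapped by the hypervisor, the second (disjoint) range first forces an
 * iommu_iotlb_sync() of what was gathered so far, keeping the flushed
 * ranges tight.
 */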
2542
2543static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova,
2544 size_t pgsize, size_t pgcount,
2545 struct iommu_iotlb_gather *gather)
2546{
2547 struct protection_domain *domain = to_pdomain(dom);
2548 struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2549 size_t r;
2550
2551 if ((domain->pd_mode == PD_MODE_V1) &&
2552 (domain->iop.mode == PAGE_MODE_NONE))
2553 return 0;
2554
2555 r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0;
2556
2557 if (r)
2558 amd_iommu_iotlb_gather_add_page(domain: dom, gather, iova, size: r);
2559
2560 return r;
2561}
2562
2563static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2564 dma_addr_t iova)
2565{
2566 struct protection_domain *domain = to_pdomain(dom);
2567 struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2568
2569 return ops->iova_to_phys(ops, iova);
2570}
2571
2572static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
2573{
2574 switch (cap) {
2575 case IOMMU_CAP_CACHE_COHERENCY:
2576 return true;
2577 case IOMMU_CAP_NOEXEC:
2578 return false;
2579 case IOMMU_CAP_PRE_BOOT_PROTECTION:
2580 return amdr_ivrs_remap_support;
2581 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
2582 return true;
2583 case IOMMU_CAP_DEFERRED_FLUSH:
2584 return true;
2585 case IOMMU_CAP_DIRTY_TRACKING: {
2586 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2587
2588 return amd_iommu_hd_support(iommu);
2589 }
2590 default:
2591 break;
2592 }
2593
2594 return false;
2595}
2596
2597static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
2598 bool enable)
2599{
2600 struct protection_domain *pdomain = to_pdomain(dom: domain);
2601 struct dev_table_entry *dev_table;
2602 struct iommu_dev_data *dev_data;
2603 bool domain_flush = false;
2604 struct amd_iommu *iommu;
2605 unsigned long flags;
2606 u64 pte_root;
2607
2608 spin_lock_irqsave(&pdomain->lock, flags);
2609 if (!(pdomain->dirty_tracking ^ enable)) {
2610 spin_unlock_irqrestore(lock: &pdomain->lock, flags);
2611 return 0;
2612 }
2613
2614 list_for_each_entry(dev_data, &pdomain->dev_list, list) {
2615 iommu = get_amd_iommu_from_dev_data(dev_data);
2616
2617 dev_table = get_dev_table(iommu);
2618 pte_root = dev_table[dev_data->devid].data[0];
2619
2620 pte_root = (enable ? pte_root | DTE_FLAG_HAD :
2621 pte_root & ~DTE_FLAG_HAD);
2622
2623 /* Flush device DTE */
2624 dev_table[dev_data->devid].data[0] = pte_root;
2625 device_flush_dte(dev_data);
2626 domain_flush = true;
2627 }
2628
2629 /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
2630 if (domain_flush)
2631 amd_iommu_domain_flush_all(domain: pdomain);
2632
2633 pdomain->dirty_tracking = enable;
2634 spin_unlock_irqrestore(lock: &pdomain->lock, flags);
2635
2636 return 0;
2637}
2638
2639static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
2640 unsigned long iova, size_t size,
2641 unsigned long flags,
2642 struct iommu_dirty_bitmap *dirty)
2643{
2644 struct protection_domain *pdomain = to_pdomain(dom: domain);
2645 struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
2646 unsigned long lflags;
2647
2648 if (!ops || !ops->read_and_clear_dirty)
2649 return -EOPNOTSUPP;
2650
2651 spin_lock_irqsave(&pdomain->lock, lflags);
2652 if (!pdomain->dirty_tracking && dirty->bitmap) {
2653 spin_unlock_irqrestore(lock: &pdomain->lock, flags: lflags);
2654 return -EINVAL;
2655 }
2656 spin_unlock_irqrestore(lock: &pdomain->lock, flags: lflags);
2657
2658 return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
2659}
2660
2661static void amd_iommu_get_resv_regions(struct device *dev,
2662 struct list_head *head)
2663{
2664 struct iommu_resv_region *region;
2665 struct unity_map_entry *entry;
2666 struct amd_iommu *iommu;
2667 struct amd_iommu_pci_seg *pci_seg;
2668 int devid, sbdf;
2669
2670 sbdf = get_device_sbdf_id(dev);
2671 if (sbdf < 0)
2672 return;
2673
2674 devid = PCI_SBDF_TO_DEVID(sbdf);
2675 iommu = get_amd_iommu_from_dev(dev);
2676 pci_seg = iommu->pci_seg;
2677
2678 list_for_each_entry(entry, &pci_seg->unity_map, list) {
2679 int type, prot = 0;
2680 size_t length;
2681
2682 if (devid < entry->devid_start || devid > entry->devid_end)
2683 continue;
2684
2685 type = IOMMU_RESV_DIRECT;
2686 length = entry->address_end - entry->address_start;
2687 if (entry->prot & IOMMU_PROT_IR)
2688 prot |= IOMMU_READ;
2689 if (entry->prot & IOMMU_PROT_IW)
2690 prot |= IOMMU_WRITE;
2691 if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE)
2692 /* Exclusion range */
2693 type = IOMMU_RESV_RESERVED;
2694
2695 region = iommu_alloc_resv_region(start: entry->address_start,
2696 length, prot, type,
2697 GFP_KERNEL);
2698 if (!region) {
2699 dev_err(dev, "Out of memory allocating dm-regions\n");
2700 return;
2701 }
2702 list_add_tail(new: &region->list, head);
2703 }
2704
2705 region = iommu_alloc_resv_region(MSI_RANGE_START,
2706 MSI_RANGE_END - MSI_RANGE_START + 1,
2707 prot: 0, type: IOMMU_RESV_MSI, GFP_KERNEL);
2708 if (!region)
2709 return;
2710 list_add_tail(new: &region->list, head);
2711
2712 region = iommu_alloc_resv_region(HT_RANGE_START,
2713 HT_RANGE_END - HT_RANGE_START + 1,
2714 prot: 0, type: IOMMU_RESV_RESERVED, GFP_KERNEL);
2715 if (!region)
2716 return;
2717 list_add_tail(new: &region->list, head);
2718}
2719
2720bool amd_iommu_is_attach_deferred(struct device *dev)
2721{
2722 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2723
2724 return dev_data->defer_attach;
2725}
2726
2727static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
2728{
2729 struct protection_domain *dom = to_pdomain(dom: domain);
2730 unsigned long flags;
2731
2732 spin_lock_irqsave(&dom->lock, flags);
2733 amd_iommu_domain_flush_all(domain: dom);
2734 spin_unlock_irqrestore(lock: &dom->lock, flags);
2735}
2736
2737static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
2738 struct iommu_iotlb_gather *gather)
2739{
2740 struct protection_domain *dom = to_pdomain(dom: domain);
2741 unsigned long flags;
2742
2743 spin_lock_irqsave(&dom->lock, flags);
2744 amd_iommu_domain_flush_pages(domain: dom, address: gather->start,
2745 size: gather->end - gather->start + 1);
2746 spin_unlock_irqrestore(lock: &dom->lock, flags);
2747}
2748
2749static int amd_iommu_def_domain_type(struct device *dev)
2750{
2751 struct iommu_dev_data *dev_data;
2752
2753 dev_data = dev_iommu_priv_get(dev);
2754 if (!dev_data)
2755 return 0;
2756
2757 /*
2758 * Do not identity map IOMMUv2 capable devices when:
2759 * - memory encryption is active, because some of those devices
2760 * (AMD GPUs) don't have the encryption bit in their DMA-mask
2761 * and require remapping.
2762 * - SNP is enabled, because it prohibits DTE[Mode]=0.
2763 */
2764 if (pdev_pasid_supported(dev_data) &&
2765 !cc_platform_has(attr: CC_ATTR_MEM_ENCRYPT) &&
2766 !amd_iommu_snp_en) {
2767 return IOMMU_DOMAIN_IDENTITY;
2768 }
2769
2770 return 0;
2771}
2772
2773static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
2774{
2775 /* IOMMU_PTE_FC is always set */
2776 return true;
2777}
2778
2779static const struct iommu_dirty_ops amd_dirty_ops = {
2780 .set_dirty_tracking = amd_iommu_set_dirty_tracking,
2781 .read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
2782};
2783
2784const struct iommu_ops amd_iommu_ops = {
2785 .capable = amd_iommu_capable,
2786 .domain_alloc = amd_iommu_domain_alloc,
2787 .domain_alloc_user = amd_iommu_domain_alloc_user,
2788 .probe_device = amd_iommu_probe_device,
2789 .release_device = amd_iommu_release_device,
2790 .probe_finalize = amd_iommu_probe_finalize,
2791 .device_group = amd_iommu_device_group,
2792 .get_resv_regions = amd_iommu_get_resv_regions,
2793 .is_attach_deferred = amd_iommu_is_attach_deferred,
2794 .pgsize_bitmap = AMD_IOMMU_PGSIZES,
2795 .def_domain_type = amd_iommu_def_domain_type,
2796 .default_domain_ops = &(const struct iommu_domain_ops) {
2797 .attach_dev = amd_iommu_attach_device,
2798 .map_pages = amd_iommu_map_pages,
2799 .unmap_pages = amd_iommu_unmap_pages,
2800 .iotlb_sync_map = amd_iommu_iotlb_sync_map,
2801 .iova_to_phys = amd_iommu_iova_to_phys,
2802 .flush_iotlb_all = amd_iommu_flush_iotlb_all,
2803 .iotlb_sync = amd_iommu_iotlb_sync,
2804 .free = amd_iommu_domain_free,
2805 .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
2806 }
2807};
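/*
 * Rough sketch of how the IOMMU core is expected to use the ops above
 * (not a contract defined in this file): probe_device() and
 * device_group() run at device discovery, def_domain_type() picks the
 * default domain flavour, and once attach_dev() has installed the DTE
 * the default_domain_ops handle the per-domain map_pages()/unmap_pages()
 * and flush work.
 */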
2808
2809#ifdef CONFIG_IRQ_REMAP
2810
2811/*****************************************************************************
2812 *
2813 * Interrupt Remapping Implementation
2814 *
2815 *****************************************************************************/
2816
2817static struct irq_chip amd_ir_chip;
2818static DEFINE_SPINLOCK(iommu_table_lock);
2819
2820static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid)
2821{
2822 int ret;
2823 u64 data;
2824 unsigned long flags;
2825 struct iommu_cmd cmd, cmd2;
2826
2827 if (iommu->irtcachedis_enabled)
2828 return;
2829
2830 build_inv_irt(cmd: &cmd, devid);
2831 data = atomic64_add_return(i: 1, v: &iommu->cmd_sem_val);
2832 build_completion_wait(cmd: &cmd2, iommu, data);
2833
2834 raw_spin_lock_irqsave(&iommu->lock, flags);
2835 ret = __iommu_queue_command_sync(iommu, cmd: &cmd, sync: true);
2836 if (ret)
2837 goto out;
2838 ret = __iommu_queue_command_sync(iommu, cmd: &cmd2, sync: false);
2839 if (ret)
2840 goto out;
2841 wait_on_sem(iommu, data);
2842out:
2843 raw_spin_unlock_irqrestore(&iommu->lock, flags);
2844}
2845
2846static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid,
2847 struct irq_remap_table *table)
2848{
2849 u64 dte;
2850 struct dev_table_entry *dev_table = get_dev_table(iommu);
2851
2852 dte = dev_table[devid].data[2];
2853 dte &= ~DTE_IRQ_PHYS_ADDR_MASK;
2854 dte |= iommu_virt_to_phys(vaddr: table->table);
2855 dte |= DTE_IRQ_REMAP_INTCTL;
2856 dte |= DTE_INTTABLEN;
2857 dte |= DTE_IRQ_REMAP_ENABLE;
2858
2859 dev_table[devid].data[2] = dte;
2860}
2861
2862static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid)
2863{
2864 struct irq_remap_table *table;
2865 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2866
2867 if (WARN_ONCE(!pci_seg->rlookup_table[devid],
2868 "%s: no iommu for devid %x:%x\n",
2869 __func__, pci_seg->id, devid))
2870 return NULL;
2871
2872 table = pci_seg->irq_lookup_table[devid];
2873 if (WARN_ONCE(!table, "%s: no table for devid %x:%x\n",
2874 __func__, pci_seg->id, devid))
2875 return NULL;
2876
2877 return table;
2878}
2879
2880static struct irq_remap_table *__alloc_irq_table(void)
2881{
2882 struct irq_remap_table *table;
2883
2884 table = kzalloc(size: sizeof(*table), GFP_KERNEL);
2885 if (!table)
2886 return NULL;
2887
2888 table->table = kmem_cache_alloc(cachep: amd_iommu_irq_cache, GFP_KERNEL);
2889 if (!table->table) {
2890 kfree(objp: table);
2891 return NULL;
2892 }
2893 raw_spin_lock_init(&table->lock);
2894
2895 if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
2896 memset(table->table, 0,
2897 MAX_IRQS_PER_TABLE * sizeof(u32));
2898 else
2899 memset(table->table, 0,
2900 (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
2901 return table;
2902}
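/*
 * The two memset() sizes above reflect the two IRTE formats: in legacy
 * mode each entry is a 32-bit 'union irte', while in GA (guest virtual
 * APIC) mode each entry is a 128-bit 'struct irte_ga', i.e. two u64s
 * per entry.
 */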
2903
2904static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
2905 struct irq_remap_table *table)
2906{
2907 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2908
2909 pci_seg->irq_lookup_table[devid] = table;
2910 set_dte_irq_entry(iommu, devid, table);
2911 iommu_flush_dte(iommu, devid);
2912}
2913
2914static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias,
2915 void *data)
2916{
2917 struct irq_remap_table *table = data;
2918 struct amd_iommu_pci_seg *pci_seg;
2919 struct amd_iommu *iommu = rlookup_amd_iommu(dev: &pdev->dev);
2920
2921 if (!iommu)
2922 return -EINVAL;
2923
2924 pci_seg = iommu->pci_seg;
2925 pci_seg->irq_lookup_table[alias] = table;
2926 set_dte_irq_entry(iommu, devid: alias, table);
2927 iommu_flush_dte(iommu: pci_seg->rlookup_table[alias], devid: alias);
2928
2929 return 0;
2930}
2931
2932static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu,
2933 u16 devid, struct pci_dev *pdev)
2934{
2935 struct irq_remap_table *table = NULL;
2936 struct irq_remap_table *new_table = NULL;
2937 struct amd_iommu_pci_seg *pci_seg;
2938 unsigned long flags;
2939 u16 alias;
2940
2941 spin_lock_irqsave(&iommu_table_lock, flags);
2942
2943 pci_seg = iommu->pci_seg;
2944 table = pci_seg->irq_lookup_table[devid];
2945 if (table)
2946 goto out_unlock;
2947
2948 alias = pci_seg->alias_table[devid];
2949 table = pci_seg->irq_lookup_table[alias];
2950 if (table) {
2951 set_remap_table_entry(iommu, devid, table);
2952 goto out_wait;
2953 }
2954 spin_unlock_irqrestore(lock: &iommu_table_lock, flags);
2955
2956 /* Nothing there yet, allocate new irq remapping table */
2957 new_table = __alloc_irq_table();
2958 if (!new_table)
2959 return NULL;
2960
2961 spin_lock_irqsave(&iommu_table_lock, flags);
2962
2963 table = pci_seg->irq_lookup_table[devid];
2964 if (table)
2965 goto out_unlock;
2966
2967 table = pci_seg->irq_lookup_table[alias];
2968 if (table) {
2969 set_remap_table_entry(iommu, devid, table);
2970 goto out_wait;
2971 }
2972
2973 table = new_table;
2974 new_table = NULL;
2975
2976 if (pdev)
2977 pci_for_each_dma_alias(pdev, fn: set_remap_table_entry_alias,
2978 data: table);
2979 else
2980 set_remap_table_entry(iommu, devid, table);
2981
2982 if (devid != alias)
2983 set_remap_table_entry(iommu, devid: alias, table);
2984
2985out_wait:
2986 iommu_completion_wait(iommu);
2987
2988out_unlock:
2989 spin_unlock_irqrestore(lock: &iommu_table_lock, flags);
2990
2991 if (new_table) {
2992 kmem_cache_free(s: amd_iommu_irq_cache, objp: new_table->table);
2993 kfree(objp: new_table);
2994 }
2995 return table;
2996}
2997
2998static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count,
2999 bool align, struct pci_dev *pdev)
3000{
3001 struct irq_remap_table *table;
3002 int index, c, alignment = 1;
3003 unsigned long flags;
3004
3005 table = alloc_irq_table(iommu, devid, pdev);
3006 if (!table)
3007 return -ENODEV;
3008
3009 if (align)
3010 alignment = roundup_pow_of_two(count);
3011
3012 raw_spin_lock_irqsave(&table->lock, flags);
3013
3014 /* Scan table for free entries */
3015 for (index = ALIGN(table->min_index, alignment), c = 0;
3016 index < MAX_IRQS_PER_TABLE;) {
3017 if (!iommu->irte_ops->is_allocated(table, index)) {
3018 c += 1;
3019 } else {
3020 c = 0;
3021 index = ALIGN(index + 1, alignment);
3022 continue;
3023 }
3024
3025 if (c == count) {
3026 for (; c != 0; --c)
3027 iommu->irte_ops->set_allocated(table, index - c + 1);
3028
3029 index -= count - 1;
3030 goto out;
3031 }
3032
3033 index++;
3034 }
3035
3036 index = -ENOSPC;
3037
3038out:
3039 raw_spin_unlock_irqrestore(&table->lock, flags);
3040
3041 return index;
3042}
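/*
 * Example: a 4-vector multi-MSI allocation calls this with count == 4
 * and align == true, so alignment becomes roundup_pow_of_two(4) == 4 and
 * only indexes 0, 4, 8, ... are considered as the start of a free run
 * (a 3-vector request is likewise aligned to 4).
 */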
3043
3044static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3045 struct irte_ga *irte)
3046{
3047 struct irq_remap_table *table;
3048 struct irte_ga *entry;
3049 unsigned long flags;
3050 u128 old;
3051
3052 table = get_irq_table(iommu, devid);
3053 if (!table)
3054 return -ENOMEM;
3055
3056 raw_spin_lock_irqsave(&table->lock, flags);
3057
3058 entry = (struct irte_ga *)table->table;
3059 entry = &entry[index];
3060
3061 /*
3062 * We use a 128-bit cmpxchg to atomically update the 128-bit IRTE,
3063 * and it cannot be updated by the hardware or other processors
3064 * behind us, so the return value of the cmpxchg should be the
3065 * same as the old value.
3066 */
3067 old = entry->irte;
3068 WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte));
3069
3070 raw_spin_unlock_irqrestore(&table->lock, flags);
3071
3072 return 0;
3073}
3074
3075static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3076 struct irte_ga *irte)
3077{
3078 bool ret;
3079
3080 ret = __modify_irte_ga(iommu, devid, index, irte);
3081 if (ret)
3082 return ret;
3083
3084 iommu_flush_irt_and_complete(iommu, devid);
3085
3086 return 0;
3087}
3088
3089static int modify_irte(struct amd_iommu *iommu,
3090 u16 devid, int index, union irte *irte)
3091{
3092 struct irq_remap_table *table;
3093 unsigned long flags;
3094
3095 table = get_irq_table(iommu, devid);
3096 if (!table)
3097 return -ENOMEM;
3098
3099 raw_spin_lock_irqsave(&table->lock, flags);
3100 table->table[index] = irte->val;
3101 raw_spin_unlock_irqrestore(&table->lock, flags);
3102
3103 iommu_flush_irt_and_complete(iommu, devid);
3104
3105 return 0;
3106}
3107
3108static void free_irte(struct amd_iommu *iommu, u16 devid, int index)
3109{
3110 struct irq_remap_table *table;
3111 unsigned long flags;
3112
3113 table = get_irq_table(iommu, devid);
3114 if (!table)
3115 return;
3116
3117 raw_spin_lock_irqsave(&table->lock, flags);
3118 iommu->irte_ops->clear_allocated(table, index);
3119 raw_spin_unlock_irqrestore(&table->lock, flags);
3120
3121 iommu_flush_irt_and_complete(iommu, devid);
3122}
3123
3124static void irte_prepare(void *entry,
3125 u32 delivery_mode, bool dest_mode,
3126 u8 vector, u32 dest_apicid, int devid)
3127{
3128 union irte *irte = (union irte *) entry;
3129
3130 irte->val = 0;
3131 irte->fields.vector = vector;
3132 irte->fields.int_type = delivery_mode;
3133 irte->fields.destination = dest_apicid;
3134 irte->fields.dm = dest_mode;
3135 irte->fields.valid = 1;
3136}
3137
3138static void irte_ga_prepare(void *entry,
3139 u32 delivery_mode, bool dest_mode,
3140 u8 vector, u32 dest_apicid, int devid)
3141{
3142 struct irte_ga *irte = (struct irte_ga *) entry;
3143
3144 irte->lo.val = 0;
3145 irte->hi.val = 0;
3146 irte->lo.fields_remap.int_type = delivery_mode;
3147 irte->lo.fields_remap.dm = dest_mode;
3148 irte->hi.fields.vector = vector;
3149 irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid);
3150 irte->hi.fields.destination = APICID_TO_IRTE_DEST_HI(dest_apicid);
3151 irte->lo.fields_remap.valid = 1;
3152}
3153
3154static void irte_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3155{
3156 union irte *irte = (union irte *) entry;
3157
3158 irte->fields.valid = 1;
3159 modify_irte(iommu, devid, index, irte);
3160}
3161
3162static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3163{
3164 struct irte_ga *irte = (struct irte_ga *) entry;
3165
3166 irte->lo.fields_remap.valid = 1;
3167 modify_irte_ga(iommu, devid, index, irte);
3168}
3169
3170static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3171{
3172 union irte *irte = (union irte *) entry;
3173
3174 irte->fields.valid = 0;
3175 modify_irte(iommu, devid, index, irte);
3176}
3177
3178static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3179{
3180 struct irte_ga *irte = (struct irte_ga *) entry;
3181
3182 irte->lo.fields_remap.valid = 0;
3183 modify_irte_ga(iommu, devid, index, irte);
3184}
3185
3186static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3187 u8 vector, u32 dest_apicid)
3188{
3189 union irte *irte = (union irte *) entry;
3190
3191 irte->fields.vector = vector;
3192 irte->fields.destination = dest_apicid;
3193 modify_irte(iommu, devid, index, irte);
3194}
3195
3196static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3197 u8 vector, u32 dest_apicid)
3198{
3199 struct irte_ga *irte = (struct irte_ga *) entry;
3200
3201 if (!irte->lo.fields_remap.guest_mode) {
3202 irte->hi.fields.vector = vector;
3203 irte->lo.fields_remap.destination =
3204 APICID_TO_IRTE_DEST_LO(dest_apicid);
3205 irte->hi.fields.destination =
3206 APICID_TO_IRTE_DEST_HI(dest_apicid);
3207 modify_irte_ga(iommu, devid, index, irte);
3208 }
3209}
3210
3211#define IRTE_ALLOCATED (~1U)
3212static void irte_set_allocated(struct irq_remap_table *table, int index)
3213{
3214 table->table[index] = IRTE_ALLOCATED;
3215}
3216
3217static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
3218{
3219 struct irte_ga *ptr = (struct irte_ga *)table->table;
3220 struct irte_ga *irte = &ptr[index];
3221
3222 memset(&irte->lo.val, 0, sizeof(u64));
3223 memset(&irte->hi.val, 0, sizeof(u64));
3224 irte->hi.fields.vector = 0xff;
3225}
3226
3227static bool irte_is_allocated(struct irq_remap_table *table, int index)
3228{
3229 union irte *ptr = (union irte *)table->table;
3230 union irte *irte = &ptr[index];
3231
3232 return irte->val != 0;
3233}
3234
3235static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
3236{
3237 struct irte_ga *ptr = (struct irte_ga *)table->table;
3238 struct irte_ga *irte = &ptr[index];
3239
3240 return irte->hi.fields.vector != 0;
3241}
3242
3243static void irte_clear_allocated(struct irq_remap_table *table, int index)
3244{
3245 table->table[index] = 0;
3246}
3247
3248static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
3249{
3250 struct irte_ga *ptr = (struct irte_ga *)table->table;
3251 struct irte_ga *irte = &ptr[index];
3252
3253 memset(&irte->lo.val, 0, sizeof(u64));
3254 memset(&irte->hi.val, 0, sizeof(u64));
3255}
3256
3257static int get_devid(struct irq_alloc_info *info)
3258{
3259 switch (info->type) {
3260 case X86_IRQ_ALLOC_TYPE_IOAPIC:
3261 return get_ioapic_devid(id: info->devid);
3262 case X86_IRQ_ALLOC_TYPE_HPET:
3263 return get_hpet_devid(id: info->devid);
3264 case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3265 case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3266 return get_device_sbdf_id(msi_desc_to_dev(info->desc));
3267 default:
3268 WARN_ON_ONCE(1);
3269 return -1;
3270 }
3271}
3272
3273struct irq_remap_ops amd_iommu_irq_ops = {
3274 .prepare = amd_iommu_prepare,
3275 .enable = amd_iommu_enable,
3276 .disable = amd_iommu_disable,
3277 .reenable = amd_iommu_reenable,
3278 .enable_faulting = amd_iommu_enable_faulting,
3279};
3280
3281static void fill_msi_msg(struct msi_msg *msg, u32 index)
3282{
3283 msg->data = index;
3284 msg->address_lo = 0;
3285 msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
3286 msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
3287}
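/*
 * With interrupt remapping, the MSI address/data programmed into the
 * device only carry the IRTE index (see fill_msi_msg() above); the real
 * vector and destination APIC ID live in the IRTE and are updated via
 * modify_irte()/modify_irte_ga() without reprogramming the device.
 */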
3288
3289static void irq_remapping_prepare_irte(struct amd_ir_data *data,
3290 struct irq_cfg *irq_cfg,
3291 struct irq_alloc_info *info,
3292 int devid, int index, int sub_handle)
3293{
3294 struct irq_2_irte *irte_info = &data->irq_2_irte;
3295 struct amd_iommu *iommu = data->iommu;
3296
3297 if (!iommu)
3298 return;
3299
3300 data->irq_2_irte.devid = devid;
3301 data->irq_2_irte.index = index + sub_handle;
3302 iommu->irte_ops->prepare(data->entry, APIC_DELIVERY_MODE_FIXED,
3303 apic->dest_mode_logical, irq_cfg->vector,
3304 irq_cfg->dest_apicid, devid);
3305
3306 switch (info->type) {
3307 case X86_IRQ_ALLOC_TYPE_IOAPIC:
3308 case X86_IRQ_ALLOC_TYPE_HPET:
3309 case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3310 case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3311 fill_msi_msg(msg: &data->msi_entry, index: irte_info->index);
3312 break;
3313
3314 default:
3315 BUG_ON(1);
3316 break;
3317 }
3318}
3319
3320struct amd_irte_ops irte_32_ops = {
3321 .prepare = irte_prepare,
3322 .activate = irte_activate,
3323 .deactivate = irte_deactivate,
3324 .set_affinity = irte_set_affinity,
3325 .set_allocated = irte_set_allocated,
3326 .is_allocated = irte_is_allocated,
3327 .clear_allocated = irte_clear_allocated,
3328};
3329
3330struct amd_irte_ops irte_128_ops = {
3331 .prepare = irte_ga_prepare,
3332 .activate = irte_ga_activate,
3333 .deactivate = irte_ga_deactivate,
3334 .set_affinity = irte_ga_set_affinity,
3335 .set_allocated = irte_ga_set_allocated,
3336 .is_allocated = irte_ga_is_allocated,
3337 .clear_allocated = irte_ga_clear_allocated,
3338};
3339
3340static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
3341 unsigned int nr_irqs, void *arg)
3342{
3343 struct irq_alloc_info *info = arg;
3344 struct irq_data *irq_data;
3345 struct amd_ir_data *data = NULL;
3346 struct amd_iommu *iommu;
3347 struct irq_cfg *cfg;
3348 int i, ret, devid, seg, sbdf;
3349 int index;
3350
3351 if (!info)
3352 return -EINVAL;
3353 if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI)
3354 return -EINVAL;
3355
3356 sbdf = get_devid(info);
3357 if (sbdf < 0)
3358 return -EINVAL;
3359
3360 seg = PCI_SBDF_TO_SEGID(sbdf);
3361 devid = PCI_SBDF_TO_DEVID(sbdf);
3362 iommu = __rlookup_amd_iommu(seg, devid);
3363 if (!iommu)
3364 return -EINVAL;
3365
3366 ret = irq_domain_alloc_irqs_parent(domain, irq_base: virq, nr_irqs, arg);
3367 if (ret < 0)
3368 return ret;
3369
3370 if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
3371 struct irq_remap_table *table;
3372
3373 table = alloc_irq_table(iommu, devid, NULL);
3374 if (table) {
3375 if (!table->min_index) {
3376 /*
3377 * Keep the first 32 indexes free for IOAPIC
3378 * interrupts.
3379 */
3380 table->min_index = 32;
3381 for (i = 0; i < 32; ++i)
3382 iommu->irte_ops->set_allocated(table, i);
3383 }
3384 WARN_ON(table->min_index != 32);
3385 index = info->ioapic.pin;
3386 } else {
3387 index = -ENOMEM;
3388 }
3389 } else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI ||
3390 info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) {
3391 bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI);
3392
3393 index = alloc_irq_index(iommu, devid, count: nr_irqs, align,
3394 pdev: msi_desc_to_pci_dev(desc: info->desc));
3395 } else {
3396 index = alloc_irq_index(iommu, devid, count: nr_irqs, align: false, NULL);
3397 }
3398
3399 if (index < 0) {
3400 pr_warn("Failed to allocate IRTE\n");
3401 ret = index;
3402 goto out_free_parent;
3403 }
3404
3405 for (i = 0; i < nr_irqs; i++) {
3406 irq_data = irq_domain_get_irq_data(domain, virq: virq + i);
3407 cfg = irq_data ? irqd_cfg(irq_data) : NULL;
3408 if (!cfg) {
3409 ret = -EINVAL;
3410 goto out_free_data;
3411 }
3412
3413 ret = -ENOMEM;
3414 data = kzalloc(size: sizeof(*data), GFP_KERNEL);
3415 if (!data)
3416 goto out_free_data;
3417
3418 if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
3419 data->entry = kzalloc(size: sizeof(union irte), GFP_KERNEL);
3420 else
3421 data->entry = kzalloc(size: sizeof(struct irte_ga),
3422 GFP_KERNEL);
3423 if (!data->entry) {
3424 kfree(objp: data);
3425 goto out_free_data;
3426 }
3427
3428 data->iommu = iommu;
3429 irq_data->hwirq = (devid << 16) + i;
3430 irq_data->chip_data = data;
3431 irq_data->chip = &amd_ir_chip;
3432 irq_remapping_prepare_irte(data, irq_cfg: cfg, info, devid, index, sub_handle: i);
3433 irq_set_status_flags(irq: virq + i, set: IRQ_MOVE_PCNTXT);
3434 }
3435
3436 return 0;
3437
3438out_free_data:
3439 for (i--; i >= 0; i--) {
3440 irq_data = irq_domain_get_irq_data(domain, virq: virq + i);
3441 if (irq_data)
3442 kfree(objp: irq_data->chip_data);
3443 }
3444 for (i = 0; i < nr_irqs; i++)
3445 free_irte(iommu, devid, index: index + i);
3446out_free_parent:
3447 irq_domain_free_irqs_common(domain, virq, nr_irqs);
3448 return ret;
3449}
3450
3451static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
3452 unsigned int nr_irqs)
3453{
3454 struct irq_2_irte *irte_info;
3455 struct irq_data *irq_data;
3456 struct amd_ir_data *data;
3457 int i;
3458
3459 for (i = 0; i < nr_irqs; i++) {
3460 irq_data = irq_domain_get_irq_data(domain, virq: virq + i);
3461 if (irq_data && irq_data->chip_data) {
3462 data = irq_data->chip_data;
3463 irte_info = &data->irq_2_irte;
3464 free_irte(iommu: data->iommu, devid: irte_info->devid, index: irte_info->index);
3465 kfree(objp: data->entry);
3466 kfree(objp: data);
3467 }
3468 }
3469 irq_domain_free_irqs_common(domain, virq, nr_irqs);
3470}
3471
3472static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3473 struct amd_ir_data *ir_data,
3474 struct irq_2_irte *irte_info,
3475 struct irq_cfg *cfg);
3476
3477static int irq_remapping_activate(struct irq_domain *domain,
3478 struct irq_data *irq_data, bool reserve)
3479{
3480 struct amd_ir_data *data = irq_data->chip_data;
3481 struct irq_2_irte *irte_info = &data->irq_2_irte;
3482 struct amd_iommu *iommu = data->iommu;
3483 struct irq_cfg *cfg = irqd_cfg(irq_data);
3484
3485 if (!iommu)
3486 return 0;
3487
3488 iommu->irte_ops->activate(iommu, data->entry, irte_info->devid,
3489 irte_info->index);
3490 amd_ir_update_irte(irqd: irq_data, iommu, ir_data: data, irte_info, cfg);
3491 return 0;
3492}
3493
3494static void irq_remapping_deactivate(struct irq_domain *domain,
3495 struct irq_data *irq_data)
3496{
3497 struct amd_ir_data *data = irq_data->chip_data;
3498 struct irq_2_irte *irte_info = &data->irq_2_irte;
3499 struct amd_iommu *iommu = data->iommu;
3500
3501 if (iommu)
3502 iommu->irte_ops->deactivate(iommu, data->entry, irte_info->devid,
3503 irte_info->index);
3504}
3505
3506static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec,
3507 enum irq_domain_bus_token bus_token)
3508{
3509 struct amd_iommu *iommu;
3510 int devid = -1;
3511
3512 if (!amd_iommu_irq_remap)
3513 return 0;
3514
3515 if (x86_fwspec_is_ioapic(fwspec))
3516 devid = get_ioapic_devid(id: fwspec->param[0]);
3517 else if (x86_fwspec_is_hpet(fwspec))
3518 devid = get_hpet_devid(id: fwspec->param[0]);
3519
3520 if (devid < 0)
3521 return 0;
3522 iommu = __rlookup_amd_iommu(seg: (devid >> 16), devid: (devid & 0xffff));
3523
3524 return iommu && iommu->ir_domain == d;
3525}
3526
3527static const struct irq_domain_ops amd_ir_domain_ops = {
3528 .select = irq_remapping_select,
3529 .alloc = irq_remapping_alloc,
3530 .free = irq_remapping_free,
3531 .activate = irq_remapping_activate,
3532 .deactivate = irq_remapping_deactivate,
3533};
3534
3535int amd_iommu_activate_guest_mode(void *data)
3536{
3537 struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3538 struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3539 u64 valid;
3540
3541 if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry)
3542 return 0;
3543
3544 valid = entry->lo.fields_vapic.valid;
3545
3546 entry->lo.val = 0;
3547 entry->hi.val = 0;
3548
3549 entry->lo.fields_vapic.valid = valid;
3550 entry->lo.fields_vapic.guest_mode = 1;
3551 entry->lo.fields_vapic.ga_log_intr = 1;
3552 entry->hi.fields.ga_root_ptr = ir_data->ga_root_ptr;
3553 entry->hi.fields.vector = ir_data->ga_vector;
3554 entry->lo.fields_vapic.ga_tag = ir_data->ga_tag;
3555
3556 return modify_irte_ga(iommu: ir_data->iommu, devid: ir_data->irq_2_irte.devid,
3557 index: ir_data->irq_2_irte.index, irte: entry);
3558}
3559EXPORT_SYMBOL(amd_iommu_activate_guest_mode);
3560
3561int amd_iommu_deactivate_guest_mode(void *data)
3562{
3563 struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3564 struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3565 struct irq_cfg *cfg = ir_data->cfg;
3566 u64 valid;
3567
3568 if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3569 !entry || !entry->lo.fields_vapic.guest_mode)
3570 return 0;
3571
3572 valid = entry->lo.fields_remap.valid;
3573
3574 entry->lo.val = 0;
3575 entry->hi.val = 0;
3576
3577 entry->lo.fields_remap.valid = valid;
3578 entry->lo.fields_remap.dm = apic->dest_mode_logical;
3579 entry->lo.fields_remap.int_type = APIC_DELIVERY_MODE_FIXED;
3580 entry->hi.fields.vector = cfg->vector;
3581 entry->lo.fields_remap.destination =
3582 APICID_TO_IRTE_DEST_LO(cfg->dest_apicid);
3583 entry->hi.fields.destination =
3584 APICID_TO_IRTE_DEST_HI(cfg->dest_apicid);
3585
3586 return modify_irte_ga(iommu: ir_data->iommu, devid: ir_data->irq_2_irte.devid,
3587 index: ir_data->irq_2_irte.index, irte: entry);
3588}
3589EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
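/*
 * The two helpers above switch a single IRTE between its two 128-bit
 * layouts: fields_remap (host delivery: vector plus destination APIC ID)
 * and fields_vapic (guest delivery: ga_tag, ga_root_ptr and the guest
 * vector, with injection going through the vAPIC backing page). Only the
 * 'valid' bit is carried over when converting between the two.
 */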
3590
3591static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
3592{
3593 int ret;
3594 struct amd_iommu_pi_data *pi_data = vcpu_info;
3595 struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
3596 struct amd_ir_data *ir_data = data->chip_data;
3597 struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3598 struct iommu_dev_data *dev_data;
3599
3600 if (ir_data->iommu == NULL)
3601 return -EINVAL;
3602
3603 dev_data = search_dev_data(iommu: ir_data->iommu, devid: irte_info->devid);
3604
3605 /* Note:
3606 * This device has never been set up for guest mode,
3607 * so we should not modify the IRTE.
3608 */
3609 if (!dev_data || !dev_data->use_vapic)
3610 return 0;
3611
3612 ir_data->cfg = irqd_cfg(irq_data: data);
3613 pi_data->ir_data = ir_data;
3614
3615 /* Note:
3616 * SVM tries to set up for VAPIC mode, but the IOMMU is in
3617 * legacy interrupt remapping mode, so force legacy mode instead.
3618 */
3619 if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
3620 pr_debug("%s: Fall back to using intr legacy remap\n",
3621 __func__);
3622 pi_data->is_guest_mode = false;
3623 }
3624
3625 pi_data->prev_ga_tag = ir_data->cached_ga_tag;
3626 if (pi_data->is_guest_mode) {
3627 ir_data->ga_root_ptr = (pi_data->base >> 12);
3628 ir_data->ga_vector = vcpu_pi_info->vector;
3629 ir_data->ga_tag = pi_data->ga_tag;
3630 ret = amd_iommu_activate_guest_mode(ir_data);
3631 if (!ret)
3632 ir_data->cached_ga_tag = pi_data->ga_tag;
3633 } else {
3634 ret = amd_iommu_deactivate_guest_mode(ir_data);
3635
3636 /*
3637 * This communicates the ga_tag back to the caller
3638 * so that it can do all the necessary clean up.
3639 */
3640 if (!ret)
3641 ir_data->cached_ga_tag = 0;
3642 }
3643
3644 return ret;
3645}
3646
3648static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3649 struct amd_ir_data *ir_data,
3650 struct irq_2_irte *irte_info,
3651 struct irq_cfg *cfg)
3652{
3653
3654 /*
3655 * Atomically updates the IRTE with the new destination, vector
3656 * and flushes the interrupt entry cache.
3657 */
3658 iommu->irte_ops->set_affinity(iommu, ir_data->entry, irte_info->devid,
3659 irte_info->index, cfg->vector,
3660 cfg->dest_apicid);
3661}
3662
3663static int amd_ir_set_affinity(struct irq_data *data,
3664 const struct cpumask *mask, bool force)
3665{
3666 struct amd_ir_data *ir_data = data->chip_data;
3667 struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3668 struct irq_cfg *cfg = irqd_cfg(irq_data: data);
3669 struct irq_data *parent = data->parent_data;
3670 struct amd_iommu *iommu = ir_data->iommu;
3671 int ret;
3672
3673 if (!iommu)
3674 return -ENODEV;
3675
3676 ret = parent->chip->irq_set_affinity(parent, mask, force);
3677 if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
3678 return ret;
3679
3680 amd_ir_update_irte(irqd: data, iommu, ir_data, irte_info, cfg);
3681 /*
3682 * After this point, all the interrupts will start arriving
3683 * at the new destination, so it is time to clean up the previous
3684 * vector allocation.
3685 */
3686 vector_schedule_cleanup(cfg);
3687
3688 return IRQ_SET_MASK_OK_DONE;
3689}
3690
3691static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
3692{
3693 struct amd_ir_data *ir_data = irq_data->chip_data;
3694
3695 *msg = ir_data->msi_entry;
3696}
3697
3698static struct irq_chip amd_ir_chip = {
3699 .name = "AMD-IR",
3700 .irq_ack = apic_ack_irq,
3701 .irq_set_affinity = amd_ir_set_affinity,
3702 .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity,
3703 .irq_compose_msi_msg = ir_compose_msi_msg,
3704};
3705
3706static const struct msi_parent_ops amdvi_msi_parent_ops = {
3707 .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED |
3708 MSI_FLAG_MULTI_PCI_MSI |
3709 MSI_FLAG_PCI_IMS,
3710 .prefix = "IR-",
3711 .init_dev_msi_info = msi_parent_init_dev_msi_info,
3712};
3713
3714static const struct msi_parent_ops virt_amdvi_msi_parent_ops = {
3715 .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED |
3716 MSI_FLAG_MULTI_PCI_MSI,
3717 .prefix = "vIR-",
3718 .init_dev_msi_info = msi_parent_init_dev_msi_info,
3719};
3720
3721int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
3722{
3723 struct fwnode_handle *fn;
3724
3725 fn = irq_domain_alloc_named_id_fwnode(name: "AMD-IR", id: iommu->index);
3726 if (!fn)
3727 return -ENOMEM;
3728 iommu->ir_domain = irq_domain_create_hierarchy(parent: arch_get_ir_parent_domain(), flags: 0, size: 0,
3729 fwnode: fn, ops: &amd_ir_domain_ops, host_data: iommu);
3730 if (!iommu->ir_domain) {
3731 irq_domain_free_fwnode(fwnode: fn);
3732 return -ENOMEM;
3733 }
3734
3735 irq_domain_update_bus_token(domain: iommu->ir_domain, bus_token: DOMAIN_BUS_AMDVI);
3736 iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT |
3737 IRQ_DOMAIN_FLAG_ISOLATED_MSI;
3738
3739 if (amd_iommu_np_cache)
3740 iommu->ir_domain->msi_parent_ops = &virt_amdvi_msi_parent_ops;
3741 else
3742 iommu->ir_domain->msi_parent_ops = &amdvi_msi_parent_ops;
3743
3744 return 0;
3745}
3746
3747int amd_iommu_update_ga(int cpu, bool is_run, void *data)
3748{
3749 struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3750 struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3751
3752 if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3753 !entry || !entry->lo.fields_vapic.guest_mode)
3754 return 0;
3755
3756 if (!ir_data->iommu)
3757 return -ENODEV;
3758
3759 if (cpu >= 0) {
3760 entry->lo.fields_vapic.destination =
3761 APICID_TO_IRTE_DEST_LO(cpu);
3762 entry->hi.fields.destination =
3763 APICID_TO_IRTE_DEST_HI(cpu);
3764 }
3765 entry->lo.fields_vapic.is_run = is_run;
3766
3767 return __modify_irte_ga(iommu: ir_data->iommu, devid: ir_data->irq_2_irte.devid,
3768 index: ir_data->irq_2_irte.index, irte: entry);
3769}
3770EXPORT_SYMBOL(amd_iommu_update_ga);
3771#endif
3772
