From 5ce73c524f5fb5abd7b1bfed0115474b4fb437b4 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 7 Oct 2024 10:43:31 +0200 Subject: [PATCH 01/54] iommu/amd: Use atomic64_inc_return() in iommu.c Use atomic64_inc_return(&ref) instead of atomic64_add_return(1, &ref) to use optimized implementation and ease register pressure around the primitive for targets that implement optimized variant. Signed-off-by: Uros Bizjak Cc: Joerg Roedel Cc: Suravee Suthikulpanit Cc: Will Deacon Cc: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241007084356.47799-1-ubizjak@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 8364cd6fa47d..074effba7fbe 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1230,7 +1230,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (!iommu->need_sync) return 0; - data = atomic64_add_return(1, &iommu->cmd_sem_val); + data = atomic64_inc_return(&iommu->cmd_sem_val); build_completion_wait(&cmd, iommu, data); raw_spin_lock_irqsave(&iommu->lock, flags); @@ -2890,7 +2890,7 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) return; build_inv_irt(&cmd, devid); - data = atomic64_add_return(1, &iommu->cmd_sem_val); + data = atomic64_inc_return(&iommu->cmd_sem_val); build_completion_wait(&cmd2, iommu, data); raw_spin_lock_irqsave(&iommu->lock, flags); From 3f6eeada6930056c38f20964120ac402cf0030b9 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 16 Oct 2024 08:49:58 +0000 Subject: [PATCH 02/54] iommu/amd: Do not try copy old DTE resume path In suspend/resume path, no need to copy old DTE (early_enable_iommus()). Just need to reload IOMMU hardware. This is the side effect of commit 3ac3e5ee5ed5 ("iommu/amd: Copy old trans table from old kernel") which changed early_enable_iommus() but missed to fix enable_iommus(). Resume path continue to work as 'amd_iommu_pre_enabled' is set to false and copy_device_table() will fail. It will just re-loaded IOMMU. Hence I think we don't need to backport this to stable tree. Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20241016084958.99727-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 43131c3a2172..3fa70169aace 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -2882,11 +2882,6 @@ static void enable_iommus_vapic(void) #endif } -static void enable_iommus(void) -{ - early_enable_iommus(); -} - static void disable_iommus(void) { struct amd_iommu *iommu; @@ -2913,7 +2908,8 @@ static void amd_iommu_resume(void) iommu_apply_resume_quirks(iommu); /* re-load the hardware */ - enable_iommus(); + for_each_iommu(iommu) + early_enable_iommu(iommu); amd_iommu_enable_interrupts(); } From e3a682eaf2af51a83f5313145ef592ce50fa787f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 18 Oct 2024 14:12:19 -0300 Subject: [PATCH 03/54] iommu/amd: Fix corruption when mapping large pages from 0 If a page is mapped starting at 0 that is equal to or larger than can fit in the current mode (number of table levels) it results in corrupting the mapping as the following logic assumes the mode is correct for the page size being requested. 
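As an illustration of the corner case (a standalone sketch, not part of the patch, using simplified stand-ins for the driver's PM_LEVEL_SIZE()/PAGE_SIZE_LEVEL() macros and assuming a 4 KiB base page with 9 bits per table level), a 2^48-byte mapping at IOVA 0 passes the old start-address check even though a 4-level table cannot hold it as a single IOPTE, while the check described below catches it:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the AMD IOMMU page-table macros. */
#define PM_LEVEL_SIZE(lvl)	((1ULL << (12 + 9 * (lvl))) - 1)
#define PAGE_SIZE_LEVEL(sz)	((unsigned int)((__builtin_ctzll(sz) - 12) / 9))

int main(void)
{
	unsigned int mode = 4;			/* current number of table levels */
	uint64_t iova = 0, page_size = 1ULL << 48;
	uint64_t last_addr = iova + (page_size - 1);

	/* Old check: only the start address is tested, so no level is added. */
	bool old_grows = iova > PM_LEVEL_SIZE(mode);
	/* Fixed check: the last byte must fit and the mode must cover the page level. */
	bool new_grows = last_addr > PM_LEVEL_SIZE(mode) ||
			 mode - 1 < PAGE_SIZE_LEVEL(page_size);

	printf("old check grows table: %d, fixed check grows table: %d\n",
	       old_grows, new_grows);
	return 0;
}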
There are two issues here. First, the check whether the address fits within the table uses the start address; it should use the last address, so that the last byte of the mapping is verified to fit within the current table mode. Second, if the mapping is exactly the size of the full page table, another level has to be added so that a single IOPTE can hold the large page size. Since both corner cases require an IOVA of 0 and do not arise until a page size of 2^48, they are unlikely to ever be hit on a real system. Reported-by: Alejandro Jimenez Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-27ab08d646a1+29-amd_0map_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/io_pgtable.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index 804b788f3f16..f3399087859f 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -118,6 +118,7 @@ static void free_sub_pt(u64 *root, int mode, struct list_head *freelist) */ static bool increase_address_space(struct amd_io_pgtable *pgtable, unsigned long address, + unsigned int page_size_level, gfp_t gfp) { struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; @@ -133,7 +134,8 @@ static bool increase_address_space(struct amd_io_pgtable *pgtable, spin_lock_irqsave(&domain->lock, flags); - if (address <= PM_LEVEL_SIZE(pgtable->mode)) + if (address <= PM_LEVEL_SIZE(pgtable->mode) && + pgtable->mode - 1 >= page_size_level) goto out; ret = false; @@ -163,18 +165,21 @@ static u64 *alloc_pte(struct amd_io_pgtable *pgtable, gfp_t gfp, bool *updated) { + unsigned long last_addr = address + (page_size - 1); struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; int level, end_lvl; u64 *pte, *page; BUG_ON(!is_power_of_2(page_size)); - while (address > PM_LEVEL_SIZE(pgtable->mode)) { + while (last_addr > PM_LEVEL_SIZE(pgtable->mode) || + pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) { /* * Return an error if there is no memory to update the * page-table. */ - if (!increase_address_space(pgtable, address, gfp)) + if (!increase_address_space(pgtable, last_addr, + PAGE_SIZE_LEVEL(page_size), gfp)) return NULL; } From 016991606aa01c4d92e6941be636c0c897aa05c7 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:45 +0000 Subject: [PATCH 04/54] iommu/amd/pgtbl_v2: Take protection domain lock before invalidating TLB Commit c7fc12354be0 ("iommu/amd/pgtbl_v2: Invalidate updated page ranges only") missed taking the domain lock before calling amd_iommu_domain_flush_pages(). Fix this by taking the protection domain lock before calling the TLB invalidation function.
Fixes: c7fc12354be0 ("iommu/amd/pgtbl_v2: Invalidate updated page ranges only") Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-2-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/io_pgtable_v2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c index 25b9042fa453..c616de2c5926 100644 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ b/drivers/iommu/amd/io_pgtable_v2.c @@ -268,8 +268,11 @@ static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, out: if (updated) { struct protection_domain *pdom = io_pgtable_ops_to_domain(ops); + unsigned long flags; + spin_lock_irqsave(&pdom->lock, flags); amd_iommu_domain_flush_pages(pdom, o_iova, size); + spin_unlock_irqrestore(&pdom->lock, flags); } if (mapped) From 2fcab2deebc33640516d9d379b626557a38d5d4f Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:46 +0000 Subject: [PATCH 05/54] iommu/amd: Use ida interface to manage protection domain ID Replace custom domain ID allocator with IDA interface. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-3-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 6 ++-- drivers/iommu/amd/init.c | 31 ++++------------ drivers/iommu/amd/iommu.c | 56 +++++++++++++---------------- 3 files changed, 35 insertions(+), 58 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 601fb4ee6900..e70f6299f765 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -912,14 +912,14 @@ struct unity_map_entry { /* size of the dma_ops aperture as power of 2 */ extern unsigned amd_iommu_aperture_order; -/* allocation bitmap for domain ids */ -extern unsigned long *amd_iommu_pd_alloc_bitmap; - extern bool amd_iommu_force_isolation; /* Max levels of glxval supported */ extern int amd_iommu_max_glx_val; +/* IDA to track protection domain IDs */ +extern struct ida pdom_ids; + /* Global EFR and EFR2 registers */ extern u64 amd_iommu_efr; extern u64 amd_iommu_efr2; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index ca7ae13968e3..939b011d6ff2 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -194,12 +194,6 @@ bool amd_iommu_force_isolation __read_mostly; unsigned long amd_iommu_pgsize_bitmap __ro_after_init = AMD_IOMMU_PGSIZES; -/* - * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap - * to know which ones are already in use. 
- */ -unsigned long *amd_iommu_pd_alloc_bitmap; - enum iommu_init_state { IOMMU_START_STATE, IOMMU_IVRS_DETECTED, @@ -1082,7 +1076,12 @@ static bool __copy_device_table(struct amd_iommu *iommu) if (dte_v && dom_id) { pci_seg->old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0]; pci_seg->old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1]; - __set_bit(dom_id, amd_iommu_pd_alloc_bitmap); + /* Reserve the Domain IDs used by previous kernel */ + if (ida_alloc_range(&pdom_ids, dom_id, dom_id, GFP_ATOMIC) != dom_id) { + pr_err("Failed to reserve domain ID 0x%x\n", dom_id); + memunmap(old_devtb); + return false; + } /* If gcr3 table existed, mask it out */ if (old_devtb[devid].data[0] & DTE_FLAG_GV) { tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B; @@ -2985,9 +2984,7 @@ static bool __init check_ioapic_information(void) static void __init free_dma_resources(void) { - iommu_free_pages(amd_iommu_pd_alloc_bitmap, - get_order(MAX_DOMAIN_ID / 8)); - amd_iommu_pd_alloc_bitmap = NULL; + ida_destroy(&pdom_ids); free_unity_maps(); } @@ -3055,20 +3052,6 @@ static int __init early_amd_iommu_init(void) amd_iommu_target_ivhd_type = get_highest_supported_ivhd_type(ivrs_base); DUMP_printk("Using IVHD type %#x\n", amd_iommu_target_ivhd_type); - /* Device table - directly used by all IOMMUs */ - ret = -ENOMEM; - - amd_iommu_pd_alloc_bitmap = iommu_alloc_pages(GFP_KERNEL, - get_order(MAX_DOMAIN_ID / 8)); - if (amd_iommu_pd_alloc_bitmap == NULL) - goto out; - - /* - * never allocate domain 0 because its used as the non-allocated and - * error value placeholder - */ - __set_bit(0, amd_iommu_pd_alloc_bitmap); - /* * now the data structures are allocated and basically initialized * start the real acpi table scan diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 35af5b470421..f831526b65f4 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -52,8 +53,6 @@ #define HT_RANGE_START (0xfd00000000ULL) #define HT_RANGE_END (0xffffffffffULL) -static DEFINE_SPINLOCK(pd_bitmap_lock); - LIST_HEAD(ioapic_map); LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); @@ -70,6 +69,12 @@ struct iommu_cmd { u32 data[4]; }; +/* + * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap + * to know which ones are already in use. 
+ */ +DEFINE_IDA(pdom_ids); + struct kmem_cache *amd_iommu_irq_cache; static void detach_device(struct device *dev); @@ -1643,31 +1648,14 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag) * ****************************************************************************/ -static u16 domain_id_alloc(void) +static int pdom_id_alloc(void) { - unsigned long flags; - int id; - - spin_lock_irqsave(&pd_bitmap_lock, flags); - id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); - BUG_ON(id == 0); - if (id > 0 && id < MAX_DOMAIN_ID) - __set_bit(id, amd_iommu_pd_alloc_bitmap); - else - id = 0; - spin_unlock_irqrestore(&pd_bitmap_lock, flags); - - return id; + return ida_alloc_range(&pdom_ids, 1, MAX_DOMAIN_ID - 1, GFP_ATOMIC); } -static void domain_id_free(int id) +static void pdom_id_free(int id) { - unsigned long flags; - - spin_lock_irqsave(&pd_bitmap_lock, flags); - if (id > 0 && id < MAX_DOMAIN_ID) - __clear_bit(id, amd_iommu_pd_alloc_bitmap); - spin_unlock_irqrestore(&pd_bitmap_lock, flags); + ida_free(&pdom_ids, id); } static void free_gcr3_tbl_level1(u64 *tbl) @@ -1712,7 +1700,7 @@ static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info) gcr3_info->glx = 0; /* Free per device domain ID */ - domain_id_free(gcr3_info->domid); + pdom_id_free(gcr3_info->domid); iommu_free_page(gcr3_info->gcr3_tbl); gcr3_info->gcr3_tbl = NULL; @@ -1739,6 +1727,7 @@ static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, { int levels = get_gcr3_levels(pasids); int nid = iommu ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; + int domid; if (levels > amd_iommu_max_glx_val) return -EINVAL; @@ -1747,11 +1736,14 @@ static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, return -EBUSY; /* Allocate per device domain ID */ - gcr3_info->domid = domain_id_alloc(); + domid = pdom_id_alloc(); + if (domid <= 0) + return -ENOSPC; + gcr3_info->domid = domid; gcr3_info->gcr3_tbl = iommu_alloc_page_node(nid, GFP_ATOMIC); if (gcr3_info->gcr3_tbl == NULL) { - domain_id_free(gcr3_info->domid); + pdom_id_free(domid); return -ENOMEM; } @@ -2262,7 +2254,7 @@ void protection_domain_free(struct protection_domain *domain) WARN_ON(!list_empty(&domain->dev_list)); if (domain->domain.type & __IOMMU_DOMAIN_PAGING) free_io_pgtable_ops(&domain->iop.pgtbl.ops); - domain_id_free(domain->id); + pdom_id_free(domain->id); kfree(domain); } @@ -2277,16 +2269,18 @@ static void protection_domain_init(struct protection_domain *domain, int nid) struct protection_domain *protection_domain_alloc(unsigned int type, int nid) { struct protection_domain *domain; + int domid; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; - domain->id = domain_id_alloc(); - if (!domain->id) { + domid = pdom_id_alloc(); + if (domid <= 0) { kfree(domain); return NULL; } + domain->id = domid; protection_domain_init(domain, nid); @@ -2361,7 +2355,7 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, ret = pdom_setup_pgtable(domain, type, pgtable); if (ret) { - domain_id_free(domain->id); + pdom_id_free(domain->id); kfree(domain); return ERR_PTR(ret); } @@ -2493,7 +2487,7 @@ void amd_iommu_init_identity_domain(void) domain->ops = &identity_domain_ops; domain->owner = &amd_iommu_ops; - identity_domain.id = domain_id_alloc(); + identity_domain.id = pdom_id_alloc(); protection_domain_init(&identity_domain, NUMA_NO_NODE); } From 743a4bae9fa1480e5f6837f6a55be918d6cd0e16 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:47 +0000 Subject: [PATCH 06/54] iommu/amd: 
Remove protection_domain.dev_cnt variable protection_domain->dev_list tracks list of attached devices to domain. We can use list_* functions on dev_list to get device count. Hence remove 'dev_cnt' variable. No functional change intended. Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Joerg Roedel Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-4-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 1 - drivers/iommu/amd/iommu.c | 7 +------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index e70f6299f765..90a752f57463 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -578,7 +578,6 @@ struct protection_domain { u16 id; /* the domain id written to the device table */ enum protection_domain_mode pd_mode; /* Track page table type */ bool dirty_tracking; /* dirty tracking is enabled in the domain */ - unsigned dev_cnt; /* devices assigned to this domain */ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ struct mmu_notifier mn; /* mmu notifier for the SVA domain */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index f831526b65f4..7f6acbf8e8f4 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2031,7 +2031,6 @@ static int do_attach(struct iommu_dev_data *dev_data, /* Do reference counting */ domain->dev_iommu[iommu->index] += 1; - domain->dev_cnt += 1; /* Setup GCR3 table */ if (pdom_is_sva_capable(domain)) { @@ -2064,7 +2063,6 @@ static void do_detach(struct iommu_dev_data *dev_data) /* decrease reference counters - needs to happen after the flushes */ domain->dev_iommu[iommu->index] -= 1; - domain->dev_cnt -= 1; } /* @@ -2237,16 +2235,13 @@ static void cleanup_domain(struct protection_domain *domain) lockdep_assert_held(&domain->lock); - if (!domain->dev_cnt) - return; - while (!list_empty(&domain->dev_list)) { entry = list_first_entry(&domain->dev_list, struct iommu_dev_data, list); BUG_ON(!entry->domain); do_detach(entry); } - WARN_ON(domain->dev_cnt != 0); + WARN_ON(!list_empty(&domain->dev_list)); } void protection_domain_free(struct protection_domain *domain) From d16041124de1dea4389b5e6b330657f34f8c0492 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:48 +0000 Subject: [PATCH 07/54] iommu/amd: xarray to track protection_domain->iommu list Use xarray to track IOMMU attached to protection domain instead of static array of MAX_IOMMUS. Also add lockdep assertion. 
Signed-off-by: Vasant Hegde Reviewed-by: Joerg Roedel Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-5-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 8 ++- drivers/iommu/amd/iommu.c | 89 +++++++++++++++++++++++------ 2 files changed, 77 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 90a752f57463..f88f6e4e910f 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -565,6 +565,12 @@ struct pdom_dev_data { struct list_head list; }; +/* Keeps track of the IOMMUs attached to protection domain */ +struct pdom_iommu_info { + struct amd_iommu *iommu; /* IOMMUs attach to protection domain */ + u32 refcnt; /* Count of attached dev/pasid per domain/IOMMU */ +}; + /* * This structure contains generic data for IOMMU protection domains * independent of their use. @@ -578,7 +584,7 @@ struct protection_domain { u16 id; /* the domain id written to the device table */ enum protection_domain_mode pd_mode; /* Track page table type */ bool dirty_tracking; /* dirty tracking is enabled in the domain */ - unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ + struct xarray iommu_array; /* per-IOMMU reference count */ struct mmu_notifier mn; /* mmu notifier for the SVA domain */ struct list_head dev_data_list; /* List of pdom_dev_data */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 7f6acbf8e8f4..48472f45d6b7 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1257,18 +1257,17 @@ out_unlock: static void domain_flush_complete(struct protection_domain *domain) { - int i; + struct pdom_iommu_info *pdom_iommu_info; + unsigned long i; - for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { - if (domain && !domain->dev_iommu[i]) - continue; + lockdep_assert_held(&domain->lock); - /* - * Devices of this domain are behind this IOMMU - * We need to wait for completion of all commands. - */ - iommu_completion_wait(amd_iommus[i]); - } + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. 
+ */ + xa_for_each(&domain->iommu_array, i, pdom_iommu_info) + iommu_completion_wait(pdom_iommu_info->iommu); } static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) @@ -1450,21 +1449,22 @@ static int domain_flush_pages_v2(struct protection_domain *pdom, static int domain_flush_pages_v1(struct protection_domain *pdom, u64 address, size_t size) { + struct pdom_iommu_info *pdom_iommu_info; struct iommu_cmd cmd; - int ret = 0, i; + int ret = 0; + unsigned long i; + + lockdep_assert_held(&pdom->lock); build_inv_iommu_pages(&cmd, address, size, pdom->id, IOMMU_NO_PASID, false); - for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { - if (!pdom->dev_iommu[i]) - continue; - + xa_for_each(&pdom->iommu_array, i, pdom_iommu_info) { /* * Devices of this domain are behind this IOMMU * We need a TLB flush */ - ret |= iommu_queue_command(amd_iommus[i], &cmd); + ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd); } return ret; @@ -1503,6 +1503,8 @@ static void __domain_flush_pages(struct protection_domain *domain, void amd_iommu_domain_flush_pages(struct protection_domain *domain, u64 address, size_t size) { + lockdep_assert_held(&domain->lock); + if (likely(!amd_iommu_np_cache)) { __domain_flush_pages(domain, address, size); @@ -2014,6 +2016,50 @@ static void destroy_gcr3_table(struct iommu_dev_data *dev_data, free_gcr3_table(gcr3_info); } +static int pdom_attach_iommu(struct amd_iommu *iommu, + struct protection_domain *pdom) +{ + struct pdom_iommu_info *pdom_iommu_info, *curr; + + pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); + if (pdom_iommu_info) { + pdom_iommu_info->refcnt++; + return 0; + } + + pdom_iommu_info = kzalloc(sizeof(*pdom_iommu_info), GFP_ATOMIC); + if (!pdom_iommu_info) + return -ENOMEM; + + pdom_iommu_info->iommu = iommu; + pdom_iommu_info->refcnt = 1; + + curr = xa_cmpxchg(&pdom->iommu_array, iommu->index, + NULL, pdom_iommu_info, GFP_ATOMIC); + if (curr) { + kfree(pdom_iommu_info); + return -ENOSPC; + } + + return 0; +} + +static void pdom_detach_iommu(struct amd_iommu *iommu, + struct protection_domain *pdom) +{ + struct pdom_iommu_info *pdom_iommu_info; + + pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); + if (!pdom_iommu_info) + return; + + pdom_iommu_info->refcnt--; + if (pdom_iommu_info->refcnt == 0) { + xa_erase(&pdom->iommu_array, iommu->index); + kfree(pdom_iommu_info); + } +} + static int do_attach(struct iommu_dev_data *dev_data, struct protection_domain *domain) { @@ -2030,13 +2076,17 @@ static int do_attach(struct iommu_dev_data *dev_data, cfg->amd.nid = dev_to_node(dev_data->dev); /* Do reference counting */ - domain->dev_iommu[iommu->index] += 1; + ret = pdom_attach_iommu(iommu, domain); + if (ret) + return ret; /* Setup GCR3 table */ if (pdom_is_sva_capable(domain)) { ret = init_gcr3_table(dev_data, domain); - if (ret) + if (ret) { + pdom_detach_iommu(iommu, domain); return ret; + } } return ret; @@ -2062,7 +2112,7 @@ static void do_detach(struct iommu_dev_data *dev_data) list_del(&dev_data->list); /* decrease reference counters - needs to happen after the flushes */ - domain->dev_iommu[iommu->index] -= 1; + pdom_detach_iommu(iommu, domain); } /* @@ -2258,6 +2308,7 @@ static void protection_domain_init(struct protection_domain *domain, int nid) spin_lock_init(&domain->lock); INIT_LIST_HEAD(&domain->dev_list); INIT_LIST_HEAD(&domain->dev_data_list); + xa_init(&domain->iommu_array); domain->iop.pgtbl.cfg.amd.nid = nid; } From b73c698fd5b4a00888ad5a77ad09d3f6b48baa2d Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 
2024 06:35:49 +0000 Subject: [PATCH 08/54] iommu/amd: Remove unused amd_iommus variable protection_domain structure is updated to use xarray to track the IOMMUs attached to the domain. Now domain flush code is not using amd_iommus. Hence remove this unused variable. Signed-off-by: Vasant Hegde Reviewed-by: Joerg Roedel Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-6-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 6 ------ drivers/iommu/amd/init.c | 6 ------ 2 files changed, 12 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index f88f6e4e910f..f44de5f31625 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -877,12 +877,6 @@ extern struct list_head amd_iommu_pci_seg_list; */ extern struct list_head amd_iommu_list; -/* - * Array with pointers to each IOMMU struct - * The indices are referenced in the protection domains - */ -extern struct amd_iommu *amd_iommus[MAX_IOMMUS]; - /* * Structure defining one entry in the device table */ diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 939b011d6ff2..0e0a531042ac 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -177,9 +177,6 @@ LIST_HEAD(amd_iommu_pci_seg_list); /* list of all PCI segments */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ -/* Array to assign indices to IOMMUs*/ -struct amd_iommu *amd_iommus[MAX_IOMMUS]; - /* Number of IOMMUs present in the system */ static int amd_iommus_present; @@ -1743,9 +1740,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h, return -ENOSYS; } - /* Index is fine - add IOMMU to the array */ - amd_iommus[iommu->index] = iommu; - /* * Copy data from ACPI table entry to the iommu struct */ From 07bbd660dbd6ff03907d9ddbdfe9deabbd18ac4d Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:50 +0000 Subject: [PATCH 09/54] iommu/amd: Do not detach devices in domain free path All devices attached to a protection domain must be freed before calling domain free. Hence do not try to free devices in domain free path. Continue to throw warning if pdom->dev_list is not empty so that any potential issues can be fixed. 
Signed-off-by: Vasant Hegde Reviewed-by: Joerg Roedel Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-7-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 48472f45d6b7..1219e1555fc2 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2279,21 +2279,6 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev) * *****************************************************************************/ -static void cleanup_domain(struct protection_domain *domain) -{ - struct iommu_dev_data *entry; - - lockdep_assert_held(&domain->lock); - - while (!list_empty(&domain->dev_list)) { - entry = list_first_entry(&domain->dev_list, - struct iommu_dev_data, list); - BUG_ON(!entry->domain); - do_detach(entry); - } - WARN_ON(!list_empty(&domain->dev_list)); -} - void protection_domain_free(struct protection_domain *domain) { WARN_ON(!list_empty(&domain->dev_list)); @@ -2482,16 +2467,7 @@ amd_iommu_domain_alloc_user(struct device *dev, u32 flags, void amd_iommu_domain_free(struct iommu_domain *dom) { - struct protection_domain *domain; - unsigned long flags; - - domain = to_pdomain(dom); - - spin_lock_irqsave(&domain->lock, flags); - - cleanup_domain(domain); - - spin_unlock_irqrestore(&domain->lock, flags); + struct protection_domain *domain = to_pdomain(dom); protection_domain_free(domain); } From d6b47dec368400a62d2b9d44c8e136fc15eac72c Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:51 +0000 Subject: [PATCH 10/54] iommu/amd: Reduce domain lock scope in attach device path Currently the attach device path takes the protection domain lock followed by the dev_data lock. Most of the operations in this function are specific to device data, except pdom_attach_iommu(), which updates the protection domain structure. Hence reduce the scope of the protection domain lock. Note that this changes the locking order: the device lock is now taken before the domain lock (group->mutex -> dev_data->lock -> pdom->lock). dev_data->lock is used only in the device attachment path, so changing the order is fine and will not create any issue. Finally, move the NUMA node assignment to pdom_attach_iommu().
Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-8-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 52 ++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 1219e1555fc2..7412e0bba2cd 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2020,16 +2020,23 @@ static int pdom_attach_iommu(struct amd_iommu *iommu, struct protection_domain *pdom) { struct pdom_iommu_info *pdom_iommu_info, *curr; + struct io_pgtable_cfg *cfg = &pdom->iop.pgtbl.cfg; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&pdom->lock, flags); pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); if (pdom_iommu_info) { pdom_iommu_info->refcnt++; - return 0; + goto out_unlock; } pdom_iommu_info = kzalloc(sizeof(*pdom_iommu_info), GFP_ATOMIC); - if (!pdom_iommu_info) - return -ENOMEM; + if (!pdom_iommu_info) { + ret = -ENOMEM; + goto out_unlock; + } pdom_iommu_info->iommu = iommu; pdom_iommu_info->refcnt = 1; @@ -2038,43 +2045,52 @@ static int pdom_attach_iommu(struct amd_iommu *iommu, NULL, pdom_iommu_info, GFP_ATOMIC); if (curr) { kfree(pdom_iommu_info); - return -ENOSPC; + ret = -ENOSPC; + goto out_unlock; } - return 0; + /* Update NUMA Node ID */ + if (cfg->amd.nid == NUMA_NO_NODE) + cfg->amd.nid = dev_to_node(&iommu->dev->dev); + +out_unlock: + spin_unlock_irqrestore(&pdom->lock, flags); + return ret; } static void pdom_detach_iommu(struct amd_iommu *iommu, struct protection_domain *pdom) { struct pdom_iommu_info *pdom_iommu_info; + unsigned long flags; + + spin_lock_irqsave(&pdom->lock, flags); pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); - if (!pdom_iommu_info) + if (!pdom_iommu_info) { + spin_unlock_irqrestore(&pdom->lock, flags); return; + } pdom_iommu_info->refcnt--; if (pdom_iommu_info->refcnt == 0) { xa_erase(&pdom->iommu_array, iommu->index); kfree(pdom_iommu_info); } + + spin_unlock_irqrestore(&pdom->lock, flags); } static int do_attach(struct iommu_dev_data *dev_data, struct protection_domain *domain) { struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); - struct io_pgtable_cfg *cfg = &domain->iop.pgtbl.cfg; int ret = 0; /* Update data structures */ dev_data->domain = domain; list_add(&dev_data->list, &domain->dev_list); - /* Update NUMA Node ID */ - if (cfg->amd.nid == NUMA_NO_NODE) - cfg->amd.nid = dev_to_node(dev_data->dev); - /* Do reference counting */ ret = pdom_attach_iommu(iommu, domain); if (ret) @@ -2096,12 +2112,15 @@ static void do_detach(struct iommu_dev_data *dev_data) { struct protection_domain *domain = dev_data->domain; struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + unsigned long flags; /* Clear DTE and flush the entry */ dev_update_dte(dev_data, false); /* Flush IOTLB and wait for the flushes to finish */ + spin_lock_irqsave(&domain->lock, flags); amd_iommu_domain_flush_all(domain); + spin_unlock_irqrestore(&domain->lock, flags); /* Clear GCR3 table */ if (pdom_is_sva_capable(domain)) @@ -2123,11 +2142,8 @@ static int attach_device(struct device *dev, struct protection_domain *domain) { struct iommu_dev_data *dev_data; - unsigned long flags; int ret = 0; - spin_lock_irqsave(&domain->lock, flags); - dev_data = dev_iommu_priv_get(dev); spin_lock(&dev_data->lock); @@ -2142,8 +2158,6 @@ static int attach_device(struct device *dev, out: spin_unlock(&dev_data->lock); - spin_unlock_irqrestore(&domain->lock, flags); 
- return ret; } @@ -2153,13 +2167,9 @@ out: static void detach_device(struct device *dev) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); - struct protection_domain *domain = dev_data->domain; struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); - unsigned long flags; bool ppr = dev_data->ppr; - spin_lock_irqsave(&domain->lock, flags); - spin_lock(&dev_data->lock); /* @@ -2183,8 +2193,6 @@ static void detach_device(struct device *dev) out: spin_unlock(&dev_data->lock); - spin_unlock_irqrestore(&domain->lock, flags); - /* Remove IOPF handler */ if (ppr) amd_iommu_iopf_remove_device(iommu, dev_data); From 4b18ef8491b06e353e8801705092cc292582cb7a Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:52 +0000 Subject: [PATCH 11/54] iommu/amd: Rearrange attach device code attach_device() is just holding lock and calling do_attach(). There is not need to have another function. Just move do_attach() code to attach_device(). Similarly move do_detach() code to detach_device(). Signed-off-by: Vasant Hegde Reviewed-by: Joerg Roedel Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-9-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 97 ++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 58 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 7412e0bba2cd..4426e68de808 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2081,59 +2081,6 @@ static void pdom_detach_iommu(struct amd_iommu *iommu, spin_unlock_irqrestore(&pdom->lock, flags); } -static int do_attach(struct iommu_dev_data *dev_data, - struct protection_domain *domain) -{ - struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); - int ret = 0; - - /* Update data structures */ - dev_data->domain = domain; - list_add(&dev_data->list, &domain->dev_list); - - /* Do reference counting */ - ret = pdom_attach_iommu(iommu, domain); - if (ret) - return ret; - - /* Setup GCR3 table */ - if (pdom_is_sva_capable(domain)) { - ret = init_gcr3_table(dev_data, domain); - if (ret) { - pdom_detach_iommu(iommu, domain); - return ret; - } - } - - return ret; -} - -static void do_detach(struct iommu_dev_data *dev_data) -{ - struct protection_domain *domain = dev_data->domain; - struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); - unsigned long flags; - - /* Clear DTE and flush the entry */ - dev_update_dte(dev_data, false); - - /* Flush IOTLB and wait for the flushes to finish */ - spin_lock_irqsave(&domain->lock, flags); - amd_iommu_domain_flush_all(domain); - spin_unlock_irqrestore(&domain->lock, flags); - - /* Clear GCR3 table */ - if (pdom_is_sva_capable(domain)) - destroy_gcr3_table(dev_data, domain); - - /* Update data structures */ - dev_data->domain = NULL; - list_del(&dev_data->list); - - /* decrease reference counters - needs to happen after the flushes */ - pdom_detach_iommu(iommu, domain); -} - /* * If a device is not yet associated with a domain, this function makes the * device visible in the domain @@ -2141,11 +2088,10 @@ static void do_detach(struct iommu_dev_data *dev_data) static int attach_device(struct device *dev, struct protection_domain *domain) { - struct iommu_dev_data *dev_data; + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); + struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); int ret = 0; - dev_data = dev_iommu_priv_get(dev); - spin_lock(&dev_data->lock); if (dev_data->domain != NULL) { @@ -2153,7 +2099,23 
@@ static int attach_device(struct device *dev, goto out; } - ret = do_attach(dev_data, domain); + /* Update data structures */ + dev_data->domain = domain; + list_add(&dev_data->list, &domain->dev_list); + + /* Do reference counting */ + ret = pdom_attach_iommu(iommu, domain); + if (ret) + goto out; + + /* Setup GCR3 table */ + if (pdom_is_sva_capable(domain)) { + ret = init_gcr3_table(dev_data, domain); + if (ret) { + pdom_detach_iommu(iommu, domain); + goto out; + } + } out: spin_unlock(&dev_data->lock); @@ -2168,7 +2130,9 @@ static void detach_device(struct device *dev) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + struct protection_domain *domain = dev_data->domain; bool ppr = dev_data->ppr; + unsigned long flags; spin_lock(&dev_data->lock); @@ -2188,7 +2152,24 @@ static void detach_device(struct device *dev) dev_data->ppr = false; } - do_detach(dev_data); + /* Clear DTE and flush the entry */ + dev_update_dte(dev_data, false); + + /* Flush IOTLB and wait for the flushes to finish */ + spin_lock_irqsave(&domain->lock, flags); + amd_iommu_domain_flush_all(domain); + spin_unlock_irqrestore(&domain->lock, flags); + + /* Clear GCR3 table */ + if (pdom_is_sva_capable(domain)) + destroy_gcr3_table(dev_data, domain); + + /* Update data structures */ + dev_data->domain = NULL; + list_del(&dev_data->list); + + /* decrease reference counters - needs to happen after the flushes */ + pdom_detach_iommu(iommu, domain); out: spin_unlock(&dev_data->lock); From e843aedbeb82b17a5fe6172449bff133fc8b68a1 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:53 +0000 Subject: [PATCH 12/54] iommu/amd: Convert dev_data lock from spinlock to mutex Currently the attach device path takes dev_data->spinlock, but by design the attach device path can sleep. Also, if the device is PRI capable, it is added to the IOMMU fault handler queue, which takes a mutex. Hence PRI enablement is currently done outside the dev_data lock. Convert the dev_data lock from a spinlock to a mutex so that it follows the design and PRI enablement can be done properly.
Signed-off-by: Vasant Hegde Reviewed-by: Joerg Roedel Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-10-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 2 +- drivers/iommu/amd/iommu.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index f44de5f31625..fdb0357e0bb9 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -836,7 +836,7 @@ struct devid_map { */ struct iommu_dev_data { /*Protect against attach/detach races */ - spinlock_t lock; + struct mutex mutex; struct list_head list; /* For domain->dev_list */ struct llist_node dev_data_list; /* For global dev_data_list */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 4426e68de808..599aae889be8 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -210,7 +210,7 @@ static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid) if (!dev_data) return NULL; - spin_lock_init(&dev_data->lock); + mutex_init(&dev_data->mutex); dev_data->devid = devid; ratelimit_default_init(&dev_data->rs); @@ -2092,7 +2092,7 @@ static int attach_device(struct device *dev, struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); int ret = 0; - spin_lock(&dev_data->lock); + mutex_lock(&dev_data->mutex); if (dev_data->domain != NULL) { ret = -EBUSY; @@ -2118,7 +2118,7 @@ static int attach_device(struct device *dev, } out: - spin_unlock(&dev_data->lock); + mutex_unlock(&dev_data->mutex); return ret; } @@ -2134,7 +2134,7 @@ static void detach_device(struct device *dev) bool ppr = dev_data->ppr; unsigned long flags; - spin_lock(&dev_data->lock); + mutex_lock(&dev_data->mutex); /* * First check if the device is still attached. It might already @@ -2172,7 +2172,7 @@ static void detach_device(struct device *dev) pdom_detach_iommu(iommu, domain); out: - spin_unlock(&dev_data->lock); + mutex_unlock(&dev_data->mutex); /* Remove IOPF handler */ if (ppr) @@ -2470,9 +2470,9 @@ static int blocked_domain_attach_device(struct iommu_domain *domain, detach_device(dev); /* Clear DTE and flush the entry */ - spin_lock(&dev_data->lock); + mutex_lock(&dev_data->mutex); dev_update_dte(dev_data, false); - spin_unlock(&dev_data->lock); + mutex_unlock(&dev_data->mutex); return 0; } From 0b136493d3ffa1358783dcf5b9f866ceef2ff122 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:54 +0000 Subject: [PATCH 13/54] iommu/amd: Reorder attach device code Ideally in attach device path, it should take dev_data lock before making changes to device data including IOPF enablement. So far dev_data was using spinlock and it was hitting lock order issue when it tries to enable IOPF. Hence Commit 526606b0a199 ("iommu/amd: Fix Invalid wait context issue") moved IOPF enablement outside dev_data->lock. Previous patch converted dev_data lock to mutex. Now its safe to call amd_iommu_iopf_add_device() with dev_data->mutex. Hence move back PCI device capability enablement (ATS, PRI, PASID) and IOPF enablement code inside the lock. Also in attach_device(), update 'dev_data->domain' at the end so that error handling becomes simple. 
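The reordering relies on a common pattern: do every step that can fail first, and make the device visible (dev_data->domain, the dev_list entry) only at the end. A hedged userspace sketch of that idea, with hypothetical names rather than the driver's code:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for the per-device state the patch publishes last. */
struct dev_state {
	int attached;
	void *gcr3;
};

static int take_reference(int fail)	/* e.g. the IOMMU refcount step */
{
	return fail ? -ENOMEM : 0;
}

static void drop_reference(void)	/* undo for the step above */
{
}

static int attach(struct dev_state *dev, int fail)
{
	void *gcr3;
	int ret;

	ret = take_reference(fail);
	if (ret)
		return ret;		/* nothing visible was touched yet */

	gcr3 = calloc(1, 64);		/* e.g. the GCR3 table setup step */
	if (!gcr3) {
		drop_reference();	/* only internal steps need unwinding */
		return -ENOMEM;
	}

	/* Publish state only once nothing can fail any more. */
	dev->gcr3 = gcr3;
	dev->attached = 1;
	return 0;
}

int main(void)
{
	struct dev_state dev = { 0 };

	printf("failing attach: %d (attached=%d)\n", attach(&dev, 1), dev.attached);
	printf("working attach: %d (attached=%d)\n", attach(&dev, 0), dev.attached);
	free(dev.gcr3);
	return 0;
}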
Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-11-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 65 +++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 599aae889be8..a481c4565bc3 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2090,6 +2090,7 @@ static int attach_device(struct device *dev, { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + struct pci_dev *pdev; int ret = 0; mutex_lock(&dev_data->mutex); @@ -2099,10 +2100,6 @@ static int attach_device(struct device *dev, goto out; } - /* Update data structures */ - dev_data->domain = domain; - list_add(&dev_data->list, &domain->dev_list); - /* Do reference counting */ ret = pdom_attach_iommu(iommu, domain); if (ret) @@ -2117,6 +2114,28 @@ static int attach_device(struct device *dev, } } + pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL; + if (pdev && pdom_is_sva_capable(domain)) { + pdev_enable_caps(pdev); + + /* + * Device can continue to function even if IOPF + * enablement failed. Hence in error path just + * disable device PRI support. + */ + if (amd_iommu_iopf_add_device(iommu, dev_data)) + pdev_disable_cap_pri(pdev); + } else if (pdev) { + pdev_enable_cap_ats(pdev); + } + + /* Update data structures */ + dev_data->domain = domain; + list_add(&dev_data->list, &domain->dev_list); + + /* Update device table */ + dev_update_dte(dev_data, true); + out: mutex_unlock(&dev_data->mutex); @@ -2131,7 +2150,6 @@ static void detach_device(struct device *dev) struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); struct protection_domain *domain = dev_data->domain; - bool ppr = dev_data->ppr; unsigned long flags; mutex_lock(&dev_data->mutex); @@ -2145,13 +2163,15 @@ static void detach_device(struct device *dev) if (WARN_ON(!dev_data->domain)) goto out; - if (ppr) { + /* Remove IOPF handler */ + if (dev_data->ppr) { iopf_queue_flush_dev(dev); - - /* Updated here so that it gets reflected in DTE */ - dev_data->ppr = false; + amd_iommu_iopf_remove_device(iommu, dev_data); } + if (dev_is_pci(dev)) + pdev_disable_caps(to_pci_dev(dev)); + /* Clear DTE and flush the entry */ dev_update_dte(dev_data, false); @@ -2173,14 +2193,6 @@ static void detach_device(struct device *dev) out: mutex_unlock(&dev_data->mutex); - - /* Remove IOPF handler */ - if (ppr) - amd_iommu_iopf_remove_device(iommu, dev_data); - - if (dev_is_pci(dev)) - pdev_disable_caps(to_pci_dev(dev)); - } static struct iommu_device *amd_iommu_probe_device(struct device *dev) @@ -2509,7 +2521,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct protection_domain *domain = to_pdomain(dom); struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); - struct pci_dev *pdev; int ret; /* @@ -2542,24 +2553,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, } #endif - pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL; - if (pdev && pdom_is_sva_capable(domain)) { - pdev_enable_caps(pdev); - - /* - * Device can continue to function even if IOPF - * enablement failed. Hence in error path just - * disable device PRI support. 
- */ - if (amd_iommu_iopf_add_device(iommu, dev_data)) - pdev_disable_cap_pri(pdev); - } else if (pdev) { - pdev_enable_cap_ats(pdev); - } - - /* Update device table */ - dev_update_dte(dev_data, true); - return ret; } From a0e086b16eca3fe58d30595b0b08d31f8ec7f74e Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:55 +0000 Subject: [PATCH 14/54] iommu/amd: Add ops->release_domain In release path, remove device from existing domain and attach it to blocked domain. So that all DMAs from device is blocked. Note that soon blocked_domain will support other ops like set_dev_pasid() but release_domain supports only attach_dev ops. Hence added separate 'release_domain' variable. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-12-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index a481c4565bc3..fd39b5457883 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2515,6 +2515,14 @@ void amd_iommu_init_identity_domain(void) protection_domain_init(&identity_domain, NUMA_NO_NODE); } +/* Same as blocked domain except it supports only ops->attach_dev() */ +static struct iommu_domain release_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = blocked_domain_attach_device, + } +}; + static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev) { @@ -2894,6 +2902,7 @@ static int amd_iommu_dev_disable_feature(struct device *dev, const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .blocked_domain = &blocked_domain, + .release_domain = &release_domain, .identity_domain = &identity_domain.domain, .domain_alloc = amd_iommu_domain_alloc, .domain_alloc_user = amd_iommu_domain_alloc_user, From 18f5a6b34b0696cd90b182e6af819bbfc6901c2a Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 30 Oct 2024 06:35:56 +0000 Subject: [PATCH 15/54] iommu/amd: Improve amd_iommu_release_device() Previous patch added ops->release_domain support. Core will attach devices to release_domain->attach_dev() before calling this function. Devices are already detached their current domain and attached to blocked domain. This is mostly dummy function now. Just throw warning if device is still attached to domain. 
Suggested-by: Jason Gunthorpe Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241030063556.6104-13-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index fd39b5457883..5ce8e6504ba7 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -77,8 +77,6 @@ DEFINE_IDA(pdom_ids); struct kmem_cache *amd_iommu_irq_cache; -static void detach_device(struct device *dev); - static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev); @@ -563,22 +561,6 @@ static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev) setup_aliases(iommu, dev); } -static void amd_iommu_uninit_device(struct device *dev) -{ - struct iommu_dev_data *dev_data; - - dev_data = dev_iommu_priv_get(dev); - if (!dev_data) - return; - - if (dev_data->domain) - detach_device(dev); - - /* - * We keep dev_data around for unplugged devices and reuse it when the - * device is re-plugged - not doing so would introduce a ton of races. - */ -} /**************************************************************************** * @@ -2249,17 +2231,14 @@ out_err: static void amd_iommu_release_device(struct device *dev) { - struct amd_iommu *iommu; + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); - if (!check_device(dev)) - return; + WARN_ON(dev_data->domain); - iommu = rlookup_amd_iommu(dev); - if (!iommu) - return; - - amd_iommu_uninit_device(dev); - iommu_completion_wait(iommu); + /* + * We keep dev_data around for unplugged devices and reuse it when the + * device is re-plugged - not doing so would introduce a ton of races. + */ } static struct iommu_group *amd_iommu_device_group(struct device *dev) From 35890f85573c2ebbbf3491dc66f7ee2ad63055af Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 30 Oct 2024 21:20:45 -0300 Subject: [PATCH 16/54] vfio: Remove VFIO_TYPE1_NESTING_IOMMU This control causes the ARM SMMU drivers to choose a stage 2 implementation for the IO pagetable (vs the stage 1 usual default), however this choice has no significant visible impact to the VFIO user. Further qemu never implemented this and no other userspace user is known. The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide SMMU translation services to the guest operating system" however the rest of the API to set the guest table pointer for the stage 1 and manage invalidation was never completed, or at least never upstreamed, rendering this part useless dead code. Upstream has now settled on iommufd as the uAPI for controlling nested translation. Choosing the stage 2 implementation should be done by through the IOMMU_HWPT_ALLOC_NEST_PARENT flag during domain allocation. Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the enable_nesting iommu_domain_op. Just in-case there is some userspace using this continue to treat requesting it as a NOP, but do not advertise support any more. 
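As a hedged userspace sketch of the visible effect (error handling omitted, container setup as usual): probing the old extension now reports it as unsupported, while passing the legacy value to VFIO_SET_IOMMU is still accepted and simply behaves as type1v2:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int main(void)
{
	int container = open("/dev/vfio/vfio", O_RDWR);

	/* 6 is the legacy VFIO_TYPE1_NESTING_IOMMU value; now reported as 0. */
	printf("nesting extension: %d\n",
	       ioctl(container, VFIO_CHECK_EXTENSION, 6));
	printf("type1v2 extension: %d\n",
	       ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU));
	return 0;
}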
Acked-by: Alex Williamson Reviewed-by: Mostafa Saleh Reviewed-by: Kevin Tian Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Tested-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 16 ---------------- drivers/iommu/arm/arm-smmu/arm-smmu.c | 16 ---------------- drivers/iommu/iommu.c | 10 ---------- drivers/iommu/iommufd/vfio_compat.c | 7 +------ drivers/vfio/vfio_iommu_type1.c | 12 +----------- include/linux/iommu.h | 3 --- include/uapi/linux/vfio.h | 2 +- 7 files changed, 3 insertions(+), 63 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 737c5b882355..acf250aeb18b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3378,21 +3378,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) return group; } -static int arm_smmu_enable_nesting(struct iommu_domain *domain) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - int ret = 0; - - mutex_lock(&smmu_domain->init_mutex); - if (smmu_domain->smmu) - ret = -EPERM; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_S2; - mutex_unlock(&smmu_domain->init_mutex); - - return ret; -} - static int arm_smmu_of_xlate(struct device *dev, const struct of_phandle_args *args) { @@ -3514,7 +3499,6 @@ static struct iommu_ops arm_smmu_ops = { .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, - .enable_nesting = arm_smmu_enable_nesting, .free = arm_smmu_domain_free_paging, } }; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 8321962b3714..12b173eec454 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1558,21 +1558,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) return group; } -static int arm_smmu_enable_nesting(struct iommu_domain *domain) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - int ret = 0; - - mutex_lock(&smmu_domain->init_mutex); - if (smmu_domain->smmu) - ret = -EPERM; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; - mutex_unlock(&smmu_domain->init_mutex); - - return ret; -} - static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks) { @@ -1656,7 +1641,6 @@ static struct iommu_ops arm_smmu_ops = { .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, - .enable_nesting = arm_smmu_enable_nesting, .set_pgtable_quirks = arm_smmu_set_pgtable_quirks, .free = arm_smmu_domain_free, } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 83c8e617a2c5..dbd70d5a4702 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2723,16 +2723,6 @@ static int __init iommu_init(void) } core_initcall(iommu_init); -int iommu_enable_nesting(struct iommu_domain *domain) -{ - if (domain->type != IOMMU_DOMAIN_UNMANAGED) - return -EINVAL; - if (!domain->ops->enable_nesting) - return -EINVAL; - return domain->ops->enable_nesting(domain); -} -EXPORT_SYMBOL_GPL(iommu_enable_nesting); - int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirk) { diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index a3ad5f0b6c59..514aacd64009 100644 
--- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -291,12 +291,7 @@ static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx, case VFIO_DMA_CC_IOMMU: return iommufd_vfio_cc_iommu(ictx); - /* - * This is obsolete, and to be removed from VFIO. It was an incomplete - * idea that got merged. - * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/ - */ - case VFIO_TYPE1_NESTING_IOMMU: + case __VFIO_RESERVED_TYPE1_NESTING_IOMMU: return 0; /* diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index bf391b40e576..50ebc9593c9d 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -72,7 +72,6 @@ struct vfio_iommu { uint64_t pgsize_bitmap; uint64_t num_non_pinned_groups; bool v2; - bool nesting; bool dirty_page_tracking; struct list_head emulated_iommu_groups; }; @@ -2195,12 +2194,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, goto out_free_domain; } - if (iommu->nesting) { - ret = iommu_enable_nesting(domain->domain); - if (ret) - goto out_domain; - } - ret = iommu_attach_group(domain->domain, group->iommu_group); if (ret) goto out_domain; @@ -2541,9 +2534,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) switch (arg) { case VFIO_TYPE1_IOMMU: break; - case VFIO_TYPE1_NESTING_IOMMU: - iommu->nesting = true; - fallthrough; + case __VFIO_RESERVED_TYPE1_NESTING_IOMMU: case VFIO_TYPE1v2_IOMMU: iommu->v2 = true; break; @@ -2638,7 +2629,6 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu, switch (arg) { case VFIO_TYPE1_IOMMU: case VFIO_TYPE1v2_IOMMU: - case VFIO_TYPE1_NESTING_IOMMU: case VFIO_UNMAP_ALL: return 1; case VFIO_UPDATE_VADDR: diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bd722f473635..c88d18d2c928 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -635,7 +635,6 @@ struct iommu_ops { * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE, * including no-snoop TLPs on PCIe or other platform * specific mechanisms. - * @enable_nesting: Enable nesting * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*) * @free: Release the domain after use. 
*/ @@ -663,7 +662,6 @@ struct iommu_domain_ops { dma_addr_t iova); bool (*enforce_cache_coherency)(struct iommu_domain *domain); - int (*enable_nesting)(struct iommu_domain *domain); int (*set_pgtable_quirks)(struct iommu_domain *domain, unsigned long quirks); @@ -844,7 +842,6 @@ extern void iommu_group_put(struct iommu_group *group); extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); -int iommu_enable_nesting(struct iommu_domain *domain); int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 2b68e6cdf190..c8dbf8219c4f 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -35,7 +35,7 @@ #define VFIO_EEH 5 /* Two-stage IOMMU */ -#define VFIO_TYPE1_NESTING_IOMMU 6 /* Implies v2 */ +#define __VFIO_RESERVED_TYPE1_NESTING_IOMMU 6 /* Implies v2 */ #define VFIO_SPAPR_TCE_v2_IOMMU 7 From 1b8655bb8d977ca110436c1cd0ca957c19670c1e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 30 Oct 2024 21:20:46 -0300 Subject: [PATCH 17/54] ACPICA: IORT: Update for revision E.f ACPICA commit c4f5c083d24df9ddd71d5782c0988408cf0fc1ab The IORT spec, Issue E.f (April 2024), adds a new CANWBS bit to the Memory Access Flag field in the Memory Access Properties table, mainly for a PCI Root Complex. This CANWBS defines the coherency of memory accesses to be not marked IOWB cacheable/shareable. Its value further implies the coherency impact from a pair of mismatched memory attributes (e.g. in a nested translation case): 0x0: Use of mismatched memory attributes for accesses made by this device may lead to a loss of coherency. 0x1: Coherency of accesses made by this device to locations in Conventional memory are ensured as follows, even if the memory attributes for the accesses presented by the device or provided by the SMMU are different from Inner and Outer Write-back cacheable, Shareable. Link: https://github.com/acpica/acpica/commit/c4f5c083 Acked-by: Rafael J. Wysocki Signed-off-by: Nicolin Chen Acked-by: Hanjun Guo Tested-by: Nicolin Chen Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon --- include/acpi/actbl2.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index d3858eebc255..2e917a8f8bca 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -453,7 +453,7 @@ struct acpi_table_ccel { * IORT - IO Remapping Table * * Conforms to "IO Remapping Table System Software on ARM Platforms", - * Document number: ARM DEN 0049E.e, Sep 2022 + * Document number: ARM DEN 0049E.f, Apr 2024 * ******************************************************************************/ @@ -524,6 +524,7 @@ struct acpi_iort_memory_access { #define ACPI_IORT_MF_COHERENCY (1) #define ACPI_IORT_MF_ATTRIBUTES (1<<1) +#define ACPI_IORT_MF_CANWBS (1<<2) /* * IORT node specific subtables From 807404d66fcf898d4bcc6a3e3edb07ffd5b88400 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 30 Oct 2024 21:20:47 -0300 Subject: [PATCH 18/54] ACPI/IORT: Support CANWBS memory access flag The IORT spec, Issue E.f (April 2024), adds a new CANWBS bit to the Memory Access Flag field in the Memory Access Properties table, mainly for a PCI Root Complex. 
This CANWBS defines the coherency of memory accesses to be not marked IOWB cacheable/shareable. Its value further implies the coherency impact from a pair of mismatched memory attributes (e.g. in a nested translation case): 0x0: Use of mismatched memory attributes for accesses made by this device may lead to a loss of coherency. 0x1: Coherency of accesses made by this device to locations in Conventional memory are ensured as follows, even if the memory attributes for the accesses presented by the device or provided by the SMMU are different from Inner and Outer Write-back cacheable, Shareable. Note that the loss of coherency on a CANWBS-unsupported HW typically could occur to an SMMU that doesn't implement the S2FWB feature where additional cache flush operations would be required to prevent that from happening. Add a new ACPI_IORT_MF_CANWBS flag and set IOMMU_FWSPEC_PCI_RC_CANWBS upon the presence of this new flag. CANWBS and S2FWB are similar features, in that they both guarantee the VM can not violate coherency, however S2FWB can be bypassed by PCI No Snoop TLPs, while CANWBS cannot. Thus CANWBS meets the requirements to set IOMMU_CAP_ENFORCE_CACHE_COHERENCY. Architecturally ARM has expected that VFIO would disable No Snoop through PCI Config space, if this is done then the two would have the same protections. Tested-by: Nicolin Chen Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Acked-by: Hanjun Guo Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon --- drivers/acpi/arm64/iort.c | 13 +++++++++++++ include/linux/iommu.h | 2 ++ 2 files changed, 15 insertions(+) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 4c745a26226b..1f7e4c691d9e 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1218,6 +1218,17 @@ static bool iort_pci_rc_supports_ats(struct acpi_iort_node *node) return pci_rc->ats_attribute & ACPI_IORT_ATS_SUPPORTED; } +static bool iort_pci_rc_supports_canwbs(struct acpi_iort_node *node) +{ + struct acpi_iort_memory_access *memory_access; + struct acpi_iort_root_complex *pci_rc; + + pci_rc = (struct acpi_iort_root_complex *)node->node_data; + memory_access = + (struct acpi_iort_memory_access *)&pci_rc->memory_properties; + return memory_access->memory_flags & ACPI_IORT_MF_CANWBS; +} + static int iort_iommu_xlate(struct device *dev, struct acpi_iort_node *node, u32 streamid) { @@ -1335,6 +1346,8 @@ int iort_iommu_configure_id(struct device *dev, const u32 *id_in) fwspec = dev_iommu_fwspec_get(dev); if (fwspec && iort_pci_rc_supports_ats(node)) fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS; + if (fwspec && iort_pci_rc_supports_canwbs(node)) + fwspec->flags |= IOMMU_FWSPEC_PCI_RC_CANWBS; } else { node = iort_scan_node(ACPI_IORT_NODE_NAMED_COMPONENT, iort_match_node_callback, dev); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c88d18d2c928..4ad9b9ec6c9b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -991,6 +991,8 @@ struct iommu_fwspec { /* ATS is supported */ #define IOMMU_FWSPEC_PCI_RC_ATS (1 << 0) +/* CANWBS is supported */ +#define IOMMU_FWSPEC_PCI_RC_CANWBS (1 << 1) /* * An iommu attach handle represents a relationship between an iommu domain From e89573cf4a13ca4e314d03d56ac84c0ba2af464b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 30 Oct 2024 21:20:48 -0300 Subject: [PATCH 19/54] iommu/arm-smmu-v3: Report 
IOMMU_CAP_ENFORCE_CACHE_COHERENCY for CANWBS HW with CANWBS is always cache coherent and ignores PCI No Snoop requests as well. This meets the requirement for IOMMU_CAP_ENFORCE_CACHE_COHERENCY, so let's return it. Implement the enforce_cache_coherency() op to reject attaching devices that don't have CANWBS. Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Reviewed-by: Kevin Tian Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 31 +++++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 +++++ 2 files changed, 38 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index acf250aeb18b..38725810c14e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2293,6 +2293,8 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) case IOMMU_CAP_CACHE_COHERENCY: /* Assume that a coherent TCU implies coherent TBUs */ return master->smmu->features & ARM_SMMU_FEAT_COHERENCY; + case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: + return arm_smmu_master_canwbs(master); case IOMMU_CAP_NOEXEC: case IOMMU_CAP_DEFERRED_FLUSH: return true; @@ -2303,6 +2305,26 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) } } +static bool arm_smmu_enforce_cache_coherency(struct iommu_domain *domain) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_master_domain *master_domain; + unsigned long flags; + bool ret = true; + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + if (!arm_smmu_master_canwbs(master_domain->master)) { + ret = false; + break; + } + } + smmu_domain->enforce_cache_coherency = ret; + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + return ret; +} + struct arm_smmu_domain *arm_smmu_domain_alloc(void) { struct arm_smmu_domain *smmu_domain; @@ -2731,6 +2753,14 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, * one of them. 
*/ spin_lock_irqsave(&smmu_domain->devices_lock, flags); + if (smmu_domain->enforce_cache_coherency && + !arm_smmu_master_canwbs(master)) { + spin_unlock_irqrestore(&smmu_domain->devices_lock, + flags); + kfree(master_domain); + return -EINVAL; + } + if (state->ats_enabled) atomic_inc(&smmu_domain->nr_ats_masters); list_add(&master_domain->devices_elm, &smmu_domain->devices); @@ -3493,6 +3523,7 @@ static struct iommu_ops arm_smmu_ops = { .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = arm_smmu_attach_dev, + .enforce_cache_coherency = arm_smmu_enforce_cache_coherency, .set_dev_pasid = arm_smmu_s1_set_dev_pasid, .map_pages = arm_smmu_map_pages, .unmap_pages = arm_smmu_unmap_pages, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 1e9952ca989f..06e3d88932df 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -811,6 +811,7 @@ struct arm_smmu_domain { /* List of struct arm_smmu_master_domain */ struct list_head devices; spinlock_t devices_lock; + bool enforce_cache_coherency : 1; struct mmu_notifier mmu_notifier; }; @@ -893,6 +894,12 @@ int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, struct arm_smmu_cmdq *cmdq); +static inline bool arm_smmu_master_canwbs(struct arm_smmu_master *master) +{ + return dev_iommu_fwspec_get(master->dev)->flags & + IOMMU_FWSPEC_PCI_RC_CANWBS; +} + #ifdef CONFIG_ARM_SMMU_V3_SVA bool arm_smmu_sva_supported(struct arm_smmu_device *smmu); bool arm_smmu_master_sva_supported(struct arm_smmu_master *master); From 6912ec91828b8d7f21b393befad1c36dadbcd751 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 30 Oct 2024 21:20:49 -0300 Subject: [PATCH 20/54] iommu/arm-smmu-v3: Support IOMMU_GET_HW_INFO via struct arm_smmu_hw_info For virtualization cases the IDR/IIDR/AIDR values of the actual SMMU instance need to be available to the VMM so it can construct an appropriate vSMMUv3 that reflects the correct HW capabilities. For userspace page tables these values are required to constrain the valid values within the CD table and the IOPTEs. The kernel does not sanitize these values. If building a VMM then userspace is required to only forward bits into a VM that it knows it can implement. Some bits will also require a VMM to detect if appropriate kernel support is available such as for ATS and BTM. Start a new file and kconfig for the advanced iommufd support. This lets it be compiled out for kernels that are not intended to support virtualization, and allows distros to leave it disabled until they are shipping a matching qemu too. 
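For illustration, a VMM consuming struct iommu_hw_info_arm_smmuv3 is expected to mask off any register bits it cannot or will not emulate before exposing them to a guest. A minimal sketch follows; SMMU_IDR0_BTM is an illustrative define (IDR0.BTM is assumed to be bit 5 here) and is not part of this series:

	/*
	 * Sketch only: sanitize raw IDR values before building a vSMMUv3.
	 * SMMU_IDR0_BTM is an illustrative stand-in for whichever feature
	 * bits the VMM chooses not to forward.
	 */
	#define SMMU_IDR0_BTM		(1U << 5)	/* assumed bit position */

	static void vsmmu_sanitize_hw_info(struct iommu_hw_info_arm_smmuv3 *info)
	{
		/* Hide features that lack matching VMM/kernel support, e.g. BTM. */
		info->idr[0] &= ~SMMU_IDR0_BTM;
	}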
Tested-by: Nicolin Chen Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/Kconfig | 9 +++++ drivers/iommu/arm/arm-smmu-v3/Makefile | 1 + .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 31 ++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 + drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 9 +++++ include/uapi/linux/iommufd.h | 35 +++++++++++++++++++ 6 files changed, 86 insertions(+) create mode 100644 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index b3aa1f5d5321..0c9bceb1653d 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -415,6 +415,15 @@ config ARM_SMMU_V3_SVA Say Y here if your system supports SVA extensions such as PCIe PASID and PRI. +config ARM_SMMU_V3_IOMMUFD + bool "Enable IOMMUFD features for ARM SMMUv3 (EXPERIMENTAL)" + depends on IOMMUFD + help + Support for IOMMUFD features intended to support virtual machines + with accelerated virtual IOMMUs. + + Say Y here if you are doing development and testing on this feature. + config ARM_SMMU_V3_KUNIT_TEST tristate "KUnit tests for arm-smmu-v3 driver" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile index dc98c88b48c8..493a659cc66b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/Makefile +++ b/drivers/iommu/arm/arm-smmu-v3/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o arm_smmu_v3-y := arm-smmu-v3.o +arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_IOMMUFD) += arm-smmu-v3-iommufd.o arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o arm_smmu_v3-$(CONFIG_TEGRA241_CMDQV) += tegra241-cmdqv.o diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c new file mode 100644 index 000000000000..3d2671031c9b --- /dev/null +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ + +#include + +#include "arm-smmu-v3.h" + +void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct iommu_hw_info_arm_smmuv3 *info; + u32 __iomem *base_idr; + unsigned int i; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + base_idr = master->smmu->base + ARM_SMMU_IDR0; + for (i = 0; i <= 5; i++) + info->idr[i] = readl_relaxed(base_idr + i); + info->iidr = readl_relaxed(master->smmu->base + ARM_SMMU_IIDR); + info->aidr = readl_relaxed(master->smmu->base + ARM_SMMU_AIDR); + + *length = sizeof(*info); + *type = IOMMU_HW_INFO_TYPE_ARM_SMMUV3; + + return info; +} diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 38725810c14e..996774d461ae 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3506,6 +3506,7 @@ static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, + .hw_info = arm_smmu_hw_info, .domain_alloc_paging = arm_smmu_domain_alloc_paging, .domain_alloc_sva = arm_smmu_sva_domain_alloc, .domain_alloc_user = 
arm_smmu_domain_alloc_user, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 06e3d88932df..66261fd5bfb2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -81,6 +81,8 @@ struct arm_smmu_device; #define IIDR_REVISION GENMASK(15, 12) #define IIDR_IMPLEMENTER GENMASK(11, 0) +#define ARM_SMMU_AIDR 0x1C + #define ARM_SMMU_CR0 0x20 #define CR0_ATSCHK (1 << 4) #define CR0_CMDQEN (1 << 3) @@ -956,4 +958,11 @@ tegra241_cmdqv_probe(struct arm_smmu_device *smmu) return ERR_PTR(-ENODEV); } #endif /* CONFIG_TEGRA241_CMDQV */ + +#if IS_ENABLED(CONFIG_ARM_SMMU_V3_IOMMUFD) +void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type); +#else +#define arm_smmu_hw_info NULL +#endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */ + #endif /* _ARM_SMMU_V3_H */ diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 72010f71c5e4..b5c94fecb94c 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -484,15 +484,50 @@ struct iommu_hw_info_vtd { __aligned_u64 ecap_reg; }; +/** + * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information + * (IOMMU_HW_INFO_TYPE_ARM_SMMUV3) + * + * @flags: Must be set to 0 + * @__reserved: Must be 0 + * @idr: Implemented features for ARM SMMU Non-secure programming interface + * @iidr: Information about the implementation and implementer of ARM SMMU, + * and architecture version supported + * @aidr: ARM SMMU architecture version + * + * For the details of @idr, @iidr and @aidr, please refer to the chapters + * from 6.3.1 to 6.3.6 in the SMMUv3 Spec. + * + * User space should read the underlying ARM SMMUv3 hardware information for + * the list of supported features. + * + * Note that these values reflect the raw HW capability, without any insight if + * any required kernel driver support is present. Bits may be set indicating the + * HW has functionality that is lacking kernel software support, such as BTM. If + * a VMM is using this information to construct emulated copies of these + * registers it should only forward bits that it knows it can support. + * + * In future, presence of required kernel support will be indicated in flags. + */ +struct iommu_hw_info_arm_smmuv3 { + __u32 flags; + __u32 __reserved; + __u32 idr[6]; + __u32 iidr; + __u32 aidr; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware * info * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type + * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE = 0, IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, + IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, }; /** From 874b87c7539f9de366954917a2e35f95c0a4a4f9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 30 Oct 2024 21:20:50 -0300 Subject: [PATCH 21/54] iommu/arm-smmu-v3: Implement IOMMU_HWPT_ALLOC_NEST_PARENT For SMMUv3 the parent must be a S2 domain, which can be composed into a IOMMU_DOMAIN_NESTED. In future the S2 parent will also need a VMID linked to the VIOMMU and even to KVM. 
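As a usage illustration (assuming the existing iommufd IOMMU_HWPT_ALLOC uAPI; the field names below are from that interface, not from this patch), a VMM would allocate the S2 nest parent roughly like this:

	/* Illustrative userspace sketch: allocate a stage-2 nest parent HWPT. */
	struct iommu_hwpt_alloc cmd = {
		.size = sizeof(cmd),
		.flags = IOMMU_HWPT_ALLOC_NEST_PARENT,
		.dev_id = dev_id,	/* device already bound to the iommufd */
		.pt_id = ioas_id,	/* IOAS backing the stage-2 mappings */
	};

	if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd))
		return -errno;
	/* cmd.out_hwpt_id now names the nest parent domain */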
Reviewed-by: Nicolin Chen Tested-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Jerry Snitselaar Reviewed-by: Mostafa Saleh Reviewed-by: Donald Dutile Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 996774d461ae..80847fa386fc 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3114,7 +3114,8 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, const struct iommu_user_data *user_data) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); - const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_NEST_PARENT; struct arm_smmu_domain *smmu_domain; int ret; @@ -3127,6 +3128,14 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, if (IS_ERR(smmu_domain)) return ERR_CAST(smmu_domain); + if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) { + if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING)) { + ret = -EOPNOTSUPP; + goto err_free; + } + smmu_domain->stage = ARM_SMMU_DOMAIN_S2; + } + smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED; smmu_domain->domain.ops = arm_smmu_ops.default_domain_ops; ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, flags); From f6681abd413919472d8a142420b639a3a8d84673 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 30 Oct 2024 21:20:51 -0300 Subject: [PATCH 22/54] iommu/arm-smmu-v3: Expose the arm_smmu_attach interface The arm-smmuv3-iommufd.c file will need to call these functions too. Remove statics and put them in the header file. Remove the kunit visibility protections from arm_smmu_make_abort_ste() and arm_smmu_make_s2_domain_ste(). 
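Roughly, a caller in arm-smmu-v3-iommufd.c is expected to use the now-exported helpers in the usual prepare / make-STE / install / commit order. A simplified sketch for an S2 attach, with error handling trimmed and names taken from the declarations in this series:

	struct arm_smmu_attach_state state = {
		.master = master,
		.old_domain = iommu_get_domain_for_dev(dev),
		.ssid = IOMMU_NO_PASID,
	};
	struct arm_smmu_ste ste;
	int ret;

	ret = arm_smmu_attach_prepare(&state, &smmu_domain->domain);
	if (ret)
		return ret;

	arm_smmu_make_s2_domain_ste(&ste, master, smmu_domain, state.ats_enabled);
	arm_smmu_install_ste_for_dev(master, &ste);
	arm_smmu_attach_commit(&state);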
Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Jerry Snitselaar Reviewed-by: Mostafa Saleh Reviewed-by: Donald Dutile Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 22 ++++------------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 27 +++++++++++++++++---- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 80847fa386fc..b4b03206afbf 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1549,7 +1549,6 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, } } -VISIBLE_IF_KUNIT void arm_smmu_make_abort_ste(struct arm_smmu_ste *target) { memset(target, 0, sizeof(*target)); @@ -1632,7 +1631,6 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, } EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_cdtable_ste); -VISIBLE_IF_KUNIT void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, @@ -2505,8 +2503,8 @@ arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) } } -static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, - const struct arm_smmu_ste *target) +void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, + const struct arm_smmu_ste *target) { int i, j; struct arm_smmu_device *smmu = master->smmu; @@ -2671,16 +2669,6 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master, spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); } -struct arm_smmu_attach_state { - /* Inputs */ - struct iommu_domain *old_domain; - struct arm_smmu_master *master; - bool cd_needs_ats; - ioasid_t ssid; - /* Resulting state */ - bool ats_enabled; -}; - /* * Start the sequence to attach a domain to a master. The sequence contains three * steps: @@ -2701,8 +2689,8 @@ struct arm_smmu_attach_state { * new_domain can be a non-paging domain. In this case ATS will not be enabled, * and invalidations won't be tracked. */ -static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, - struct iommu_domain *new_domain) +int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, + struct iommu_domain *new_domain) { struct arm_smmu_master *master = state->master; struct arm_smmu_master_domain *master_domain; @@ -2784,7 +2772,7 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, * completes synchronizing the PCI device's ATC and finishes manipulating the * smmu_domain->devices list. 
*/ -static void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) +void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) { struct arm_smmu_master *master = state->master; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 66261fd5bfb2..c9e5290e995a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -830,21 +830,22 @@ struct arm_smmu_entry_writer_ops { void (*sync)(struct arm_smmu_entry_writer *writer); }; +void arm_smmu_make_abort_ste(struct arm_smmu_ste *target); +void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain, + bool ats_enabled); + #if IS_ENABLED(CONFIG_KUNIT) void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits); void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *cur, const __le64 *target); void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits); -void arm_smmu_make_abort_ste(struct arm_smmu_ste *target); void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu, struct arm_smmu_ste *target); void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, struct arm_smmu_master *master, bool ats_enabled, unsigned int s1dss); -void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, - struct arm_smmu_master *master, - struct arm_smmu_domain *smmu_domain, - bool ats_enabled); void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, struct arm_smmu_master *master, struct mm_struct *mm, u16 asid); @@ -902,6 +903,22 @@ static inline bool arm_smmu_master_canwbs(struct arm_smmu_master *master) IOMMU_FWSPEC_PCI_RC_CANWBS; } +struct arm_smmu_attach_state { + /* Inputs */ + struct iommu_domain *old_domain; + struct arm_smmu_master *master; + bool cd_needs_ats; + ioasid_t ssid; + /* Resulting state */ + bool ats_enabled; +}; + +int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, + struct iommu_domain *new_domain); +void arm_smmu_attach_commit(struct arm_smmu_attach_state *state); +void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, + const struct arm_smmu_ste *target); + #ifdef CONFIG_ARM_SMMU_V3_SVA bool arm_smmu_sva_supported(struct arm_smmu_device *smmu); bool arm_smmu_master_sva_supported(struct arm_smmu_master *master); From 7c204426b81822ba768b3a5e5393b1489917fb84 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:21 +0800 Subject: [PATCH 23/54] iommu/vt-d: Add domain_alloc_paging support Add the domain_alloc_paging callback for domain allocation using the iommu_paging_domain_alloc() interface. 
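For context, a simplified view of how the core reaches the new callback; this is a sketch, not the exact code in drivers/iommu/iommu.c, and the helper name is illustrative:

	/* Simplified sketch of the dispatch behind iommu_paging_domain_alloc(). */
	static struct iommu_domain *paging_domain_alloc_sketch(struct device *dev)
	{
		const struct iommu_ops *ops = dev_iommu_ops(dev);
		struct iommu_domain *domain;

		if (!ops->domain_alloc_paging)
			return ERR_PTR(-EOPNOTSUPP);

		domain = ops->domain_alloc_paging(dev);
		if (IS_ERR(domain))
			return domain;

		domain->type = IOMMU_DOMAIN_UNMANAGED;
		return domain;
	}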
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9f6b0780f2ef..4803e0cb8279 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4590,6 +4590,19 @@ static struct iommu_domain identity_domain = { }, }; +static struct iommu_domain *intel_iommu_domain_alloc_paging(struct device *dev) +{ + struct dmar_domain *dmar_domain; + bool first_stage; + + first_stage = first_level_by_default(0); + dmar_domain = paging_domain_alloc(dev, first_stage); + if (IS_ERR(dmar_domain)) + return ERR_CAST(dmar_domain); + + return &dmar_domain->domain; +} + const struct iommu_ops intel_iommu_ops = { .blocked_domain = &blocking_domain, .release_domain = &blocking_domain, @@ -4599,6 +4612,7 @@ const struct iommu_ops intel_iommu_ops = { .domain_alloc = intel_iommu_domain_alloc, .domain_alloc_user = intel_iommu_domain_alloc_user, .domain_alloc_sva = intel_svm_domain_alloc, + .domain_alloc_paging = intel_iommu_domain_alloc_paging, .probe_device = intel_iommu_probe_device, .release_device = intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, From 9ecfcac1fe15e097cfd74663bcb8fbeaf3cc2910 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:22 +0800 Subject: [PATCH 24/54] iommu/vt-d: Remove unused domain_alloc callback With domain_alloc_paging callback supported, the legacy domain_alloc callback will never be used anymore. Remove it to avoid dead code. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 90 ------------------------------------- 1 file changed, 90 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 4803e0cb8279..dd158ff5fd45 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1454,27 +1454,6 @@ static bool first_level_by_default(unsigned int type) return type != IOMMU_DOMAIN_UNMANAGED; } -static struct dmar_domain *alloc_domain(unsigned int type) -{ - struct dmar_domain *domain; - - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; - - domain->nid = NUMA_NO_NODE; - if (first_level_by_default(type)) - domain->use_first_level = true; - INIT_LIST_HEAD(&domain->devices); - INIT_LIST_HEAD(&domain->dev_pasids); - INIT_LIST_HEAD(&domain->cache_tags); - spin_lock_init(&domain->lock); - spin_lock_init(&domain->cache_lock); - xa_init(&domain->iommu_array); - - return domain; -} - int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info, *curr; @@ -1546,20 +1525,6 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) spin_unlock(&iommu->lock); } -static int guestwidth_to_adjustwidth(int gaw) -{ - int agaw; - int r = (gaw - 12) % 9; - - if (r == 0) - agaw = gaw; - else - agaw = gaw + 9 - r; - if (agaw > 64) - agaw = 64; - return agaw; -} - static void domain_exit(struct dmar_domain *domain) { if (domain->pgd) { @@ -3379,27 +3344,6 @@ void device_block_translation(struct device *dev) info->domain = NULL; } -static int md_domain_init(struct dmar_domain *domain, int guest_width) -{ - int adjust_width; - - /* calculate AGAW */ - domain->gaw = guest_width; - adjust_width = 
guestwidth_to_adjustwidth(guest_width); - domain->agaw = width_to_agaw(adjust_width); - - domain->iommu_coherency = false; - domain->iommu_superpage = 0; - domain->max_addr = 0; - - /* always allocate the top pgd */ - domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC); - if (!domain->pgd) - return -ENOMEM; - domain_flush_cache(domain, domain->pgd, PAGE_SIZE); - return 0; -} - static int blocking_domain_attach_dev(struct iommu_domain *domain, struct device *dev) { @@ -3486,39 +3430,6 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st return domain; } -static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) -{ - struct dmar_domain *dmar_domain; - struct iommu_domain *domain; - - switch (type) { - case IOMMU_DOMAIN_DMA: - case IOMMU_DOMAIN_UNMANAGED: - dmar_domain = alloc_domain(type); - if (!dmar_domain) { - pr_err("Can't allocate dmar_domain\n"); - return NULL; - } - if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { - pr_err("Domain initialization failed\n"); - domain_exit(dmar_domain); - return NULL; - } - - domain = &dmar_domain->domain; - domain->geometry.aperture_start = 0; - domain->geometry.aperture_end = - __DOMAIN_MAX_ADDR(dmar_domain->gaw); - domain->geometry.force_aperture = true; - - return domain; - default: - return NULL; - } - - return NULL; -} - static struct iommu_domain * intel_iommu_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *parent, @@ -4609,7 +4520,6 @@ const struct iommu_ops intel_iommu_ops = { .identity_domain = &identity_domain, .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, - .domain_alloc = intel_iommu_domain_alloc, .domain_alloc_user = intel_iommu_domain_alloc_user, .domain_alloc_sva = intel_svm_domain_alloc, .domain_alloc_paging = intel_iommu_domain_alloc_paging, From a98db518dde246e01ead53617dc0a30d6aaa3752 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:23 +0800 Subject: [PATCH 25/54] iommu/vt-d: Enhance compatibility check for paging domain attach The driver now supports domain_alloc_paging, ensuring that a valid device pointer is provided whenever a paging domain is allocated. Additionally, the dmar_domain attributes are set up at the time of allocation. Consistent with the established semantics in the IOMMU core, if a domain is attached to a device and found to be incompatible with the IOMMU hardware capabilities, the operation will return an -EINVAL error. This implicitly advises the caller to allocate a new domain for the device and attempt the domain attachment again. Rename prepare_domain_attach_device() to a more meaningful name. 
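The -EINVAL convention matters to callers that try to share one paging domain across devices behind different IOMMUs; the expected reaction is to fall back to a per-device domain, roughly as in this illustrative sketch (not code from this series):

	ret = iommu_attach_group(shared_domain, group);
	if (ret == -EINVAL) {
		/* shared domain is incompatible with this device's IOMMU */
		new_domain = iommu_paging_domain_alloc(dev);
		if (IS_ERR(new_domain))
			return PTR_ERR(new_domain);
		ret = iommu_attach_group(new_domain, group);
	}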
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 70 ++++++++++++------------------------ drivers/iommu/intel/iommu.h | 3 +- drivers/iommu/intel/nested.c | 2 +- drivers/iommu/intel/pasid.c | 28 +-------------- 4 files changed, 26 insertions(+), 77 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index dd158ff5fd45..eeb341aafe3e 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1606,7 +1606,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int translation = CONTEXT_TT_MULTI_LEVEL; struct dma_pte *pgd = domain->pgd; struct context_entry *context; - int agaw, ret; + int ret; pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); @@ -1623,27 +1623,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain, copied_context_tear_down(iommu, context, bus, devfn); context_clear_entry(context); - context_set_domain_id(context, did); - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. - */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - ret = -ENOMEM; - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - goto out_unlock; - } - if (info && info->ats_supported) translation = CONTEXT_TT_DEV_IOTLB; else translation = CONTEXT_TT_MULTI_LEVEL; context_set_address_root(context, virt_to_phys(pgd)); - context_set_address_width(context, agaw); + context_set_address_width(context, domain->agaw); context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); @@ -1876,20 +1864,9 @@ static int domain_setup_first_level(struct intel_iommu *iommu, u32 pasid) { struct dma_pte *pgd = domain->pgd; - int agaw, level; - int flags = 0; + int level, flags = 0; - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. 
- */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - return -ENOMEM; - } - - level = agaw_to_level(agaw); + level = agaw_to_level(domain->agaw); if (level != 4 && level != 5) return -EINVAL; @@ -3492,42 +3469,41 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) domain_exit(dmar_domain); } -int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev) +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; int addr_width; + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) + return -EPERM; + if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) return -EINVAL; if (domain->dirty_ops && !ssads_supported(iommu)) return -EINVAL; + if (dmar_domain->iommu_coherency != + iommu_paging_structure_coherency(iommu)) + return -EINVAL; + + if (dmar_domain->iommu_superpage != + iommu_superpage_capability(iommu, dmar_domain->use_first_level)) + return -EINVAL; + + if (dmar_domain->use_first_level && + (!sm_supported(iommu) || !ecap_flts(iommu->ecap))) + return -EINVAL; + /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) addr_width = cap_mgaw(iommu->cap); - if (dmar_domain->max_addr > (1LL << addr_width)) + if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw) return -EINVAL; - dmar_domain->gaw = addr_width; - - /* - * Knock out extra levels of page tables if necessary - */ - while (iommu->agaw < dmar_domain->agaw) { - struct dma_pte *pte; - - pte = dmar_domain->pgd; - if (dma_pte_present(pte)) { - dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); - iommu_free_page(pte); - } - dmar_domain->agaw--; - } if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && context_copied(iommu, info->bus, info->devfn)) @@ -3543,7 +3519,7 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, device_block_translation(dev); - ret = prepare_domain_attach_device(domain, dev); + ret = paging_domain_compatible(domain, dev); if (ret) return ret; @@ -4214,7 +4190,7 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (context_copied(iommu, info->bus, info->devfn)) return -EBUSY; - ret = prepare_domain_attach_device(domain, dev); + ret = paging_domain_compatible(domain, dev); if (ret) return ret; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 1497f3112b12..b1928ca3aaa8 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1230,8 +1230,7 @@ void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); -int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev); +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev); void domain_update_iommu_cap(struct dmar_domain *domain); int dmar_ir_support(void); diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 433c58944401..96016bc40f94 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -40,7 +40,7 @@ static int intel_nested_attach_dev(struct 
iommu_domain *domain, * The s2_domain will be used in nested translation, hence needs * to ensure the s2_domain is compatible with this IOMMU. */ - ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev); + ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev); if (ret) { dev_err_ratelimited(dev, "s2 domain is not compatible\n"); return ret; diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 2e5fa0a23299..53157e1194f4 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -345,25 +345,6 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return 0; } -/* - * Skip top levels of page tables for iommu which has less agaw - * than default. Unnecessary for PT mode. - */ -static int iommu_skip_agaw(struct dmar_domain *domain, - struct intel_iommu *iommu, - struct dma_pte **pgd) -{ - int agaw; - - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - *pgd = phys_to_virt(dma_pte_addr(*pgd)); - if (!dma_pte_present(*pgd)) - return -EINVAL; - } - - return agaw; -} - /* * Set up the scalable mode pasid entry for second only translation type. */ @@ -374,7 +355,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct pasid_entry *pte; struct dma_pte *pgd; u64 pgd_val; - int agaw; u16 did; /* @@ -388,12 +368,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, } pgd = domain->pgd; - agaw = iommu_skip_agaw(domain, iommu, &pgd); - if (agaw < 0) { - dev_err(dev, "Invalid domain page table\n"); - return -EINVAL; - } - pgd_val = virt_to_phys(pgd); did = domain_id_iommu(domain, iommu); @@ -412,7 +386,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_slptr(pte, pgd_val); - pasid_set_address_width(pte, agaw); + pasid_set_address_width(pte, domain->agaw); pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); From c376a3456d8bef43ec556a98c0a04c35086c2737 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:24 +0800 Subject: [PATCH 26/54] iommu/vt-d: Remove domain_update_iommu_cap() The attributes of a paging domain are initialized during the allocation process, and any attempt to attach a domain that is not compatible will result in a failure. Therefore, there is no need to update the domain attributes at the time of domain attachment. 
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 83 ------------------------------------- drivers/iommu/intel/iommu.h | 1 - 2 files changed, 84 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index eeb341aafe3e..756caa24008f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -352,36 +352,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); } -static void domain_update_iommu_coherency(struct dmar_domain *domain) -{ - struct iommu_domain_info *info; - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - bool found = false; - unsigned long i; - - domain->iommu_coherency = true; - xa_for_each(&domain->iommu_array, i, info) { - found = true; - if (!iommu_paging_structure_coherency(info->iommu)) { - domain->iommu_coherency = false; - break; - } - } - if (found) - return; - - /* No hardware attached; use lowest common denominator */ - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (!iommu_paging_structure_coherency(iommu)) { - domain->iommu_coherency = false; - break; - } - } - rcu_read_unlock(); -} - static int domain_update_iommu_superpage(struct dmar_domain *domain, struct intel_iommu *skip) { @@ -412,29 +382,6 @@ static int domain_update_iommu_superpage(struct dmar_domain *domain, return fls(mask); } -static int domain_update_device_node(struct dmar_domain *domain) -{ - struct device_domain_info *info; - int nid = NUMA_NO_NODE; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(info, &domain->devices, link) { - /* - * There could possibly be multiple device numa nodes as devices - * within the same domain may sit behind different IOMMUs. There - * isn't perfect answer in such situation, so we select first - * come first served policy. - */ - nid = dev_to_node(info->dev); - if (nid != NUMA_NO_NODE) - break; - } - spin_unlock_irqrestore(&domain->lock, flags); - - return nid; -} - /* Return the super pagesize bitmap if supported. */ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) { @@ -452,34 +399,6 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) return bitmap; } -/* Some capabilities may be different across iommus */ -void domain_update_iommu_cap(struct dmar_domain *domain) -{ - domain_update_iommu_coherency(domain); - domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); - - /* - * If RHSA is missing, we should default to the device numa domain - * as fall back. - */ - if (domain->nid == NUMA_NO_NODE) - domain->nid = domain_update_device_node(domain); - - /* - * First-level translation restricts the input-address to a - * canonical address (i.e., address bits 63:N have the same - * value as address bit [N-1], where N is 48-bits with 4-level - * paging and 57-bits with 5-level paging). Hence, skip bit - * [N-1]. 
- */ - if (domain->use_first_level) - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); - else - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); - - domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); -} - struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, u8 devfn, int alloc) { @@ -1493,7 +1412,6 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) ret = xa_err(curr) ? : -EBUSY; goto err_clear; } - domain_update_iommu_cap(domain); spin_unlock(&iommu->lock); return 0; @@ -1519,7 +1437,6 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) clear_bit(info->did, iommu->domain_ids); xa_erase(&domain->iommu_array, iommu->seq_id); domain->nid = NUMA_NO_NODE; - domain_update_iommu_cap(domain); kfree(info); } spin_unlock(&iommu->lock); diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index b1928ca3aaa8..f9fba9a26dac 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1231,7 +1231,6 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); int paging_domain_compatible(struct iommu_domain *domain, struct device *dev); -void domain_update_iommu_cap(struct dmar_domain *domain); int dmar_ir_support(void); From 5bdd86ec5d19060f63c00fff3b081c887242a37a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:25 +0800 Subject: [PATCH 27/54] iommu/vt-d: Remove domain_update_iommu_superpage() The requirement for consistent super page support across all the IOMMU hardware in the system has been removed. In the past, if a new IOMMU was hot-added and lacked consistent super page capability, the hot-add process would be aborted. However, with the updated attachment semantics, it is now permissible for the super page capability to vary among different IOMMU hardware units. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 39 +------------------------------------ 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 756caa24008f..36854b683b11 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -352,36 +352,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); } -static int domain_update_iommu_superpage(struct dmar_domain *domain, - struct intel_iommu *skip) -{ - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - int mask = 0x3; - - if (!intel_iommu_superpage) - return 0; - - /* set iommu_superpage to the smallest common denominator */ - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (iommu != skip) { - if (domain && domain->use_first_level) { - if (!cap_fl1gp_support(iommu->cap)) - mask = 0x1; - } else { - mask &= cap_super_page_val(iommu->cap); - } - - if (!mask) - break; - } - } - rcu_read_unlock(); - - return fls(mask); -} - /* Return the super pagesize bitmap if supported. 
*/ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) { @@ -2605,20 +2575,13 @@ int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) static int intel_iommu_add(struct dmar_drhd_unit *dmaru) { - int sp, ret; struct intel_iommu *iommu = dmaru->iommu; + int ret; ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); if (ret) goto out; - sp = domain_update_iommu_superpage(NULL, iommu) - 1; - if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { - pr_warn("%s: Doesn't support large page.\n", - iommu->name); - return -ENXIO; - } - /* * Disable translation if already enabled prior to OS handover. */ From ed56de8a9e90d9771c4517fb9f2daac8282269ba Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:26 +0800 Subject: [PATCH 28/54] iommu/vt-d: Refactor first_level_by_default() The first stage page table is compatible across host and guest kernels. Therefore, this driver uses the first stage page table as the default for paging domains. The helper first_level_by_default() determines the feasibility of using the first stage page table based on a global policy. This policy requires consistency in scalable mode and first stage translation capability among all iommu units. However, this is unnecessary as domain allocation, attachment, and removal operations are performed on a per-device basis. The domain type (IOMMU_DOMAIN_DMA vs. IOMMU_DOMAIN_UNMANAGED) should not be a factor in determining the first stage page table usage. Both types are for paging domains, and there's no fundamental difference between them. The driver should not be aware of this distinction unless the core specifies allocation flags that require special handling. Convert first_level_by_default() from global to per-iommu and remove the 'type' input. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 36854b683b11..bad9593f2464 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1329,18 +1329,17 @@ static void free_dmar_iommu(struct intel_iommu *iommu) * Check and return whether first level is used by default for * DMA translation. */ -static bool first_level_by_default(unsigned int type) +static bool first_level_by_default(struct intel_iommu *iommu) { /* Only SL is available in legacy mode */ - if (!scalable_mode_support()) + if (!sm_supported(iommu)) return false; /* Only level (either FL or SL) is available, just use it */ - if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) - return intel_cap_flts_sanity(); + if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap)) + return ecap_flts(iommu->ecap); - /* Both levels are available, decide it based on domain type */ - return type != IOMMU_DOMAIN_UNMANAGED; + return true; } int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) @@ -3110,7 +3109,7 @@ int __init intel_iommu_init(void) * the virtual and physical IOMMU page-tables. 
*/ if (cap_caching_mode(iommu->cap) && - !first_level_by_default(IOMMU_DOMAIN_DMA)) { + !first_level_by_default(iommu)) { pr_info_once("IOMMU batching disallowed due to virtualization\n"); iommu_set_dma_strict(); } @@ -4359,10 +4358,12 @@ static struct iommu_domain identity_domain = { static struct iommu_domain *intel_iommu_domain_alloc_paging(struct device *dev) { + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; struct dmar_domain *dmar_domain; bool first_stage; - first_stage = first_level_by_default(0); + first_stage = first_level_by_default(iommu); dmar_domain = paging_domain_alloc(dev, first_stage); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); From 621838c718a81ba3bfb8e0f941bc0133166bc534 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:27 +0800 Subject: [PATCH 29/54] iommu/vt-d: Refine intel_iommu_domain_alloc_user() The domain_alloc_user ops should always allocate a guest-compatible page table unless specific allocation flags are specified. Currently, IOMMU_HWPT_ALLOC_NEST_PARENT and IOMMU_HWPT_ALLOC_DIRTY_TRACKING require special handling, as both require hardware support for scalable mode and second-stage translation. In such cases, the driver should select a second-stage page table for the paging domain. Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241021085125.192333-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index bad9593f2464..2b5027dd0c96 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3297,6 +3297,7 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, struct intel_iommu *iommu = info->iommu; struct dmar_domain *dmar_domain; struct iommu_domain *domain; + bool first_stage; /* Must be NESTING domain */ if (parent) { @@ -3313,8 +3314,20 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, if (user_data || (dirty_tracking && !ssads_supported(iommu))) return ERR_PTR(-EOPNOTSUPP); - /* Do not use first stage for user domain translation. */ - dmar_domain = paging_domain_alloc(dev, false); + /* + * Always allocate the guest compatible page table unless + * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING + * is specified. + */ + if (nested_parent || dirty_tracking) { + if (!sm_supported(iommu) || !ecap_slts(iommu->ecap)) + return ERR_PTR(-EOPNOTSUPP); + first_stage = false; + } else { + first_stage = first_level_by_default(iommu); + } + + dmar_domain = paging_domain_alloc(dev, first_stage); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); domain = &dmar_domain->domain; From 2a32309345ef2977ceb4fba81600066474ac8581 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 4 Nov 2024 09:40:28 +0800 Subject: [PATCH 30/54] iommu/vt-d: Use PCI_DEVID() macro The macro PCI_DEVID() can be used instead of compose it manually. 
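For reference, the macro in include/linux/pci.h is essentially the same composition the open-coded sites were doing by hand:

	/* Compose a 16-bit requester ID from bus number and devfn (paraphrased). */
	#define PCI_DEVID(bus, devfn)	((((u16)(bus)) << 8) | (devfn))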
Signed-off-by: Jinjie Ruan Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240829021011.4135618-1-ruanjinjie@huawei.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 4 ++-- drivers/iommu/intel/irq_remapping.c | 4 ++-- drivers/iommu/intel/pasid.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2b5027dd0c96..2d67db67f5e3 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1452,7 +1452,7 @@ static void copied_context_tear_down(struct intel_iommu *iommu, if (did_old < cap_ndoms(iommu->cap)) { iommu->flush.flush_context(iommu, did_old, - (((u16)bus) << 8) | devfn, + PCI_DEVID(bus, devfn), DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); iommu->flush.flush_iotlb(iommu, did_old, 0, 0, @@ -1473,7 +1473,7 @@ static void context_present_cache_flush(struct intel_iommu *iommu, u16 did, { if (cap_caching_mode(iommu->cap)) { iommu->flush.flush_context(iommu, 0, - (((u16)bus) << 8) | devfn, + PCI_DEVID(bus, devfn), DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 7a6d188e3bea..466c1412dd45 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -312,7 +312,7 @@ static int set_ioapic_sid(struct irte *irte, int apic) for (i = 0; i < MAX_IO_APICS; i++) { if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) { - sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn; + sid = PCI_DEVID(ir_ioapic[i].bus, ir_ioapic[i].devfn); break; } } @@ -337,7 +337,7 @@ static int set_hpet_sid(struct irte *irte, u8 id) for (i = 0; i < MAX_HPET_TBS; i++) { if (ir_hpet[i].iommu && ir_hpet[i].id == id) { - sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn; + sid = PCI_DEVID(ir_hpet[i].bus, ir_hpet[i].devfn); break; } } diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 53157e1194f4..7ef157615e0f 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -220,7 +220,7 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, if (pci_dev_is_disconnected(to_pci_dev(dev))) return; - sid = info->bus << 8 | info->devfn; + sid = PCI_DEVID(info->bus, info->devfn); qdep = info->ats_qdep; pfsid = info->pfsid; From 6d8bac098e6e44b9a2768e38e9bf77626dc591b7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 4 Nov 2024 09:40:29 +0800 Subject: [PATCH 31/54] iommu/vt-d: Increase buffer size for device name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC is not happy with the current code, e.g.: .../iommu/intel/dmar.c:1063:9: note: ‘sprintf’ output between 6 and 15 bytes into a destination of size 13 1063 | sprintf(iommu->name, "dmar%d", iommu->seq_id); When `make W=1` is supplied, this prevents kernel building. Fix it by increasing the buffer size for device name and use sizeoF() instead of hard coded constants. 
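For reference, the worst case for "dmar%d" is 4 characters for "dmar" plus up to 10 digits for a 32-bit seq_id (11 if it could be negative) plus the terminating NUL, i.e. 15-16 bytes; that matches the 6-to-15-byte range GCC reports and explains why the original 13-byte buffer is too small, while the new 16-byte buffer covers it.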
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241014104529.4025937-1-andriy.shevchenko@linux.intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/dmar.c | 2 +- drivers/iommu/intel/iommu.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index eaf862e8dea1..e16c2b1d7633 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1060,7 +1060,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) err = iommu->seq_id; goto error; } - sprintf(iommu->name, "dmar%d", iommu->seq_id); + snprintf(iommu->name, sizeof(iommu->name), "dmar%d", iommu->seq_id); err = map_iommu(iommu, drhd); if (err) { diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index f9fba9a26dac..79692d7a26d1 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -720,7 +720,7 @@ struct intel_iommu { int msagaw; /* max sagaw of this iommu */ unsigned int irq, pr_irq, perf_irq; u16 segment; /* PCI segment# */ - unsigned char name[13]; /* Device Name */ + unsigned char name[16]; /* Device Name */ #ifdef CONFIG_INTEL_IOMMU unsigned long *domain_ids; /* bitmap of domains */ From 95e2eaf5b91aae6c3a433cd7882733bd806fa3c8 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 4 Nov 2024 09:40:30 +0800 Subject: [PATCH 32/54] iommu/vt-d: Remove unused dmar_msi_read dmar_msi_read() has been unused since 2022 in commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture") Remove it. (dmar_msi_write still exists and is used once). Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20241022002702.302728-1-linux@treblig.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/dmar.c | 13 ------------- include/linux/dmar.h | 1 - 2 files changed, 14 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index e16c2b1d7633..9f424acf474e 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1895,19 +1895,6 @@ void dmar_msi_write(int irq, struct msi_msg *msg) raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } -void dmar_msi_read(int irq, struct msi_msg *msg) -{ - struct intel_iommu *iommu = irq_get_handler_data(irq); - int reg = dmar_msi_reg(iommu, irq); - unsigned long flag; - - raw_spin_lock_irqsave(&iommu->register_lock, flag); - msg->data = readl(iommu->reg + reg + 4); - msg->address_lo = readl(iommu->reg + reg + 8); - msg->address_hi = readl(iommu->reg + reg + 12); - raw_spin_unlock_irqrestore(&iommu->register_lock, flag); -} - static int dmar_fault_do_one(struct intel_iommu *iommu, int type, u8 fault_reason, u32 pasid, u16 source_id, unsigned long long addr) diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 499bb2c63483..692b2b445761 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -292,7 +292,6 @@ static inline void dmar_copy_shared_irte(struct irte *dst, struct irte *src) struct irq_data; extern void dmar_msi_unmask(struct irq_data *data); extern void dmar_msi_mask(struct irq_data *data); -extern void dmar_msi_read(int irq, struct msi_msg *msg); extern void dmar_msi_write(int irq, struct msi_msg *msg); extern int dmar_set_interrupt(struct intel_iommu *iommu); extern irqreturn_t dmar_fault(int irq, void *dev_id); From 4f178e07a2e62293311e2107786a843d6290d77a Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 4 Nov 2024 09:40:31 +0800 Subject: [PATCH 33/54] iommu/vt-d: Drop s1_pgtbl from dmar_domain dmar_domian has 
stored the s1_cfg which includes the s1_pgtbl info, so no need to store s1_pgtbl, hence drop it. Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241025143339.2328991-1-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.h | 2 -- drivers/iommu/intel/nested.c | 1 - drivers/iommu/intel/pasid.c | 3 +-- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 79692d7a26d1..4c6135a2e2f8 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -653,8 +653,6 @@ struct dmar_domain { struct { /* parent page table which the user domain is nested on */ struct dmar_domain *s2_domain; - /* user page table pointer (in GPA) */ - unsigned long s1_pgtbl; /* page table attributes */ struct iommu_hwpt_vtd_s1 s1_cfg; /* link to parent domain siblings */ diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 96016bc40f94..989ca5cc04eb 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -162,7 +162,6 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, domain->use_first_level = true; domain->s2_domain = s2_domain; - domain->s1_pgtbl = vtd.pgtbl_addr; domain->s1_cfg = vtd; domain->domain.ops = &intel_nested_domain_ops; domain->domain.type = IOMMU_DOMAIN_NESTED; diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 7ef157615e0f..7e76062a7ad2 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -560,7 +560,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct dmar_domain *domain) { struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; - pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl; struct dmar_domain *s2_domain = domain->s2_domain; u16 did = domain_id_iommu(domain, iommu); struct dma_pte *pgd = s2_domain->pgd; @@ -611,7 +610,7 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) pasid_set_flpm(pte, 1); - pasid_set_flptr(pte, (uintptr_t)s1_gpgd); + pasid_set_flptr(pte, s1_cfg->pgtbl_addr); if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { pasid_set_sre(pte); From 6ceb93f952f6ca34823ce3650c902c31b8385b40 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 4 Nov 2024 09:40:32 +0800 Subject: [PATCH 34/54] iommu/vt-d: Fix checks and print in dmar_fault_dump_ptes() There are some issues in dmar_fault_dump_ptes(): 1. return value of phys_to_virt() is used for checking if an entry is present. 2. dump is confusing, e.g., "pasid table entry is not present", confusing by unpresent pasid table vs. unpresent pasid table entry. Current code means the former. 3. pgtable_walk() is called without checking if page table is present. Fix 1 by checking present bit of an entry before dump a lower level entry. Fix 2 by removing "entry" string, e.g., "pasid table is not present". Fix 3 by checking page table present before walk. 
Take issue 3 for example, before fix: [ 442.240357] DMAR: pasid dir entry: 0x000000012c83e001 [ 442.246661] DMAR: pasid table entry[0]: 0x0000000000000000 [ 442.253429] DMAR: pasid table entry[1]: 0x0000000000000000 [ 442.260203] DMAR: pasid table entry[2]: 0x0000000000000000 [ 442.266969] DMAR: pasid table entry[3]: 0x0000000000000000 [ 442.273733] DMAR: pasid table entry[4]: 0x0000000000000000 [ 442.280479] DMAR: pasid table entry[5]: 0x0000000000000000 [ 442.287234] DMAR: pasid table entry[6]: 0x0000000000000000 [ 442.293989] DMAR: pasid table entry[7]: 0x0000000000000000 [ 442.300742] DMAR: PTE not present at level 2 After fix: ... [ 357.241214] DMAR: pasid table entry[6]: 0x0000000000000000 [ 357.248022] DMAR: pasid table entry[7]: 0x0000000000000000 [ 357.254824] DMAR: scalable mode page table is not present Fixes: 914ff7719e8a ("iommu/vt-d: Dump DMAR translation structure when DMA fault occurs") Signed-off-by: Zhenzhong Duan Link: https://lore.kernel.org/r/20241024092146.715063-2-zhenzhong.duan@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2d67db67f5e3..2337baa3cb80 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -626,11 +626,11 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); /* root entry dump */ - rt_entry = &iommu->root_entry[bus]; - if (!rt_entry) { - pr_info("root table entry is not present\n"); + if (!iommu->root_entry) { + pr_info("root table is not present\n"); return; } + rt_entry = &iommu->root_entry[bus]; if (sm_supported(iommu)) pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", @@ -641,7 +641,7 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* context entry dump */ ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); if (!ctx_entry) { - pr_info("context table entry is not present\n"); + pr_info("context table is not present\n"); return; } @@ -650,17 +650,23 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* legacy mode does not require PASID entries */ if (!sm_supported(iommu)) { + if (!context_present(ctx_entry)) { + pr_info("legacy mode page table is not present\n"); + return; + } level = agaw_to_level(ctx_entry->hi & 7); pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); goto pgtable_walk; } - /* get the pointer to pasid directory entry */ - dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); - if (!dir) { - pr_info("pasid directory entry is not present\n"); + if (!context_present(ctx_entry)) { + pr_info("pasid directory table is not present\n"); return; } + + /* get the pointer to pasid directory entry */ + dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); + /* For request-without-pasid, get the pasid from context entry */ if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID) pasid = IOMMU_NO_PASID; @@ -672,7 +678,7 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* get the pointer to the pasid table entry */ entries = get_pasid_table_from_pde(pde); if (!entries) { - pr_info("pasid table entry is not present\n"); + pr_info("pasid table is not present\n"); return; } index = pasid & PASID_PTE_MASK; @@ -680,6 +686,11 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, for (i = 0; i < ARRAY_SIZE(pte->val); i++) pr_info("pasid table 
entry[%d]: 0x%016llx\n", i, pte->val[i]); + if (!pasid_pte_is_present(pte)) { + pr_info("scalable mode page table is not present\n"); + return; + } + if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { level = pte->val[2] & BIT_ULL(2) ? 5 : 4; pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); From f1645676f25d2c846798f0233c3a953efd62aafb Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 4 Nov 2024 09:40:33 +0800 Subject: [PATCH 35/54] iommu/vt-d: Fix checks and print in pgtable_walk() There are some issues in pgtable_walk(): 1. Super page is dumped as non-present page 2. dma_pte_superpage() should not check against leaf page table entries 3. Pointer pte is never NULL so checking it is meaningless 4. When an entry is not present, it still makes sense to dump the entry content. Fix 1,2 by checking dma_pte_superpage()'s returned value after level check. Fix 3 by removing pte check. Fix 4 by checking present bit after printing. By this chance, change to print "page table not present" instead of "PTE not present" to be clearer. Fixes: 914ff7719e8a ("iommu/vt-d: Dump DMAR translation structure when DMA fault occurs") Signed-off-by: Zhenzhong Duan Link: https://lore.kernel.org/r/20241024092146.715063-3-zhenzhong.duan@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2337baa3cb80..5095147f6ba2 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -596,14 +596,15 @@ static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, while (1) { offset = pfn_level_offset(pfn, level); pte = &parent[offset]; - if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { - pr_info("PTE not present at level %d\n", level); - break; - } pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); - if (level == 1) + if (!dma_pte_present(pte)) { + pr_info("page table not present at level %d\n", level - 1); + break; + } + + if (level == 1 || dma_pte_superpage(pte)) break; parent = phys_to_virt(dma_pte_addr(pte)); From 4d5440957641fb5652cbef8df6183baf473cab6b Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 4 Nov 2024 09:40:34 +0800 Subject: [PATCH 36/54] iommu/vt-d: Separate page request queue from SVM IO page faults are no longer dependent on CONFIG_INTEL_IOMMU_SVM. Move all Page Request Queue (PRQ) functions that handle prq events to a new file in drivers/iommu/intel/prq.c. The page_req_des struct is now declared in drivers/iommu/intel/prq.c. No functional changes are intended. This is a preparation patch to enable the use of IO page faults outside the SVM/PASID use cases. 
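As a rough, self-contained illustration of one of the helpers that moves along with the queue handling, the sketch below reimplements the canonical-address check in plain userspace C. It is only a model, not driver code: the 47-bit __VIRTUAL_MASK_SHIFT of x86-64 4-level paging is assumed, and, like the kernel version, it relies on the compiler performing arithmetic right shifts on signed values.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VIRTUAL_MASK_SHIFT 47 /* assumed: x86-64 with 4-level paging */

static bool is_canonical_address(uint64_t addr)
{
	/*
	 * Sign-extend the address from bit VIRTUAL_MASK_SHIFT and compare
	 * it with the original value; canonical addresses are unchanged.
	 */
	int shift = 64 - (VIRTUAL_MASK_SHIFT + 1);
	int64_t saddr = (int64_t)(addr << shift) >> shift;

	return saddr == (int64_t)addr;
}

int main(void)
{
	printf("%d\n", is_canonical_address(0x00007fffffffffffULL)); /* 1 */
	printf("%d\n", is_canonical_address(0xffff800000000000ULL)); /* 1 */
	printf("%d\n", is_canonical_address(0x0000800000000000ULL)); /* 0 */
	return 0;
}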
Signed-off-by: Joel Granados Link: https://lore.kernel.org/r/20241015-jag-iopfv8-v4-1-b696ca89ba29@kernel.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/Makefile | 2 +- drivers/iommu/intel/iommu.c | 20 +- drivers/iommu/intel/iommu.h | 14 +- drivers/iommu/intel/prq.c | 410 +++++++++++++++++++++++++++++++++++ drivers/iommu/intel/svm.c | 397 --------------------------------- 5 files changed, 424 insertions(+), 419 deletions(-) create mode 100644 drivers/iommu/intel/prq.c diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index c8beb0281559..d3bb0798092d 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DMAR_TABLE) += dmar.o -obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o cache.o +obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o cache.o prq.o obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 5095147f6ba2..d0c325115b45 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1329,12 +1329,10 @@ static void free_dmar_iommu(struct intel_iommu *iommu) /* free context mapping */ free_context_table(iommu); -#ifdef CONFIG_INTEL_IOMMU_SVM if (pasid_supported(iommu)) { if (ecap_prs(iommu->ecap)) - intel_svm_finish_prq(iommu); + intel_iommu_finish_prq(iommu); } -#endif } /* @@ -2194,19 +2192,18 @@ static int __init init_dmars(void) iommu_flush_write_buffer(iommu); -#ifdef CONFIG_INTEL_IOMMU_SVM if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { /* * Call dmar_alloc_hwirq() with dmar_global_lock held, * could cause possible lock race condition. 
*/ up_write(&dmar_global_lock); - ret = intel_svm_enable_prq(iommu); + ret = intel_iommu_enable_prq(iommu); down_write(&dmar_global_lock); if (ret) goto free_iommu; } -#endif + ret = dmar_set_interrupt(iommu); if (ret) goto free_iommu; @@ -2619,13 +2616,12 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) intel_iommu_init_qi(iommu); iommu_flush_write_buffer(iommu); -#ifdef CONFIG_INTEL_IOMMU_SVM if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { - ret = intel_svm_enable_prq(iommu); + ret = intel_iommu_enable_prq(iommu); if (ret) goto disable_iommu; } -#endif + ret = dmar_set_interrupt(iommu); if (ret) goto disable_iommu; @@ -4072,7 +4068,7 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, intel_iommu_debugfs_remove_dev_pasid(dev_pasid); kfree(dev_pasid); intel_pasid_tear_down_entry(iommu, dev, pasid, false); - intel_drain_pasid_prq(dev, pasid); + intel_iommu_drain_pasid_prq(dev, pasid); } static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, @@ -4415,9 +4411,7 @@ const struct iommu_ops intel_iommu_ops = { .def_domain_type = device_def_domain_type, .remove_dev_pasid = intel_iommu_remove_dev_pasid, .pgsize_bitmap = SZ_4K, -#ifdef CONFIG_INTEL_IOMMU_SVM - .page_response = intel_svm_page_response, -#endif + .page_response = intel_iommu_page_response, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = intel_iommu_attach_device, .set_dev_pasid = intel_iommu_set_dev_pasid, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 4c6135a2e2f8..b3912633ce25 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -728,12 +728,10 @@ struct intel_iommu { struct iommu_flush flush; #endif -#ifdef CONFIG_INTEL_IOMMU_SVM struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ unsigned long prq_seq_number; struct completion prq_complete; -#endif struct iopf_queue *iopf_queue; unsigned char iopfq_name[16]; /* Synchronization between fault report and iommu device release. 
*/ @@ -1274,18 +1272,18 @@ void intel_context_flush_present(struct device_domain_info *info, struct context_entry *context, u16 did, bool affect_domains); +int intel_iommu_enable_prq(struct intel_iommu *iommu); +int intel_iommu_finish_prq(struct intel_iommu *iommu); +void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg); +void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid); + #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); -int intel_svm_enable_prq(struct intel_iommu *iommu); -int intel_svm_finish_prq(struct intel_iommu *iommu); -void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, - struct iommu_page_response *msg); struct iommu_domain *intel_svm_domain_alloc(struct device *dev, struct mm_struct *mm); -void intel_drain_pasid_prq(struct device *dev, u32 pasid); #else static inline void intel_svm_check(struct intel_iommu *iommu) {} -static inline void intel_drain_pasid_prq(struct device *dev, u32 pasid) {} static inline struct iommu_domain *intel_svm_domain_alloc(struct device *dev, struct mm_struct *mm) { diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c new file mode 100644 index 000000000000..edda5da8ba15 --- /dev/null +++ b/drivers/iommu/intel/prq.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 Intel Corporation + * + * Originally split from drivers/iommu/intel/svm.c + */ + +#include +#include + +#include "iommu.h" +#include "pasid.h" +#include "../iommu-pages.h" +#include "trace.h" + +/* Page request queue descriptor */ +struct page_req_dsc { + union { + struct { + u64 type:8; + u64 pasid_present:1; + u64 rsvd:7; + u64 rid:16; + u64 pasid:20; + u64 exe_req:1; + u64 pm_req:1; + u64 rsvd2:10; + }; + u64 qw_0; + }; + union { + struct { + u64 rd_req:1; + u64 wr_req:1; + u64 lpig:1; + u64 prg_index:9; + u64 addr:52; + }; + u64 qw_1; + }; + u64 qw_2; + u64 qw_3; +}; + +/** + * intel_iommu_drain_pasid_prq - Drain page requests and responses for a pasid + * @dev: target device + * @pasid: pasid for draining + * + * Drain all pending page requests and responses related to @pasid in both + * software and hardware. This is supposed to be called after the device + * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB + * and DevTLB have been invalidated. + * + * It waits until all pending page requests for @pasid in the page fault + * queue are completed by the prq handling thread. Then follow the steps + * described in VT-d spec CH7.10 to drain all page requests and page + * responses pending in the hardware. + */ +void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid) +{ + struct device_domain_info *info; + struct dmar_domain *domain; + struct intel_iommu *iommu; + struct qi_desc desc[3]; + struct pci_dev *pdev; + int head, tail; + u16 sid, did; + int qdep; + + info = dev_iommu_priv_get(dev); + if (WARN_ON(!info || !dev_is_pci(dev))) + return; + + if (!info->pri_enabled) + return; + + iommu = info->iommu; + domain = info->domain; + pdev = to_pci_dev(dev); + sid = PCI_DEVID(info->bus, info->devfn); + did = domain ? domain_id_iommu(domain, iommu) : FLPT_DEFAULT_DID; + + qdep = pci_ats_queue_depth(pdev); + + /* + * Check and wait until all pending page requests in the queue are + * handled by the prq handling thread. 
+ */ +prq_retry: + reinit_completion(&iommu->prq_complete); + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + while (head != tail) { + struct page_req_dsc *req; + + req = &iommu->prq[head / sizeof(*req)]; + if (!req->pasid_present || req->pasid != pasid) { + head = (head + sizeof(*req)) & PRQ_RING_MASK; + continue; + } + + wait_for_completion(&iommu->prq_complete); + goto prq_retry; + } + + iopf_queue_flush_dev(dev); + + /* + * Perform steps described in VT-d spec CH7.10 to drain page + * requests and responses in hardware. + */ + memset(desc, 0, sizeof(desc)); + desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | + QI_IWD_FENCE | + QI_IWD_TYPE; + desc[1].qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) | + QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | + QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(info->pfsid); +qi_retry: + reinit_completion(&iommu->prq_complete); + qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + wait_for_completion(&iommu->prq_complete); + goto qi_retry; + } +} + +static bool is_canonical_address(u64 addr) +{ + int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); + long saddr = (long)addr; + + return (((saddr << shift) >> shift) == saddr); +} + +static void handle_bad_prq_event(struct intel_iommu *iommu, + struct page_req_dsc *req, int result) +{ + struct qi_desc desc = { }; + + pr_err("%s: Invalid page request: %08llx %08llx\n", + iommu->name, ((unsigned long long *)req)[0], + ((unsigned long long *)req)[1]); + + if (!req->lpig) + return; + + desc.qw0 = QI_PGRP_PASID(req->pasid) | + QI_PGRP_DID(req->rid) | + QI_PGRP_PASID_P(req->pasid_present) | + QI_PGRP_RESP_CODE(result) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(req->prg_index) | + QI_PGRP_LPIG(req->lpig); + + qi_submit_sync(iommu, &desc, 1, 0); +} + +static int prq_to_iommu_prot(struct page_req_dsc *req) +{ + int prot = 0; + + if (req->rd_req) + prot |= IOMMU_FAULT_PERM_READ; + if (req->wr_req) + prot |= IOMMU_FAULT_PERM_WRITE; + if (req->exe_req) + prot |= IOMMU_FAULT_PERM_EXEC; + if (req->pm_req) + prot |= IOMMU_FAULT_PERM_PRIV; + + return prot; +} + +static void intel_prq_report(struct intel_iommu *iommu, struct device *dev, + struct page_req_dsc *desc) +{ + struct iopf_fault event = { }; + + /* Fill in event data for device specific processing */ + event.fault.type = IOMMU_FAULT_PAGE_REQ; + event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; + event.fault.prm.pasid = desc->pasid; + event.fault.prm.grpid = desc->prg_index; + event.fault.prm.perm = prq_to_iommu_prot(desc); + + if (desc->lpig) + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + if (desc->pasid_present) { + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; + } + + iommu_report_device_fault(dev, &event); +} + +static irqreturn_t prq_event_thread(int irq, void *d) +{ + struct intel_iommu *iommu = d; + struct page_req_dsc *req; + int head, tail, handled; + struct device *dev; + u64 address; + + /* + * Clear PPR bit before reading head/tail registers, to ensure that + * we get a new interrupt if needed. 
+ */ + writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); + + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + handled = (head != tail); + while (head != tail) { + req = &iommu->prq[head / sizeof(*req)]; + address = (u64)req->addr << VTD_PAGE_SHIFT; + + if (unlikely(!req->pasid_present)) { + pr_err("IOMMU: %s: Page request without PASID\n", + iommu->name); +bad_req: + handle_bad_prq_event(iommu, req, QI_RESP_INVALID); + goto prq_advance; + } + + if (unlikely(!is_canonical_address(address))) { + pr_err("IOMMU: %s: Address is not canonical\n", + iommu->name); + goto bad_req; + } + + if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { + pr_err("IOMMU: %s: Page request in Privilege Mode\n", + iommu->name); + goto bad_req; + } + + if (unlikely(req->exe_req && req->rd_req)) { + pr_err("IOMMU: %s: Execution request not supported\n", + iommu->name); + goto bad_req; + } + + /* Drop Stop Marker message. No need for a response. */ + if (unlikely(req->lpig && !req->rd_req && !req->wr_req)) + goto prq_advance; + + /* + * If prq is to be handled outside iommu driver via receiver of + * the fault notifiers, we skip the page response here. + */ + mutex_lock(&iommu->iopf_lock); + dev = device_rbtree_find(iommu, req->rid); + if (!dev) { + mutex_unlock(&iommu->iopf_lock); + goto bad_req; + } + + intel_prq_report(iommu, dev, req); + trace_prq_report(iommu, dev, req->qw_0, req->qw_1, + req->qw_2, req->qw_3, + iommu->prq_seq_number++); + mutex_unlock(&iommu->iopf_lock); +prq_advance: + head = (head + sizeof(*req)) & PRQ_RING_MASK; + } + + dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); + + /* + * Clear the page request overflow bit and wake up all threads that + * are waiting for the completion of this handling. 
+ */ + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", + iommu->name); + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + if (head == tail) { + iopf_queue_discard_partial(iommu->iopf_queue); + writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); + pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", + iommu->name); + } + } + + if (!completion_done(&iommu->prq_complete)) + complete(&iommu->prq_complete); + + return IRQ_RETVAL(handled); +} + +int intel_iommu_enable_prq(struct intel_iommu *iommu) +{ + struct iopf_queue *iopfq; + int irq, ret; + + iommu->prq = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, PRQ_ORDER); + if (!iommu->prq) { + pr_warn("IOMMU: %s: Failed to allocate page request queue\n", + iommu->name); + return -ENOMEM; + } + + irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); + if (irq <= 0) { + pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", + iommu->name); + ret = -EINVAL; + goto free_prq; + } + iommu->pr_irq = irq; + + snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), + "dmar%d-iopfq", iommu->seq_id); + iopfq = iopf_queue_alloc(iommu->iopfq_name); + if (!iopfq) { + pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name); + ret = -ENOMEM; + goto free_hwirq; + } + iommu->iopf_queue = iopfq; + + snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); + + ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, + iommu->prq_name, iommu); + if (ret) { + pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", + iommu->name); + goto free_iopfq; + } + dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + + init_completion(&iommu->prq_complete); + + return 0; + +free_iopfq: + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; +free_hwirq: + dmar_free_hwirq(irq); + iommu->pr_irq = 0; +free_prq: + iommu_free_pages(iommu->prq, PRQ_ORDER); + iommu->prq = NULL; + + return ret; +} + +int intel_iommu_finish_prq(struct intel_iommu *iommu) +{ + dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); + + if (iommu->pr_irq) { + free_irq(iommu->pr_irq, iommu); + dmar_free_hwirq(iommu->pr_irq); + iommu->pr_irq = 0; + } + + if (iommu->iopf_queue) { + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; + } + + iommu_free_pages(iommu->prq, PRQ_ORDER); + iommu->prq = NULL; + + return 0; +} + +void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + u8 bus = info->bus, devfn = info->devfn; + struct iommu_fault_page_request *prm; + struct qi_desc desc; + bool pasid_present; + bool last_page; + u16 sid; + + prm = &evt->fault.prm; + sid = PCI_DEVID(bus, devfn); + pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + + desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | + QI_PGRP_PASID_P(pasid_present) | + QI_PGRP_RESP_CODE(msg->code) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); + desc.qw2 = 0; + desc.qw3 = 0; + + 
qi_submit_sync(iommu, &desc, 1, 0); +} diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 078d1e32a24e..3cc43a958b4d 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -25,92 +25,6 @@ #include "../iommu-pages.h" #include "trace.h" -static irqreturn_t prq_event_thread(int irq, void *d); - -int intel_svm_enable_prq(struct intel_iommu *iommu) -{ - struct iopf_queue *iopfq; - int irq, ret; - - iommu->prq = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, PRQ_ORDER); - if (!iommu->prq) { - pr_warn("IOMMU: %s: Failed to allocate page request queue\n", - iommu->name); - return -ENOMEM; - } - - irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); - if (irq <= 0) { - pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", - iommu->name); - ret = -EINVAL; - goto free_prq; - } - iommu->pr_irq = irq; - - snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), - "dmar%d-iopfq", iommu->seq_id); - iopfq = iopf_queue_alloc(iommu->iopfq_name); - if (!iopfq) { - pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name); - ret = -ENOMEM; - goto free_hwirq; - } - iommu->iopf_queue = iopfq; - - snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); - - ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, - iommu->prq_name, iommu); - if (ret) { - pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", - iommu->name); - goto free_iopfq; - } - dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); - - init_completion(&iommu->prq_complete); - - return 0; - -free_iopfq: - iopf_queue_free(iommu->iopf_queue); - iommu->iopf_queue = NULL; -free_hwirq: - dmar_free_hwirq(irq); - iommu->pr_irq = 0; -free_prq: - iommu_free_pages(iommu->prq, PRQ_ORDER); - iommu->prq = NULL; - - return ret; -} - -int intel_svm_finish_prq(struct intel_iommu *iommu) -{ - dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); - - if (iommu->pr_irq) { - free_irq(iommu->pr_irq, iommu); - dmar_free_hwirq(iommu->pr_irq); - iommu->pr_irq = 0; - } - - if (iommu->iopf_queue) { - iopf_queue_free(iommu->iopf_queue); - iommu->iopf_queue = NULL; - } - - iommu_free_pages(iommu->prq, PRQ_ORDER); - iommu->prq = NULL; - - return 0; -} - void intel_svm_check(struct intel_iommu *iommu) { if (!pasid_supported(iommu)) @@ -240,317 +154,6 @@ free_dev_pasid: return ret; } -/* Page request queue descriptor */ -struct page_req_dsc { - union { - struct { - u64 type:8; - u64 pasid_present:1; - u64 rsvd:7; - u64 rid:16; - u64 pasid:20; - u64 exe_req:1; - u64 pm_req:1; - u64 rsvd2:10; - }; - u64 qw_0; - }; - union { - struct { - u64 rd_req:1; - u64 wr_req:1; - u64 lpig:1; - u64 prg_index:9; - u64 addr:52; - }; - u64 qw_1; - }; - u64 qw_2; - u64 qw_3; -}; - -static bool is_canonical_address(u64 addr) -{ - int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); - long saddr = (long) addr; - - return (((saddr << shift) >> shift) == saddr); -} - -/** - * intel_drain_pasid_prq - Drain page requests and responses for a pasid - * @dev: target device - * @pasid: pasid for draining - * - * Drain all pending page requests and responses related to @pasid in both - * software and hardware. 
This is supposed to be called after the device - * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB - * and DevTLB have been invalidated. - * - * It waits until all pending page requests for @pasid in the page fault - * queue are completed by the prq handling thread. Then follow the steps - * described in VT-d spec CH7.10 to drain all page requests and page - * responses pending in the hardware. - */ -void intel_drain_pasid_prq(struct device *dev, u32 pasid) -{ - struct device_domain_info *info; - struct dmar_domain *domain; - struct intel_iommu *iommu; - struct qi_desc desc[3]; - struct pci_dev *pdev; - int head, tail; - u16 sid, did; - int qdep; - - info = dev_iommu_priv_get(dev); - if (WARN_ON(!info || !dev_is_pci(dev))) - return; - - if (!info->pri_enabled) - return; - - iommu = info->iommu; - domain = info->domain; - pdev = to_pci_dev(dev); - sid = PCI_DEVID(info->bus, info->devfn); - did = domain ? domain_id_iommu(domain, iommu) : FLPT_DEFAULT_DID; - qdep = pci_ats_queue_depth(pdev); - - /* - * Check and wait until all pending page requests in the queue are - * handled by the prq handling thread. - */ -prq_retry: - reinit_completion(&iommu->prq_complete); - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - while (head != tail) { - struct page_req_dsc *req; - - req = &iommu->prq[head / sizeof(*req)]; - if (!req->pasid_present || req->pasid != pasid) { - head = (head + sizeof(*req)) & PRQ_RING_MASK; - continue; - } - - wait_for_completion(&iommu->prq_complete); - goto prq_retry; - } - - iopf_queue_flush_dev(dev); - - /* - * Perform steps described in VT-d spec CH7.10 to drain page - * requests and responses in hardware. - */ - memset(desc, 0, sizeof(desc)); - desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | - QI_IWD_FENCE | - QI_IWD_TYPE; - desc[1].qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | - QI_EIOTLB_TYPE; - desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) | - QI_DEV_EIOTLB_SID(sid) | - QI_DEV_EIOTLB_QDEP(qdep) | - QI_DEIOTLB_TYPE | - QI_DEV_IOTLB_PFSID(info->pfsid); -qi_retry: - reinit_completion(&iommu->prq_complete); - qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); - if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { - wait_for_completion(&iommu->prq_complete); - goto qi_retry; - } -} - -static int prq_to_iommu_prot(struct page_req_dsc *req) -{ - int prot = 0; - - if (req->rd_req) - prot |= IOMMU_FAULT_PERM_READ; - if (req->wr_req) - prot |= IOMMU_FAULT_PERM_WRITE; - if (req->exe_req) - prot |= IOMMU_FAULT_PERM_EXEC; - if (req->pm_req) - prot |= IOMMU_FAULT_PERM_PRIV; - - return prot; -} - -static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, - struct page_req_dsc *desc) -{ - struct iopf_fault event = { }; - - /* Fill in event data for device specific processing */ - event.fault.type = IOMMU_FAULT_PAGE_REQ; - event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; - event.fault.prm.pasid = desc->pasid; - event.fault.prm.grpid = desc->prg_index; - event.fault.prm.perm = prq_to_iommu_prot(desc); - - if (desc->lpig) - event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - if (desc->pasid_present) { - event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; - } - - iommu_report_device_fault(dev, &event); -} - -static void handle_bad_prq_event(struct intel_iommu *iommu, - struct page_req_dsc *req, int result) -{ - struct qi_desc desc = 
{ }; - - pr_err("%s: Invalid page request: %08llx %08llx\n", - iommu->name, ((unsigned long long *)req)[0], - ((unsigned long long *)req)[1]); - - if (!req->lpig) - return; - - desc.qw0 = QI_PGRP_PASID(req->pasid) | - QI_PGRP_DID(req->rid) | - QI_PGRP_PASID_P(req->pasid_present) | - QI_PGRP_RESP_CODE(result) | - QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(req->prg_index) | - QI_PGRP_LPIG(req->lpig); - - qi_submit_sync(iommu, &desc, 1, 0); -} - -static irqreturn_t prq_event_thread(int irq, void *d) -{ - struct intel_iommu *iommu = d; - struct page_req_dsc *req; - int head, tail, handled; - struct device *dev; - u64 address; - - /* - * Clear PPR bit before reading head/tail registers, to ensure that - * we get a new interrupt if needed. - */ - writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); - - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - handled = (head != tail); - while (head != tail) { - req = &iommu->prq[head / sizeof(*req)]; - address = (u64)req->addr << VTD_PAGE_SHIFT; - - if (unlikely(!req->pasid_present)) { - pr_err("IOMMU: %s: Page request without PASID\n", - iommu->name); -bad_req: - handle_bad_prq_event(iommu, req, QI_RESP_INVALID); - goto prq_advance; - } - - if (unlikely(!is_canonical_address(address))) { - pr_err("IOMMU: %s: Address is not canonical\n", - iommu->name); - goto bad_req; - } - - if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { - pr_err("IOMMU: %s: Page request in Privilege Mode\n", - iommu->name); - goto bad_req; - } - - if (unlikely(req->exe_req && req->rd_req)) { - pr_err("IOMMU: %s: Execution request not supported\n", - iommu->name); - goto bad_req; - } - - /* Drop Stop Marker message. No need for a response. */ - if (unlikely(req->lpig && !req->rd_req && !req->wr_req)) - goto prq_advance; - - /* - * If prq is to be handled outside iommu driver via receiver of - * the fault notifiers, we skip the page response here. - */ - mutex_lock(&iommu->iopf_lock); - dev = device_rbtree_find(iommu, req->rid); - if (!dev) { - mutex_unlock(&iommu->iopf_lock); - goto bad_req; - } - - intel_svm_prq_report(iommu, dev, req); - trace_prq_report(iommu, dev, req->qw_0, req->qw_1, - req->qw_2, req->qw_3, - iommu->prq_seq_number++); - mutex_unlock(&iommu->iopf_lock); -prq_advance: - head = (head + sizeof(*req)) & PRQ_RING_MASK; - } - - dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); - - /* - * Clear the page request overflow bit and wake up all threads that - * are waiting for the completion of this handling. 
- */ - if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { - pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", - iommu->name); - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - if (head == tail) { - iopf_queue_discard_partial(iommu->iopf_queue); - writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); - pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", - iommu->name); - } - } - - if (!completion_done(&iommu->prq_complete)) - complete(&iommu->prq_complete); - - return IRQ_RETVAL(handled); -} - -void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, - struct iommu_page_response *msg) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; - u8 bus = info->bus, devfn = info->devfn; - struct iommu_fault_page_request *prm; - struct qi_desc desc; - bool pasid_present; - bool last_page; - u16 sid; - - prm = &evt->fault.prm; - sid = PCI_DEVID(bus, devfn); - pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - - desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | - QI_PGRP_PASID_P(pasid_present) | - QI_PGRP_RESP_CODE(msg->code) | - QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); - desc.qw2 = 0; - desc.qw3 = 0; - - qi_submit_sync(iommu, &desc, 1, 0); -} - static void intel_svm_domain_free(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); From 9f831c16c69e4f2f042919c8409be300fd9f4248 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 4 Nov 2024 09:40:35 +0800 Subject: [PATCH 37/54] iommu/vt-d: Remove the pasid present check in prq_event_thread PASID is not strictly needed when handling a PRQ event; remove the check for the pasid present bit in the request. This change was not included in the creation of prq.c to emphasize the change in capability checks when handing PRQ events. Signed-off-by: Klaus Jensen Reviewed-by: Kevin Tian Signed-off-by: Joel Granados Link: https://lore.kernel.org/r/20241015-jag-iopfv8-v4-2-b696ca89ba29@kernel.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/prq.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c index edda5da8ba15..621cd26504b3 100644 --- a/drivers/iommu/intel/prq.c +++ b/drivers/iommu/intel/prq.c @@ -222,20 +222,14 @@ static irqreturn_t prq_event_thread(int irq, void *d) req = &iommu->prq[head / sizeof(*req)]; address = (u64)req->addr << VTD_PAGE_SHIFT; - if (unlikely(!req->pasid_present)) { - pr_err("IOMMU: %s: Page request without PASID\n", + if (unlikely(!is_canonical_address(address))) { + pr_err("IOMMU: %s: Address is not canonical\n", iommu->name); bad_req: handle_bad_prq_event(iommu, req, QI_RESP_INVALID); goto prq_advance; } - if (unlikely(!is_canonical_address(address))) { - pr_err("IOMMU: %s: Address is not canonical\n", - iommu->name); - goto bad_req; - } - if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { pr_err("IOMMU: %s: Page request in Privilege Mode\n", iommu->name); From 140f5dedbb9ec8d74d24fab5714ffccac0d8b1d3 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 4 Nov 2024 09:40:36 +0800 Subject: [PATCH 38/54] iommu/vt-d: Move IOMMU_IOPF into INTEL_IOMMU Move IOMMU_IOPF from under INTEL_IOMMU_SVM into INTEL_IOMMU. 
This certifies that the core intel iommu utilizes the IOPF library functions, independent of the INTEL_IOMMU_SVM config. Signed-off-by: Joel Granados Link: https://lore.kernel.org/r/20241015-jag-iopfv8-v4-3-b696ca89ba29@kernel.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 88fd32a9323c..f2f538c70650 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -14,6 +14,7 @@ config INTEL_IOMMU depends on PCI_MSI && ACPI && X86 select IOMMU_API select IOMMU_IOVA + select IOMMU_IOPF select IOMMUFD_DRIVER if IOMMUFD select NEED_DMA_MAP_STATE select DMAR_TABLE @@ -50,7 +51,6 @@ config INTEL_IOMMU_SVM depends on X86_64 select MMU_NOTIFIER select IOMMU_SVA - select IOMMU_IOPF help Shared Virtual Memory (SVM) provides a facility for devices to access DMA resources through process address space by From cbeb1b7eee2ffa63b4d809c0a3f068309df933fa Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 4 Nov 2024 09:40:37 +0800 Subject: [PATCH 39/54] iommufd: Enable PRI when doing the iommufd_hwpt_alloc Add IOMMU_HWPT_FAULT_ID_VALID as part of the valid flags when doing an iommufd_hwpt_alloc allowing the use of an iommu fault allocation (iommu_fault_alloc) with the IOMMU_HWPT_ALLOC ioctl. Reviewed-by: Kevin Tian Signed-off-by: Joel Granados Link: https://lore.kernel.org/r/20241015-jag-iopfv8-v4-4-b696ca89ba29@kernel.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 3 ++- drivers/iommu/iommufd/hw_pagetable.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index d0c325115b45..0ac4ce192baa 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3315,7 +3315,8 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, } if (flags & - (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) + (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING + | IOMMU_HWPT_FAULT_ID_VALID))) return ERR_PTR(-EOPNOTSUPP); if (nested_parent && !nested_supported(iommu)) return ERR_PTR(-EOPNOTSUPP); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index d06bf6e6c19f..8f020bc0815f 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -107,7 +107,8 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, const struct iommu_user_data *user_data) { const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_FAULT_ID_VALID; const struct iommu_ops *ops = dev_iommu_ops(idev->dev); struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; From 9baed1c28030ebac5d44b889b3adf155340fa751 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 4 Nov 2024 09:40:38 +0800 Subject: [PATCH 40/54] iommu/vt-d: Drop pasid requirement for prq initialization PASID support within the IOMMU is not required to enable the Page Request Queue, only the PRS capability. 
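A minimal sketch of what the relaxed gating condition boils down to is below; it is plain userspace C rather than driver code, and the PRS bit position (bit 29 of the Extended Capability Register) is an assumption taken from the VT-d register layout, with invented sample register values.

#include <stdint.h>
#include <stdio.h>

/* Assumption: PRS lives in bit 29 of the Extended Capability Register. */
#define ecap_prs(e)	(((e) >> 29) & 0x1)

int main(void)
{
	uint64_t ecap_with_prs = 1ULL << 29;	/* invented sample value */
	uint64_t ecap_without_prs = 0;		/* invented sample value */

	/* PRS alone now decides whether the page request queue is set up. */
	printf("PRQ setup: %s\n", ecap_prs(ecap_with_prs) ? "yes" : "no");
	printf("PRQ setup: %s\n", ecap_prs(ecap_without_prs) ? "yes" : "no");
	return 0;
}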
Signed-off-by: Klaus Jensen Reviewed-by: Kevin Tian Signed-off-by: Joel Granados Link: https://lore.kernel.org/r/20241015-jag-iopfv8-v4-5-b696ca89ba29@kernel.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 0ac4ce192baa..74cf84e3e4e0 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1329,10 +1329,8 @@ static void free_dmar_iommu(struct intel_iommu *iommu) /* free context mapping */ free_context_table(iommu); - if (pasid_supported(iommu)) { - if (ecap_prs(iommu->ecap)) - intel_iommu_finish_prq(iommu); - } + if (ecap_prs(iommu->ecap)) + intel_iommu_finish_prq(iommu); } /* @@ -2192,7 +2190,7 @@ static int __init init_dmars(void) iommu_flush_write_buffer(iommu); - if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { + if (ecap_prs(iommu->ecap)) { /* * Call dmar_alloc_hwirq() with dmar_global_lock held, * could cause possible lock race condition. @@ -2616,7 +2614,7 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) intel_iommu_init_qi(iommu); iommu_flush_write_buffer(iommu); - if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { + if (ecap_prs(iommu->ecap)) { ret = intel_iommu_enable_prq(iommu); if (ret) goto disable_iommu; From c43e1ccdebf2c950545fdf12c5796ad6f7bad7ee Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 4 Nov 2024 09:40:39 +0800 Subject: [PATCH 41/54] iommu/vt-d: Drain PRQs when domain removed from RID As this iommu driver now supports page faults for requests without PASID, page requests should be drained when a domain is removed from the RID2PASID entry. This results in the intel_iommu_drain_pasid_prq() call being moved to intel_pasid_tear_down_entry(). This indicates that when a translation is removed from any PASID entry and the PRI has been enabled on the device, page requests are drained in the domain detachment path. The intel_iommu_drain_pasid_prq() helper has been modified to support sending device TLB invalidation requests for both PASID and non-PASID cases. 
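A condensed model of that behaviour is sketched below. It is not driver code: IOMMU_NO_PASID is assumed to be 0 as in the iommu core, and the strings only name the descriptor combinations the real helper queues after the fenced wait descriptor.

#include <stdint.h>
#include <stdio.h>

#define IOMMU_NO_PASID	0u	/* assumed to match the iommu core definition */

/* Pick the invalidation shape the drain path uses for a torn-down entry. */
static const char *drain_shape(uint32_t pasid)
{
	if (pasid == IOMMU_NO_PASID)
		return "domain-selective IOTLB flush + device-TLB flush";
	return "PASID-based IOTLB flush + PASID device-TLB flush";
}

int main(void)
{
	printf("RID (no PASID): %s\n", drain_shape(IOMMU_NO_PASID));
	printf("PASID 5:        %s\n", drain_shape(5));
	return 0;
}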
Signed-off-by: Lu Baolu Reviewed-by: Yi Liu Link: https://lore.kernel.org/r/20241101045543.70086-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 1 - drivers/iommu/intel/pasid.c | 1 + drivers/iommu/intel/prq.c | 26 +++++++++----------------- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 74cf84e3e4e0..e2b3aa601191 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4067,7 +4067,6 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, intel_iommu_debugfs_remove_dev_pasid(dev_pasid); kfree(dev_pasid); intel_pasid_tear_down_entry(iommu, dev, pasid, false); - intel_iommu_drain_pasid_prq(dev, pasid); } static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 7e76062a7ad2..31665fb62e1c 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -265,6 +265,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); devtlb_invalidation_with_pasid(iommu, dev, pasid); + intel_iommu_drain_pasid_prq(dev, pasid); } /* diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c index 621cd26504b3..c2d792db52c3 100644 --- a/drivers/iommu/intel/prq.c +++ b/drivers/iommu/intel/prq.c @@ -63,26 +63,18 @@ void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid) struct dmar_domain *domain; struct intel_iommu *iommu; struct qi_desc desc[3]; - struct pci_dev *pdev; int head, tail; u16 sid, did; - int qdep; info = dev_iommu_priv_get(dev); - if (WARN_ON(!info || !dev_is_pci(dev))) - return; - if (!info->pri_enabled) return; iommu = info->iommu; domain = info->domain; - pdev = to_pci_dev(dev); sid = PCI_DEVID(info->bus, info->devfn); did = domain ? domain_id_iommu(domain, iommu) : FLPT_DEFAULT_DID; - qdep = pci_ats_queue_depth(pdev); - /* * Check and wait until all pending page requests in the queue are * handled by the prq handling thread. @@ -114,15 +106,15 @@ prq_retry: desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | QI_IWD_FENCE | QI_IWD_TYPE; - desc[1].qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | - QI_EIOTLB_TYPE; - desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) | - QI_DEV_EIOTLB_SID(sid) | - QI_DEV_EIOTLB_QDEP(qdep) | - QI_DEIOTLB_TYPE | - QI_DEV_IOTLB_PFSID(info->pfsid); + if (pasid == IOMMU_NO_PASID) { + qi_desc_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH, &desc[1]); + qi_desc_dev_iotlb(sid, info->pfsid, info->ats_qdep, 0, + MAX_AGAW_PFN_WIDTH, &desc[2]); + } else { + qi_desc_piotlb(did, pasid, 0, -1, 0, &desc[1]); + qi_desc_dev_iotlb_pasid(sid, info->pfsid, pasid, info->ats_qdep, + 0, MAX_AGAW_PFN_WIDTH, &desc[2]); + } qi_retry: reinit_completion(&iommu->prq_complete); qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); From b45a3777ceabbe08ab7a6e97f258191c07cbab8d Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:52 +0800 Subject: [PATCH 42/54] iommu: Pass old domain to set_dev_pasid op To support domain replacement for pasid, the underlying iommu driver needs to know the old domain hence be able to clean up the existing attachment. It would be much convenient for iommu layer to pass down the old domain. Otherwise, iommu drivers would need to track domain for pasids by themselves, this would duplicate code among the iommu drivers. 
Or iommu drivers would rely group->pasid_array to get domain, which may not always the correct one. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Nicolin Chen Reviewed-by: Vasant Hegde Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-2-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu.h | 3 ++- drivers/iommu/amd/pasid.c | 3 ++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 3 ++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 ++- drivers/iommu/intel/iommu.c | 6 ++++-- drivers/iommu/intel/svm.c | 3 ++- drivers/iommu/iommu.c | 3 ++- include/linux/iommu.h | 2 +- 8 files changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 6386fa4556d9..b11b014fa82d 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -52,7 +52,8 @@ struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev, struct mm_struct *mm); void amd_iommu_domain_free(struct iommu_domain *dom); int iommu_sva_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid); + struct device *dev, ioasid_t pasid, + struct iommu_domain *old); void amd_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, struct iommu_domain *domain); diff --git a/drivers/iommu/amd/pasid.c b/drivers/iommu/amd/pasid.c index 0657b9373be5..d1dfc745f55e 100644 --- a/drivers/iommu/amd/pasid.c +++ b/drivers/iommu/amd/pasid.c @@ -100,7 +100,8 @@ static const struct mmu_notifier_ops sva_mn = { }; int iommu_sva_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { struct pdom_dev_data *pdom_dev_data; struct protection_domain *sva_pdom = to_pdomain(domain); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index a7c36654dee5..645da7b69bed 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -332,7 +332,8 @@ void arm_smmu_sva_notifier_synchronize(void) } static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t id) + struct device *dev, ioasid_t id, + struct iommu_domain *old) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_master *master = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 737c5b882355..1517fe2c356e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2856,7 +2856,8 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) } static int arm_smmu_s1_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t id) + struct device *dev, ioasid_t id, + struct iommu_domain *old) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_master *master = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e2b3aa601191..2d1d208d5125 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4070,7 +4070,8 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, } static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + 
struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); @@ -4356,7 +4357,8 @@ static int identity_domain_attach_dev(struct iommu_domain *domain, struct device } static int identity_domain_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 3cc43a958b4d..4a2bd65614ad 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -111,7 +111,8 @@ static const struct mmu_notifier_ops intel_mmuops = { }; static int intel_svm_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 83c8e617a2c5..f3f81c04b8fb 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3331,7 +3331,8 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain, int ret; for_each_group_device(group, device) { - ret = domain->ops->set_dev_pasid(domain, device->dev, pasid); + ret = domain->ops->set_dev_pasid(domain, device->dev, + pasid, NULL); if (ret) goto err_revert; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bd722f473635..32dce80aa7fd 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -642,7 +642,7 @@ struct iommu_ops { struct iommu_domain_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev); int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev, - ioasid_t pasid); + ioasid_t pasid, struct iommu_domain *old); int (*map_pages)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, From 9bd008f1a91530f6b58f524f0979521161513fdd Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:53 +0800 Subject: [PATCH 43/54] iommu/vt-d: Add a helper to flush cache for updating present pasid entry Generalize the logic for flushing pasid-related cache upon changes to bits other than SSADE and P which requires a different flow according to VT-d spec. No functional change is intended. Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-3-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/pasid.c | 52 ++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 31665fb62e1c..8d11701c2e76 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -287,6 +287,39 @@ static void pasid_flush_caches(struct intel_iommu *iommu, } } +/* + * This function is supposed to be used after caller updates the fields + * except for the SSADE and P bit of a pasid table entry. It does the + * below: + * - Flush cacheline if needed + * - Flush the caches per Table 28 ”Guidance to Software for Invalidations“ + * of VT-d spec 5.0. 
+ */ +static void intel_pasid_flush_present(struct intel_iommu *iommu, + struct device *dev, + u32 pasid, u16 did, + struct pasid_entry *pte) +{ + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + /* + * VT-d spec 5.0 table28 states guides for cache invalidation: + * + * - PASID-selective-within-Domain PASID-cache invalidation + * - PASID-selective PASID-based IOTLB invalidation + * - If (pasid is RID_PASID) + * - Global Device-TLB invalidation to affected functions + * Else + * - PASID-based Device-TLB invalidation (with S=1 and + * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions + */ + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + + devtlb_invalidation_with_pasid(iommu, dev, pasid); +} + /* * Set up the scalable mode pasid table entry for first only * translation type. @@ -526,24 +559,7 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, did = pasid_get_domain_id(pte); spin_unlock(&iommu->lock); - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(pte, sizeof(*pte)); - - /* - * VT-d spec 3.4 table23 states guides for cache invalidation: - * - * - PASID-selective-within-Domain PASID-cache invalidation - * - PASID-selective PASID-based IOTLB invalidation - * - If (pasid is RID_PASID) - * - Global Device-TLB invalidation to affected functions - * Else - * - PASID-based Device-TLB invalidation (with S=1 and - * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions - */ - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); - - devtlb_invalidation_with_pasid(iommu, dev, pasid); + intel_pasid_flush_present(iommu, dev, pasid, did, pte); } /** From 2cb5ff623d95f382b13ed975ac99be3211e9a284 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:54 +0800 Subject: [PATCH 44/54] iommu/vt-d: Refactor the pasid setup helpers It is clearer to have a new set of pasid replacement helpers other than extending the existing ones to cover both initial setup and replacement. Then abstract out the common code for manipulating the pasid entry as preparation. No functional change is intended. Suggested-by: Lu Baolu Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-4-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/pasid.c | 169 ++++++++++++++++++++++-------------- 1 file changed, 105 insertions(+), 64 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 8d11701c2e76..6841b9892d55 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -324,6 +324,32 @@ static void intel_pasid_flush_present(struct intel_iommu *iommu, * Set up the scalable mode pasid table entry for first only * translation type. 
*/ +static void pasid_pte_config_first_level(struct intel_iommu *iommu, + struct pasid_entry *pte, + pgd_t *pgd, u16 did, int flags) +{ + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + + /* Setup the first level page table pointer: */ + pasid_set_flptr(pte, (u64)__pa(pgd)); + + if (flags & PASID_FLAG_FL5LP) + pasid_set_flpm(pte, 1); + + if (flags & PASID_FLAG_PAGE_SNOOP) + pasid_set_pgsnp(pte); + + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, iommu->agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + + /* Setup Present and PASID Granular Transfer Type: */ + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); + pasid_set_present(pte); +} + int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, pgd_t *pgd, u32 pasid, u16 did, int flags) @@ -354,24 +380,8 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return -EBUSY; } - pasid_clear_entry(pte); + pasid_pte_config_first_level(iommu, pte, pgd, did, flags); - /* Setup the first level page table pointer: */ - pasid_set_flptr(pte, (u64)__pa(pgd)); - - if (flags & PASID_FLAG_FL5LP) - pasid_set_flpm(pte, 1); - - if (flags & PASID_FLAG_PAGE_SNOOP) - pasid_set_pgsnp(pte); - - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, iommu->agaw); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - - /* Setup Present and PASID Granular Transfer Type: */ - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); - pasid_set_present(pte); spin_unlock(&iommu->lock); pasid_flush_caches(iommu, pte, pasid, did); @@ -382,6 +392,26 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, /* * Set up the scalable mode pasid entry for second only translation type. */ +static void pasid_pte_config_second_level(struct intel_iommu *iommu, + struct pasid_entry *pte, + u64 pgd_val, int agaw, u16 did, + bool dirty_tracking) +{ + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + pasid_set_domain_id(pte, did); + pasid_set_slptr(pte, pgd_val); + pasid_set_address_width(pte, agaw); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); + pasid_set_fault_enable(pte); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (dirty_tracking) + pasid_set_ssade(pte); + + pasid_set_present(pte); +} + int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid) @@ -417,17 +447,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -EBUSY; } - pasid_clear_entry(pte); - pasid_set_domain_id(pte, did); - pasid_set_slptr(pte, pgd_val); - pasid_set_address_width(pte, domain->agaw); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); - pasid_set_fault_enable(pte); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - if (domain->dirty_tracking) - pasid_set_ssade(pte); - - pasid_set_present(pte); + pasid_pte_config_second_level(iommu, pte, pgd_val, domain->agaw, + did, domain->dirty_tracking); spin_unlock(&iommu->lock); pasid_flush_caches(iommu, pte, pasid, did); @@ -507,6 +528,20 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, /* * Set up the scalable mode pasid entry for passthrough translation type. 
*/ +static void pasid_pte_config_pass_through(struct intel_iommu *iommu, + struct pasid_entry *pte, u16 did) +{ + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, iommu->agaw); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); + pasid_set_fault_enable(pte); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_present(pte); +} + int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct device *dev, u32 pasid) { @@ -525,13 +560,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return -EBUSY; } - pasid_clear_entry(pte); - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, iommu->agaw); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); - pasid_set_fault_enable(pte); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - pasid_set_present(pte); + pasid_pte_config_pass_through(iommu, pte, did); spin_unlock(&iommu->lock); pasid_flush_caches(iommu, pte, pasid, did); @@ -562,6 +591,46 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, intel_pasid_flush_present(iommu, dev, pasid, did, pte); } +static void pasid_pte_config_nestd(struct intel_iommu *iommu, + struct pasid_entry *pte, + struct iommu_hwpt_vtd_s1 *s1_cfg, + struct dmar_domain *s2_domain, + u16 did) +{ + struct dma_pte *pgd = s2_domain->pgd; + + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + + if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) + pasid_set_flpm(pte, 1); + + pasid_set_flptr(pte, s1_cfg->pgtbl_addr); + + if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { + pasid_set_sre(pte); + if (s1_cfg->flags & IOMMU_VTD_S1_WPE) + pasid_set_wpe(pte); + } + + if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) + pasid_set_eafe(pte); + + if (s2_domain->force_snooping) + pasid_set_pgsnp(pte); + + pasid_set_slptr(pte, virt_to_phys(pgd)); + pasid_set_fault_enable(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, s2_domain->agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (s2_domain->dirty_tracking) + pasid_set_ssade(pte); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); +} + /** * intel_pasid_setup_nested() - Set up PASID entry for nested translation. 
* @iommu: IOMMU which the device belong to @@ -579,7 +648,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; struct dmar_domain *s2_domain = domain->s2_domain; u16 did = domain_id_iommu(domain, iommu); - struct dma_pte *pgd = s2_domain->pgd; struct pasid_entry *pte; /* Address width should match the address width supported by hardware */ @@ -622,34 +690,7 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, return -EBUSY; } - pasid_clear_entry(pte); - - if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) - pasid_set_flpm(pte, 1); - - pasid_set_flptr(pte, s1_cfg->pgtbl_addr); - - if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { - pasid_set_sre(pte); - if (s1_cfg->flags & IOMMU_VTD_S1_WPE) - pasid_set_wpe(pte); - } - - if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) - pasid_set_eafe(pte); - - if (s2_domain->force_snooping) - pasid_set_pgsnp(pte); - - pasid_set_slptr(pte, virt_to_phys(pgd)); - pasid_set_fault_enable(pte); - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, s2_domain->agaw); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - if (s2_domain->dirty_tracking) - pasid_set_ssade(pte); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); - pasid_set_present(pte); + pasid_pte_config_nestd(iommu, pte, s1_cfg, s2_domain, did); spin_unlock(&iommu->lock); pasid_flush_caches(iommu, pte, pasid, did); From 7543ee63e8113aa34b07df3b16b3b9d2c5f73939 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:55 +0800 Subject: [PATCH 45/54] iommu/vt-d: Add pasid replace helpers pasid replacement allows converting a present pasid entry to be FS, SS, PT or nested, hence add helpers for such operations. Suggested-by: Lu Baolu Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-5-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/pasid.c | 190 ++++++++++++++++++++++++++++++++++++ drivers/iommu/intel/pasid.h | 15 +++ 2 files changed, 205 insertions(+) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 6841b9892d55..0f2a926d3bd5 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -389,6 +389,50 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return 0; } +int intel_pasid_replace_first_level(struct intel_iommu *iommu, + struct device *dev, pgd_t *pgd, + u32 pasid, u16 did, u16 old_did, + int flags) +{ + struct pasid_entry *pte, new_pte; + + if (!ecap_flts(iommu->ecap)) { + pr_err("No first level translation support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((flags & PASID_FLAG_FL5LP) && !cap_fl5lp_support(iommu->cap)) { + pr_err("No 5-level paging support for first-level on %s\n", + iommu->name); + return -EINVAL; + } + + pasid_pte_config_first_level(iommu, &new_pte, pgd, did, flags); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + /* * Set up the scalable mode pasid entry for second only translation type. 
*/ @@ -456,6 +500,57 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return 0; } +int intel_pasid_replace_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u16 old_did, + u32 pasid) +{ + struct pasid_entry *pte, new_pte; + struct dma_pte *pgd; + u64 pgd_val; + u16 did; + + /* + * If hardware advertises no support for second level + * translation, return directly. + */ + if (!ecap_slts(iommu->ecap)) { + pr_err("No second level translation support on %s\n", + iommu->name); + return -EINVAL; + } + + pgd = domain->pgd; + pgd_val = virt_to_phys(pgd); + did = domain_id_iommu(domain, iommu); + + pasid_pte_config_second_level(iommu, &new_pte, pgd_val, + domain->agaw, did, + domain->dirty_tracking); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + /* * Set up dirty tracking on a second only or nested translation type. */ @@ -568,6 +663,38 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return 0; } +int intel_pasid_replace_pass_through(struct intel_iommu *iommu, + struct device *dev, u16 old_did, + u32 pasid) +{ + struct pasid_entry *pte, new_pte; + u16 did = FLPT_DEFAULT_DID; + + pasid_pte_config_pass_through(iommu, &new_pte, did); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + /* * Set the page snoop control for a pasid entry which has been set up. 
*/ @@ -698,6 +825,69 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, return 0; } +int intel_pasid_replace_nested(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + u16 old_did, struct dmar_domain *domain) +{ + struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; + struct dmar_domain *s2_domain = domain->s2_domain; + u16 did = domain_id_iommu(domain, iommu); + struct pasid_entry *pte, new_pte; + + /* Address width should match the address width supported by hardware */ + switch (s1_cfg->addr_width) { + case ADDR_WIDTH_4LEVEL: + break; + case ADDR_WIDTH_5LEVEL: + if (!cap_fl5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + break; + default: + dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n", + s1_cfg->addr_width); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } + + pasid_pte_config_nestd(iommu, &new_pte, s1_cfg, s2_domain, did); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + /* * Interfaces to setup or teardown a pasid table to the scalable-mode * context table entry: diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index dde6d3ba5ae0..06d1f7006d01 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -303,6 +303,21 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct device *dev, u32 pasid); int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct dmar_domain *domain); +int intel_pasid_replace_first_level(struct intel_iommu *iommu, + struct device *dev, pgd_t *pgd, + u32 pasid, u16 did, u16 old_did, + int flags); +int intel_pasid_replace_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u16 old_did, + u32 pasid); +int intel_pasid_replace_pass_through(struct intel_iommu *iommu, + struct device *dev, u16 old_did, + u32 pasid); +int intel_pasid_replace_nested(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + u16 old_did, struct dmar_domain *domain); + void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore); From d93cf86cc66a0b8ae80da8c1ffd6903897786124 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:56 +0800 Subject: [PATCH 46/54] iommu/vt-d: Consolidate the struct dev_pasid_info add/remove The domain_add_dev_pasid() and domain_remove_dev_pasid() are added to consolidate the adding/removing of the struct dev_pasid_info. Besides, it includes the cache tag assign/unassign as well. This also prepares for adding domain replacement for pasid. The set_dev_pasid callbacks need to deal with the dev_pasid_info for both old and new domain. These two helpers make the life easier. 
intel_iommu_set_dev_pasid() and intel_svm_set_dev_pasid() are updated to use the helpers. Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-6-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 91 +++++++++++++++++++++++++------------ drivers/iommu/intel/iommu.h | 6 +++ drivers/iommu/intel/svm.c | 28 +++--------- 3 files changed, 74 insertions(+), 51 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2d1d208d5125..103b109e23a9 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4036,8 +4036,8 @@ static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, return 0; } -static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, - struct iommu_domain *domain) +void domain_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dev_pasid_info *curr, *dev_pasid = NULL; @@ -4045,10 +4045,12 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, struct dmar_domain *dmar_domain; unsigned long flags; - if (domain->type == IOMMU_DOMAIN_IDENTITY) { - intel_pasid_tear_down_entry(iommu, dev, pasid, false); + if (!domain) + return; + + /* Identity domain has no meta data for pasid. */ + if (domain->type == IOMMU_DOMAIN_IDENTITY) return; - } dmar_domain = to_dmar_domain(domain); spin_lock_irqsave(&dmar_domain->lock, flags); @@ -4066,7 +4068,52 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, domain_detach_iommu(dmar_domain, iommu); intel_iommu_debugfs_remove_dev_pasid(dev_pasid); kfree(dev_pasid); - intel_pasid_tear_down_entry(iommu, dev, pasid, false); +} + +static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, + struct iommu_domain *domain) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + intel_pasid_tear_down_entry(info->iommu, dev, pasid, false); + domain_remove_dev_pasid(domain, dev, pasid); +} + +struct dev_pasid_info * +domain_add_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + unsigned long flags; + int ret; + + dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); + if (!dev_pasid) + return ERR_PTR(-ENOMEM); + + ret = domain_attach_iommu(dmar_domain, iommu); + if (ret) + goto out_free; + + ret = cache_tag_assign_domain(dmar_domain, dev, pasid); + if (ret) + goto out_detach_iommu; + + dev_pasid->dev = dev; + dev_pasid->pasid = pasid; + spin_lock_irqsave(&dmar_domain->lock, flags); + list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); + spin_unlock_irqrestore(&dmar_domain->lock, flags); + + return dev_pasid; +out_detach_iommu: + domain_detach_iommu(dmar_domain, iommu); +out_free: + kfree(dev_pasid); + return ERR_PTR(ret); } static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, @@ -4077,7 +4124,6 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; struct dev_pasid_info *dev_pasid; - unsigned long flags; int ret; if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) @@ -4093,17 +4139,9 @@ static int 
intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (ret) return ret; - dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); - if (!dev_pasid) - return -ENOMEM; - - ret = domain_attach_iommu(dmar_domain, iommu); - if (ret) - goto out_free; - - ret = cache_tag_assign_domain(dmar_domain, dev, pasid); - if (ret) - goto out_detach_iommu; + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); if (dmar_domain->use_first_level) ret = domain_setup_first_level(iommu, dmar_domain, @@ -4112,24 +4150,17 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, ret = intel_pasid_setup_second_level(iommu, dmar_domain, dev, pasid); if (ret) - goto out_unassign_tag; + goto out_remove_dev_pasid; - dev_pasid->dev = dev; - dev_pasid->pasid = pasid; - spin_lock_irqsave(&dmar_domain->lock, flags); - list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); - spin_unlock_irqrestore(&dmar_domain->lock, flags); + domain_remove_dev_pasid(old, dev, pasid); if (domain->type & __IOMMU_DOMAIN_PAGING) intel_iommu_debugfs_create_dev_pasid(dev_pasid); return 0; -out_unassign_tag: - cache_tag_unassign_domain(dmar_domain, dev, pasid); -out_detach_iommu: - domain_detach_iommu(dmar_domain, iommu); -out_free: - kfree(dev_pasid); + +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); return ret; } diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index b3912633ce25..df0261e03fad 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1228,6 +1228,12 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); int paging_domain_compatible(struct iommu_domain *domain, struct device *dev); +struct dev_pasid_info * +domain_add_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); +void domain_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); + int dmar_ir_support(void); void iommu_flush_write_buffer(struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 4a2bd65614ad..6c0685ea8466 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -115,43 +115,29 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; struct mm_struct *mm = domain->mm; struct dev_pasid_info *dev_pasid; unsigned long sflags; - unsigned long flags; int ret = 0; - dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); - if (!dev_pasid) - return -ENOMEM; - - dev_pasid->dev = dev; - dev_pasid->pasid = pasid; - - ret = cache_tag_assign_domain(to_dmar_domain(domain), dev, pasid); - if (ret) - goto free_dev_pasid; + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); /* Setup the pasid table: */ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? 
PASID_FLAG_FL5LP : 0; ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, pasid, FLPT_DEFAULT_DID, sflags); if (ret) - goto unassign_tag; + goto out_remove_dev_pasid; - spin_lock_irqsave(&dmar_domain->lock, flags); - list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); - spin_unlock_irqrestore(&dmar_domain->lock, flags); + domain_remove_dev_pasid(old, dev, pasid); return 0; -unassign_tag: - cache_tag_unassign_domain(to_dmar_domain(domain), dev, pasid); -free_dev_pasid: - kfree(dev_pasid); - +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); return ret; } From a1deee90a2cd91d1253c782bc66cd2719a7d0a38 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:57 +0800 Subject: [PATCH 47/54] iommu/vt-d: Add iommu_domain_did() to get did domain_id_iommu() does not support SVA type and identity type domains. Add iommu_domain_did() to support all domain types. Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-7-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.h | 16 ++++++++++++++++ drivers/iommu/intel/pasid.h | 7 ------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index df0261e03fad..cdca7d5061a7 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -806,6 +806,13 @@ static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) return container_of(dom, struct dmar_domain, domain); } +/* + * Domain ID reserved for pasid entries programmed for first-level + * only and pass-through transfer modes. + */ +#define FLPT_DEFAULT_DID 1 +#define NUM_RESERVED_DID 2 + /* Retrieve the domain ID which has allocated to the domain */ static inline u16 domain_id_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) @@ -816,6 +823,15 @@ domain_id_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) return info->did; } +static inline u16 +iommu_domain_did(struct iommu_domain *domain, struct intel_iommu *iommu) +{ + if (domain->type == IOMMU_DOMAIN_SVA || + domain->type == IOMMU_DOMAIN_IDENTITY) + return FLPT_DEFAULT_DID; + return domain_id_iommu(to_dmar_domain(domain), iommu); +} + /* * 0: readable * 1: writable diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 06d1f7006d01..082f4fe20216 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -22,13 +22,6 @@ #define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1) #define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) -/* - * Domain ID reserved for pasid entries programmed for first-level - * only and pass-through transfer modes. - */ -#define FLPT_DEFAULT_DID 1 -#define NUM_RESERVED_DID 2 - #define PASID_FLAG_NESTED BIT(1) #define PASID_FLAG_PAGE_SNOOP BIT(2) From c8596d65b267d46e7f805f55676b62416504c2ec Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:58 +0800 Subject: [PATCH 48/54] iommu/vt-d: Make intel_iommu_set_dev_pasid() to handle domain replacement Let intel_iommu_set_dev_pasid() call the pasid replace helpers hence be able to do domain replacement. 
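For illustration, the dispatch this patch introduces can be reduced to a small standalone sketch: a NULL old domain means initial setup, a non-NULL old domain means replacement of a live PASID entry. The types and helpers below are simplified stand-ins for illustration only, not the kernel's actual definitions; the real wrappers are __domain_setup_first_level() and domain_setup_second_level() in the diff further down.

#include <stdio.h>
#include <stddef.h>

/* Stand-in type: the real code uses struct intel_iommu, struct dmar_domain, etc. */
struct iommu_domain { int did; };

static int pasid_setup(int pasid, struct iommu_domain *new)
{
        printf("pasid %d: initial setup, did %d\n", pasid, new->did);
        return 0;
}

static int pasid_replace(int pasid, struct iommu_domain *new, int old_did)
{
        printf("pasid %d: replace did %d -> did %d\n", pasid, old_did, new->did);
        return 0;
}

/* Mirrors the shape of the setup-vs-replace wrappers: NULL old domain
 * means first attach, otherwise the present entry is replaced. */
static int domain_setup(int pasid, struct iommu_domain *new,
                        struct iommu_domain *old)
{
        if (!old)
                return pasid_setup(pasid, new);
        return pasid_replace(pasid, new, old->did);
}

int main(void)
{
        struct iommu_domain a = { .did = 1 }, b = { .did = 2 };

        domain_setup(5, &a, NULL);      /* initial attach of PASID 5 */
        domain_setup(5, &b, &a);        /* replace domain a with b */
        return 0;
}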
Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-8-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 46 +++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 103b109e23a9..d19bd0ff5220 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1752,10 +1752,36 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 intel_context_flush_present(info, context, did, true); } +static int __domain_setup_first_level(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + u16 did, pgd_t *pgd, int flags, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_first_level(iommu, dev, pgd, + pasid, did, flags); + return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did, + iommu_domain_did(old, iommu), + flags); +} + +static int domain_setup_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_second_level(iommu, domain, + dev, pasid); + return intel_pasid_replace_second_level(iommu, domain, dev, + iommu_domain_did(old, iommu), + pasid); +} + static int domain_setup_first_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, - u32 pasid) + u32 pasid, struct iommu_domain *old) { struct dma_pte *pgd = domain->pgd; int level, flags = 0; @@ -1770,9 +1796,9 @@ static int domain_setup_first_level(struct intel_iommu *iommu, if (domain->force_snooping) flags |= PASID_FLAG_PAGE_SNOOP; - return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, - domain_id_iommu(domain, iommu), - flags); + return __domain_setup_first_level(iommu, dev, pasid, + domain_id_iommu(domain, iommu), + (pgd_t *)pgd, flags, old); } static bool dev_is_real_dma_subdevice(struct device *dev) @@ -1804,9 +1830,11 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (!sm_supported(iommu)) ret = domain_context_mapping(domain, dev); else if (domain->use_first_level) - ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID); + ret = domain_setup_first_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); else - ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID); + ret = domain_setup_second_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); if (ret) goto out_block_translation; @@ -4145,10 +4173,10 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (dmar_domain->use_first_level) ret = domain_setup_first_level(iommu, dmar_domain, - dev, pasid); + dev, pasid, old); else - ret = intel_pasid_setup_second_level(iommu, dmar_domain, - dev, pasid); + ret = domain_setup_second_level(iommu, dmar_domain, + dev, pasid, old); if (ret) goto out_remove_dev_pasid; From c33e20869c59c1f78aab11188849dfb15ab412b2 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:59 +0800 Subject: [PATCH 49/54] iommu/vt-d: Limit intel_iommu_set_dev_pasid() for paging domain intel_iommu_set_dev_pasid() is only supposed to be used by paging domain, so limit it. 
Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-9-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index d19bd0ff5220..f3ccbe516dcb 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4154,6 +4154,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, struct dev_pasid_info *dev_pasid; int ret; + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) + return -EINVAL; + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) return -EOPNOTSUPP; @@ -4182,8 +4185,7 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, domain_remove_dev_pasid(old, dev, pasid); - if (domain->type & __IOMMU_DOMAIN_PAGING) - intel_iommu_debugfs_create_dev_pasid(dev_pasid); + intel_iommu_debugfs_create_dev_pasid(dev_pasid); return 0; From cfb31f194a1c4b54ea2109bc0fa03e4ee8d4209b Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:14:00 +0800 Subject: [PATCH 50/54] iommu/vt-d: Make intel_svm_set_dev_pasid() support domain replacement Make intel_svm_set_dev_pasid() support replacement. Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-10-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 8 ++++---- drivers/iommu/intel/iommu.h | 5 +++++ drivers/iommu/intel/svm.c | 5 +++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f3ccbe516dcb..251cfebe6226 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1752,10 +1752,10 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 intel_context_flush_present(info, context, did, true); } -static int __domain_setup_first_level(struct intel_iommu *iommu, - struct device *dev, ioasid_t pasid, - u16 did, pgd_t *pgd, int flags, - struct iommu_domain *old) +int __domain_setup_first_level(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + u16 did, pgd_t *pgd, int flags, + struct iommu_domain *old) { if (!old) return intel_pasid_setup_first_level(iommu, dev, pgd, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index cdca7d5061a7..d23977cc7d90 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1250,6 +1250,11 @@ domain_add_dev_pasid(struct iommu_domain *domain, void domain_remove_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); +int __domain_setup_first_level(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + u16 did, pgd_t *pgd, int flags, + struct iommu_domain *old); + int dmar_ir_support(void); void iommu_flush_write_buffer(struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 6c0685ea8466..f5569347591f 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -127,8 +127,9 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, /* Setup the pasid table: */ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? 
PASID_FLAG_FL5LP : 0; - ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, pasid, - FLPT_DEFAULT_DID, sflags); + ret = __domain_setup_first_level(iommu, dev, pasid, + FLPT_DEFAULT_DID, mm->pgd, + sflags, old); if (ret) goto out_remove_dev_pasid; From 9bc18d283d9a88ab0edc7cc537d24bf534b36110 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:14:01 +0800 Subject: [PATCH 51/54] iommu/vt-d: Make identity_domain_set_dev_pasid() to handle domain replacement Let identity_domain_set_dev_pasid() call the pasid replace helpers hence be able to do domain replacement. Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-11-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 251cfebe6226..ba984cb4aa44 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1778,6 +1778,17 @@ static int domain_setup_second_level(struct intel_iommu *iommu, pasid); } +static int domain_setup_passthrough(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_pass_through(iommu, dev, pasid); + return intel_pasid_replace_pass_through(iommu, dev, + iommu_domain_did(old, iommu), + pasid); +} + static int domain_setup_first_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, @@ -4423,11 +4434,17 @@ static int identity_domain_set_dev_pasid(struct iommu_domain *domain, { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; + int ret; if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) return -EOPNOTSUPP; - return intel_pasid_setup_pass_through(iommu, dev, pasid); + ret = domain_setup_passthrough(iommu, dev, pasid, old); + if (ret) + return ret; + + domain_remove_dev_pasid(old, dev, pasid); + return 0; } static struct iommu_domain identity_domain = { From 67f6f56b59126b3bf4fc6f4ea564e450fcfcf9f6 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:14:02 +0800 Subject: [PATCH 52/54] iommu/vt-d: Add set_dev_pasid callback for nested domain Add intel_nested_set_dev_pasid() to set a nested type domain to a PASID of a device. 
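The new callback follows the same ordering as the other set_dev_pasid paths in this series: attach the bookkeeping for the new domain, program the PASID entry, and only then release the old domain, so a failure leaves the old configuration in place. A minimal standalone sketch of that ordering follows; the types and helpers are stand-ins, not the driver's real code.

#include <stdio.h>
#include <stddef.h>
#include <errno.h>

/* Simplified stand-ins; the real code tracks struct dev_pasid_info on
 * per-domain lists and programs hardware PASID table entries. */
struct domain { const char *name; int attached; };

static int add_dev_pasid(struct domain *d)     { d->attached = 1; return 0; }
static void remove_dev_pasid(struct domain *d) { if (d) d->attached = 0; }
static int program_entry(int fail)             { return fail ? -EINVAL : 0; }

/*
 * Ordering used by the set_dev_pasid callbacks:
 *  1. bookkeeping for the new domain first,
 *  2. program (or replace) the PASID table entry,
 *  3. only on success drop the bookkeeping of the old domain,
 *     so a failure leaves the old configuration intact.
 */
static int set_dev_pasid(struct domain *new, struct domain *old, int fail)
{
        int ret = add_dev_pasid(new);

        if (ret)
                return ret;

        ret = program_entry(fail);
        if (ret) {
                remove_dev_pasid(new);  /* unwind the new domain only */
                return ret;
        }

        remove_dev_pasid(old);          /* old domain released on success */
        return 0;
}

int main(void)
{
        struct domain old = { "old", 1 }, new = { "new", 0 };

        set_dev_pasid(&new, &old, 1);   /* failure: old stays attached */
        printf("after failed replace: old=%d new=%d\n", old.attached, new.attached);

        set_dev_pasid(&new, &old, 0);   /* success: new replaces old */
        printf("after replace:        old=%d new=%d\n", old.attached, new.attached);
        return 0;
}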
Co-developed-by: Lu Baolu Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-12-yi.l.liu@intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 6 ----- drivers/iommu/intel/iommu.h | 7 +++++ drivers/iommu/intel/nested.c | 50 ++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index ba984cb4aa44..b380b38315b2 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1812,12 +1812,6 @@ static int domain_setup_first_level(struct intel_iommu *iommu, (pgd_t *)pgd, flags, old); } -static bool dev_is_real_dma_subdevice(struct device *dev) -{ - return dev && dev_is_pci(dev) && - pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); -} - static int dmar_domain_attach_device(struct dmar_domain *domain, struct device *dev) { diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index d23977cc7d90..2cca094c259d 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -832,6 +833,12 @@ iommu_domain_did(struct iommu_domain *domain, struct intel_iommu *iommu) return domain_id_iommu(to_dmar_domain(domain), iommu); } +static inline bool dev_is_real_dma_subdevice(struct device *dev) +{ + return dev && dev_is_pci(dev) && + pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); +} + /* * 0: readable * 1: writable diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 989ca5cc04eb..42c4533a6ea2 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -130,8 +130,58 @@ out: return ret; } +static int domain_setup_nested(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_nested(iommu, dev, pasid, domain); + return intel_pasid_replace_nested(iommu, dev, pasid, + iommu_domain_did(old, iommu), + domain); +} + +static int intel_nested_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + int ret; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + if (context_copied(iommu, info->bus, info->devfn)) + return -EBUSY; + + ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev); + if (ret) + return ret; + + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); + + ret = domain_setup_nested(iommu, dmar_domain, dev, pasid, old); + if (ret) + goto out_remove_dev_pasid; + + domain_remove_dev_pasid(old, dev, pasid); + + return 0; + +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); + return ret; +} + static const struct iommu_domain_ops intel_nested_domain_ops = { .attach_dev = intel_nested_attach_dev, + .set_dev_pasid = intel_nested_set_dev_pasid, .free = intel_nested_domain_free, .cache_invalidate_user = intel_nested_cache_invalidate_user, }; From e9f1f727e63a512a69f6568f7ed4577c96eef910 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 8 Nov 2024 10:14:03 +0800 Subject: [PATCH 53/54] iommu/arm-smmu-v3: Make set_dev_pasid() op support replace set_dev_pasid() op 
is going to be enhanced to support domain replacement of a pasid. This prepares for this op definition. Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-13-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 2 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 +++------ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 645da7b69bed..1d3e71569775 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -349,7 +349,7 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, * get reassigned */ arm_smmu_make_sva_cd(&target, master, domain->mm, smmu_domain->cd.asid); - ret = arm_smmu_set_pasid(master, smmu_domain, id, &target); + ret = arm_smmu_set_pasid(master, smmu_domain, id, &target, old); mmput(domain->mm); return ret; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 1517fe2c356e..f70165f544df 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2883,7 +2883,7 @@ static int arm_smmu_s1_set_dev_pasid(struct iommu_domain *domain, */ arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); return arm_smmu_set_pasid(master, to_smmu_domain(domain), id, - &target_cd); + &target_cd, old); } static void arm_smmu_update_ste(struct arm_smmu_master *master, @@ -2913,16 +2913,13 @@ static void arm_smmu_update_ste(struct arm_smmu_master *master, int arm_smmu_set_pasid(struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, ioasid_t pasid, - struct arm_smmu_cd *cd) + struct arm_smmu_cd *cd, struct iommu_domain *old) { struct iommu_domain *sid_domain = iommu_get_domain_for_dev(master->dev); struct arm_smmu_attach_state state = { .master = master, - /* - * For now the core code prevents calling this when a domain is - * already attached, no need to set old_domain. - */ .ssid = pasid, + .old_domain = old, }; struct arm_smmu_cd *cdptr; int ret; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 1e9952ca989f..52eaa0bedee1 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -875,7 +875,7 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, int arm_smmu_set_pasid(struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, ioasid_t pasid, - struct arm_smmu_cd *cd); + struct arm_smmu_cd *cd, struct iommu_domain *old); void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid); void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, From 980e3016ebcc00021bf8190c64fd65ba23e90498 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:14:04 +0800 Subject: [PATCH 54/54] iommu: Make set_dev_pasid op support domain replacement The iommu core is going to support domain replacement for pasid, it needs to make the set_dev_pasid op support replacing domain and keep the old domain config in the failure case. AMD iommu driver does not support domain replacement for pasid yet, so it would fail the set_dev_pasid op to keep the old config if the input @old is non-NULL. 
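A driver that does not yet implement replacement can satisfy the new contract simply by rejecting a non-NULL @old, which is what the AMD change below does. A minimal standalone sketch of that shape follows, using stand-in types rather than the real driver structures.

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in types and callback, not the real AMD driver code. */
struct iommu_domain { int id; };
struct device { int id; };

/* A driver without replacement support rejects a non-NULL @old, so the
 * core leaves the PASID in its existing (old) configuration. */
static int example_set_dev_pasid(struct iommu_domain *domain,
                                 struct device *dev, unsigned int pasid,
                                 struct iommu_domain *old)
{
        if (old)
                return -EOPNOTSUPP;

        /* ... initial-setup path would follow here ... */
        return 0;
}

int main(void)
{
        struct iommu_domain new_dom = { 2 }, old_dom = { 1 };
        struct device dev = { 0 };

        printf("initial attach: %d\n",
               example_set_dev_pasid(&new_dom, &dev, 1, NULL));
        printf("replacement:    %d\n",
               example_set_dev_pasid(&new_dom, &dev, 1, &old_dom));
        return 0;
}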
Till now, all the set_dev_pasid callbacks can handle the old parameter and can keep the old config when failed, so update the kdoc of set_dev_pasid op. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-14-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/amd/pasid.c | 3 +++ include/linux/iommu.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/pasid.c b/drivers/iommu/amd/pasid.c index d1dfc745f55e..8c73a30c2800 100644 --- a/drivers/iommu/amd/pasid.c +++ b/drivers/iommu/amd/pasid.c @@ -109,6 +109,9 @@ int iommu_sva_set_dev_pasid(struct iommu_domain *domain, unsigned long flags; int ret = -EINVAL; + if (old) + return -EOPNOTSUPP; + /* PASID zero is used for requests from the I/O device without PASID */ if (!is_pasid_valid(dev_data, pasid)) return ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 32dce80aa7fd..27f923450a7c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -616,7 +616,8 @@ struct iommu_ops { * * EBUSY - device is attached to a domain and cannot be changed * * ENODEV - device specific errors, not able to be attached * * - treated as ENODEV by the caller. Use is discouraged - * @set_dev_pasid: set an iommu domain to a pasid of device + * @set_dev_pasid: set or replace an iommu domain to a pasid of device. The pasid of + * the device should be left in the old config in error case. * @map_pages: map a physically contiguous set of pages of the same size to * an iommu domain. * @unmap_pages: unmap a number of pages of the same size from an iommu domain