diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-11-01 16:44:56 -1000 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-11-01 16:44:56 -1000 |
commit | 463f46e114f74465cf8d01b124e7b74ad1ce2afd (patch) | |
tree | 67ba1f82d7a3baba926eab2a896b0f4c4f3aced2 /drivers/iommu/amd/iommu.c | |
parent | ff269e2cd5adce4ae14f883fc9c8803bc43ee1e9 (diff) | |
parent | b2b67c997bf74453f3469d8b54e4859f190943bd (diff) |
Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd
Pull iommufd updates from Jason Gunthorpe:
"This brings three new iommufd capabilities:
- Dirty tracking for DMA.
AMD/ARM/Intel CPUs can now record if a DMA writes to a page in the
IOPTEs within the IO page table. This can be used to generate a
record of what memory is being dirtied by DMA activities during a
VM migration process. A VMM like qemu will combine the IOMMU dirty
bits with the CPU's dirty log to determine what memory to transfer.
VFIO already has a DMA dirty tracking framework that requires PCI
devices to implement tracking HW internally. The iommufd version
provides an alternative that the VMM can select, if available. The
two are designed to have very similar APIs.
- Userspace controlled attributes for hardware page tables
(HWPT/iommu_domain). There are currently a few generic attributes
for HWPTs (support dirty tracking, and parent of a nest). This is
an entry point for the userspace iommu driver to control the HW in
detail.
- Nested translation support for HWPTs. This is a 2D translation
scheme similar to the CPU where a DMA goes through a first stage to
determine an intermediate address which is then translated trough a
second stage to a physical address.
Like for CPU translation the first stage table would exist in VM
controlled memory and the second stage is in the kernel and matches
the VM's guest to physical map.
As every IOMMU has a unique set of parameter to describe the S1 IO
page table and its associated parameters the userspace IOMMU driver
has to marshal the information into the correct format.
This is 1/3 of the feature, it allows creating the nested
translation and binding it to VFIO devices, however the API to
support IOTLB and ATC invalidation of the stage 1 io page table,
and forwarding of IO faults are still in progress.
The series includes AMD and Intel support for dirty tracking. Intel
support for nested translation.
Along the way are a number of internal items:
- New iommu core items: ops->domain_alloc_user(),
ops->set_dirty_tracking, ops->read_and_clear_dirty(),
IOMMU_DOMAIN_NESTED, and iommu_copy_struct_from_user
- UAF fix in iopt_area_split()
- Spelling fixes and some test suite improvement"
* tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (52 commits)
iommufd: Organize the mock domain alloc functions closer to Joerg's tree
iommufd/selftest: Fix page-size check in iommufd_test_dirty()
iommufd: Add iopt_area_alloc()
iommufd: Fix missing update of domains_itree after splitting iopt_area
iommu/vt-d: Disallow read-only mappings to nest parent domain
iommu/vt-d: Add nested domain allocation
iommu/vt-d: Set the nested domain to a device
iommu/vt-d: Make domain attach helpers to be extern
iommu/vt-d: Add helper to setup pasid nested translation
iommu/vt-d: Add helper for nested domain allocation
iommu/vt-d: Extend dmar_domain to support nested domain
iommufd: Add data structure for Intel VT-d stage-1 domain allocation
iommu/vt-d: Enhance capability check for nested parent domain allocation
iommufd/selftest: Add coverage for IOMMU_HWPT_ALLOC with nested HWPTs
iommufd/selftest: Add nested domain allocation for mock domain
iommu: Add iommu_copy_struct_from_user helper
iommufd: Add a nested HW pagetable object
iommu: Pass in parent domain with user_data to domain_alloc_user op
iommufd: Share iommufd_hwpt_alloc with IOMMUFD_OBJ_HWPT_NESTED
iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable
...
Diffstat (limited to 'drivers/iommu/amd/iommu.c')
-rw-r--r-- | drivers/iommu/amd/iommu.c | 147 |
1 files changed, 144 insertions, 3 deletions
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 95bd7c25ba6f..b399c5741378 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -37,6 +37,7 @@ #include <asm/iommu.h> #include <asm/gart.h> #include <asm/dma.h> +#include <uapi/linux/iommufd.h> #include "amd_iommu.h" #include "../dma-iommu.h" @@ -65,6 +66,7 @@ LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); const struct iommu_ops amd_iommu_ops; +const struct iommu_dirty_ops amd_dirty_ops; static ATOMIC_NOTIFIER_HEAD(ppr_notifier); int amd_iommu_max_glx_val = -1; @@ -1610,6 +1612,9 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid, pte_root |= 1ULL << DEV_ENTRY_PPR; } + if (domain->dirty_tracking) + pte_root |= DTE_FLAG_HAD; + if (domain->flags & PD_IOMMUV2_MASK) { u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl); u64 glx = domain->glx; @@ -2155,28 +2160,79 @@ static inline u64 dma_max_address(void) return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1); } -static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) +static bool amd_iommu_hd_support(struct amd_iommu *iommu) { + return iommu && (iommu->features & FEATURE_HDSUP); +} + +static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, + struct device *dev, u32 flags) +{ + bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; struct protection_domain *domain; + struct amd_iommu *iommu = NULL; + + if (dev) { + iommu = rlookup_amd_iommu(dev); + if (!iommu) + return ERR_PTR(-ENODEV); + } /* * Since DTE[Mode]=0 is prohibited on SNP-enabled system, * default to use IOMMU_DOMAIN_DMA[_FQ]. */ if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY)) - return NULL; + return ERR_PTR(-EINVAL); + + if (dirty_tracking && !amd_iommu_hd_support(iommu)) + return ERR_PTR(-EOPNOTSUPP); domain = protection_domain_alloc(type); if (!domain) - return NULL; + return ERR_PTR(-ENOMEM); domain->domain.geometry.aperture_start = 0; domain->domain.geometry.aperture_end = dma_max_address(); domain->domain.geometry.force_aperture = true; + if (iommu) { + domain->domain.type = type; + domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap; + domain->domain.ops = iommu->iommu.ops->default_domain_ops; + + if (dirty_tracking) + domain->domain.dirty_ops = &amd_dirty_ops; + } + return &domain->domain; } +static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type) +{ + struct iommu_domain *domain; + + domain = do_iommu_domain_alloc(type, NULL, 0); + if (IS_ERR(domain)) + return NULL; + + return domain; +} + +static struct iommu_domain * +amd_iommu_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) + +{ + unsigned int type = IOMMU_DOMAIN_UNMANAGED; + + if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data) + return ERR_PTR(-EOPNOTSUPP); + + return do_iommu_domain_alloc(type, dev, flags); +} + static void amd_iommu_domain_free(struct iommu_domain *dom) { struct protection_domain *domain; @@ -2214,6 +2270,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, dev_data->defer_attach = false; + /* + * Restrict to devices with compatible IOMMU hardware support + * when enforcement of dirty tracking is enabled. + */ + if (dom->dirty_ops && !amd_iommu_hd_support(iommu)) + return -EINVAL; + if (dev_data->domain) detach_device(dev); @@ -2332,6 +2395,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return true; case IOMMU_CAP_DEFERRED_FLUSH: return true; + case IOMMU_CAP_DIRTY_TRACKING: { + struct amd_iommu *iommu = rlookup_amd_iommu(dev); + + return amd_iommu_hd_support(iommu); + } default: break; } @@ -2339,6 +2407,73 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return false; } +static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable) +{ + struct protection_domain *pdomain = to_pdomain(domain); + struct dev_table_entry *dev_table; + struct iommu_dev_data *dev_data; + bool domain_flush = false; + struct amd_iommu *iommu; + unsigned long flags; + u64 pte_root; + + spin_lock_irqsave(&pdomain->lock, flags); + if (!(pdomain->dirty_tracking ^ enable)) { + spin_unlock_irqrestore(&pdomain->lock, flags); + return 0; + } + + list_for_each_entry(dev_data, &pdomain->dev_list, list) { + iommu = rlookup_amd_iommu(dev_data->dev); + if (!iommu) + continue; + + dev_table = get_dev_table(iommu); + pte_root = dev_table[dev_data->devid].data[0]; + + pte_root = (enable ? pte_root | DTE_FLAG_HAD : + pte_root & ~DTE_FLAG_HAD); + + /* Flush device DTE */ + dev_table[dev_data->devid].data[0] = pte_root; + device_flush_dte(dev_data); + domain_flush = true; + } + + /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */ + if (domain_flush) { + amd_iommu_domain_flush_tlb_pde(pdomain); + amd_iommu_domain_flush_complete(pdomain); + } + pdomain->dirty_tracking = enable; + spin_unlock_irqrestore(&pdomain->lock, flags); + + return 0; +} + +static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct protection_domain *pdomain = to_pdomain(domain); + struct io_pgtable_ops *ops = &pdomain->iop.iop.ops; + unsigned long lflags; + + if (!ops || !ops->read_and_clear_dirty) + return -EOPNOTSUPP; + + spin_lock_irqsave(&pdomain->lock, lflags); + if (!pdomain->dirty_tracking && dirty->bitmap) { + spin_unlock_irqrestore(&pdomain->lock, lflags); + return -EINVAL; + } + spin_unlock_irqrestore(&pdomain->lock, lflags); + + return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); +} + static void amd_iommu_get_resv_regions(struct device *dev, struct list_head *head) { @@ -2461,9 +2596,15 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) return true; } +const struct iommu_dirty_ops amd_dirty_ops = { + .set_dirty_tracking = amd_iommu_set_dirty_tracking, + .read_and_clear_dirty = amd_iommu_read_and_clear_dirty, +}; + const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .domain_alloc = amd_iommu_domain_alloc, + .domain_alloc_user = amd_iommu_domain_alloc_user, .probe_device = amd_iommu_probe_device, .release_device = amd_iommu_release_device, .probe_finalize = amd_iommu_probe_finalize, |