summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/userspace-api/iommufd.rst226
-rw-r--r--drivers/acpi/arm64/iort.c13
-rw-r--r--drivers/iommu/Kconfig9
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/Makefile1
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c401
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c139
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h92
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu.c16
-rw-r--r--drivers/iommu/io-pgtable-arm.c27
-rw-r--r--drivers/iommu/iommu.c10
-rw-r--r--drivers/iommu/iommufd/Kconfig4
-rw-r--r--drivers/iommu/iommufd/Makefile6
-rw-r--r--drivers/iommu/iommufd/driver.c53
-rw-r--r--drivers/iommu/iommufd/fault.c9
-rw-r--r--drivers/iommu/iommufd/hw_pagetable.c113
-rw-r--r--drivers/iommu/iommufd/io_pagetable.c105
-rw-r--r--drivers/iommu/iommufd/io_pagetable.h26
-rw-r--r--drivers/iommu/iommufd/ioas.c259
-rw-r--r--drivers/iommu/iommufd/iommufd_private.h58
-rw-r--r--drivers/iommu/iommufd/iommufd_test.h32
-rw-r--r--drivers/iommu/iommufd/main.c65
-rw-r--r--drivers/iommu/iommufd/pages.c319
-rw-r--r--drivers/iommu/iommufd/selftest.c364
-rw-r--r--drivers/iommu/iommufd/vfio_compat.c7
-rw-r--r--drivers/iommu/iommufd/viommu.c157
-rw-r--r--drivers/vfio/vfio_iommu_type1.c12
-rw-r--r--include/acpi/actbl2.h3
-rw-r--r--include/linux/io-pgtable.h2
-rw-r--r--include/linux/iommu.h67
-rw-r--r--include/linux/iommufd.h108
-rw-r--r--include/linux/mm.h1
-rw-r--r--include/uapi/linux/iommufd.h216
-rw-r--r--include/uapi/linux/vfio.h2
-rw-r--r--mm/gup.c24
-rw-r--r--tools/testing/selftests/iommu/Makefile1
-rw-r--r--tools/testing/selftests/iommu/iommufd.c606
-rw-r--r--tools/testing/selftests/iommu/iommufd_fail_nth.c54
-rw-r--r--tools/testing/selftests/iommu/iommufd_utils.h174
38 files changed, 3354 insertions, 427 deletions
diff --git a/Documentation/userspace-api/iommufd.rst b/Documentation/userspace-api/iommufd.rst
index aa004faed5fd..70289d6815d2 100644
--- a/Documentation/userspace-api/iommufd.rst
+++ b/Documentation/userspace-api/iommufd.rst
@@ -41,46 +41,133 @@ Following IOMMUFD objects are exposed to userspace:
- IOMMUFD_OBJ_DEVICE, representing a device that is bound to iommufd by an
external driver.
-- IOMMUFD_OBJ_HW_PAGETABLE, representing an actual hardware I/O page table
- (i.e. a single struct iommu_domain) managed by the iommu driver.
-
- The IOAS has a list of HW_PAGETABLES that share the same IOVA mapping and
- it will synchronize its mapping with each member HW_PAGETABLE.
+- IOMMUFD_OBJ_HWPT_PAGING, representing an actual hardware I/O page table
+ (i.e. a single struct iommu_domain) managed by the iommu driver. "PAGING"
+ primarly indicates this type of HWPT should be linked to an IOAS. It also
+ indicates that it is backed by an iommu_domain with __IOMMU_DOMAIN_PAGING
+ feature flag. This can be either an UNMANAGED stage-1 domain for a device
+ running in the user space, or a nesting parent stage-2 domain for mappings
+ from guest-level physical addresses to host-level physical addresses.
+
+ The IOAS has a list of HWPT_PAGINGs that share the same IOVA mapping and
+ it will synchronize its mapping with each member HWPT_PAGING.
+
+- IOMMUFD_OBJ_HWPT_NESTED, representing an actual hardware I/O page table
+ (i.e. a single struct iommu_domain) managed by user space (e.g. guest OS).
+ "NESTED" indicates that this type of HWPT should be linked to an HWPT_PAGING.
+ It also indicates that it is backed by an iommu_domain that has a type of
+ IOMMU_DOMAIN_NESTED. This must be a stage-1 domain for a device running in
+ the user space (e.g. in a guest VM enabling the IOMMU nested translation
+ feature.) As such, it must be created with a given nesting parent stage-2
+ domain to associate to. This nested stage-1 page table managed by the user
+ space usually has mappings from guest-level I/O virtual addresses to guest-
+ level physical addresses.
+
+- IOMMUFD_OBJ_VIOMMU, representing a slice of the physical IOMMU instance,
+ passed to or shared with a VM. It may be some HW-accelerated virtualization
+ features and some SW resources used by the VM. For examples:
+
+ * Security namespace for guest owned ID, e.g. guest-controlled cache tags
+ * Non-device-affiliated event reporting, e.g. invalidation queue errors
+ * Access to a sharable nesting parent pagetable across physical IOMMUs
+ * Virtualization of various platforms IDs, e.g. RIDs and others
+ * Delivery of paravirtualized invalidation
+ * Direct assigned invalidation queues
+ * Direct assigned interrupts
+
+ Such a vIOMMU object generally has the access to a nesting parent pagetable
+ to support some HW-accelerated virtualization features. So, a vIOMMU object
+ must be created given a nesting parent HWPT_PAGING object, and then it would
+ encapsulate that HWPT_PAGING object. Therefore, a vIOMMU object can be used
+ to allocate an HWPT_NESTED object in place of the encapsulated HWPT_PAGING.
+
+ .. note::
+
+ The name "vIOMMU" isn't necessarily identical to a virtualized IOMMU in a
+ VM. A VM can have one giant virtualized IOMMU running on a machine having
+ multiple physical IOMMUs, in which case the VMM will dispatch the requests
+ or configurations from this single virtualized IOMMU instance to multiple
+ vIOMMU objects created for individual slices of different physical IOMMUs.
+ In other words, a vIOMMU object is always a representation of one physical
+ IOMMU, not necessarily of a virtualized IOMMU. For VMMs that want the full
+ virtualization features from physical IOMMUs, it is suggested to build the
+ same number of virtualized IOMMUs as the number of physical IOMMUs, so the
+ passed-through devices would be connected to their own virtualized IOMMUs
+ backed by corresponding vIOMMU objects, in which case a guest OS would do
+ the "dispatch" naturally instead of VMM trappings.
+
+- IOMMUFD_OBJ_VDEVICE, representing a virtual device for an IOMMUFD_OBJ_DEVICE
+ against an IOMMUFD_OBJ_VIOMMU. This virtual device holds the device's virtual
+ information or attributes (related to the vIOMMU) in a VM. An immediate vDATA
+ example can be the virtual ID of the device on a vIOMMU, which is a unique ID
+ that VMM assigns to the device for a translation channel/port of the vIOMMU,
+ e.g. vSID of ARM SMMUv3, vDeviceID of AMD IOMMU, and vRID of Intel VT-d to a
+ Context Table. Potential use cases of some advanced security information can
+ be forwarded via this object too, such as security level or realm information
+ in a Confidential Compute Architecture. A VMM should create a vDEVICE object
+ to forward all the device information in a VM, when it connects a device to a
+ vIOMMU, which is a separate ioctl call from attaching the same device to an
+ HWPT_PAGING that the vIOMMU holds.
All user-visible objects are destroyed via the IOMMU_DESTROY uAPI.
-The diagram below shows relationship between user-visible objects and kernel
+The diagrams below show relationships between user-visible objects and kernel
datastructures (external to iommufd), with numbers referred to operations
creating the objects and links::
- _________________________________________________________
- | iommufd |
- | [1] |
- | _________________ |
- | | | |
- | | | |
- | | | |
- | | | |
- | | | |
- | | | |
- | | | [3] [2] |
- | | | ____________ __________ |
- | | IOAS |<--| |<------| | |
- | | | |HW_PAGETABLE| | DEVICE | |
- | | | |____________| |__________| |
- | | | | | |
- | | | | | |
- | | | | | |
- | | | | | |
- | | | | | |
- | |_________________| | | |
- | | | | |
- |_________|___________________|___________________|_______|
- | | |
- | _____v______ _______v_____
- | PFN storage | | | |
- |------------>|iommu_domain| |struct device|
- |____________| |_____________|
+ _______________________________________________________________________
+ | iommufd (HWPT_PAGING only) |
+ | |
+ | [1] [3] [2] |
+ | ________________ _____________ ________ |
+ | | | | | | | |
+ | | IOAS |<---| HWPT_PAGING |<---------------------| DEVICE | |
+ | |________________| |_____________| |________| |
+ | | | | |
+ |_________|____________________|__________________________________|_____|
+ | | |
+ | ______v_____ ___v__
+ | PFN storage | (paging) | |struct|
+ |------------>|iommu_domain|<-----------------------|device|
+ |____________| |______|
+
+ _______________________________________________________________________
+ | iommufd (with HWPT_NESTED) |
+ | |
+ | [1] [3] [4] [2] |
+ | ________________ _____________ _____________ ________ |
+ | | | | | | | | | |
+ | | IOAS |<---| HWPT_PAGING |<---| HWPT_NESTED |<--| DEVICE | |
+ | |________________| |_____________| |_____________| |________| |
+ | | | | | |
+ |_________|____________________|__________________|_______________|_____|
+ | | | |
+ | ______v_____ ______v_____ ___v__
+ | PFN storage | (paging) | | (nested) | |struct|
+ |------------>|iommu_domain|<----|iommu_domain|<----|device|
+ |____________| |____________| |______|
+
+ _______________________________________________________________________
+ | iommufd (with vIOMMU/vDEVICE) |
+ | |
+ | [5] [6] |
+ | _____________ _____________ |
+ | | | | | |
+ | |----------------| vIOMMU |<---| vDEVICE |<----| |
+ | | | | |_____________| | |
+ | | | | | |
+ | | [1] | | [4] | [2] |
+ | | ______ | | _____________ _|______ |
+ | | | | | [3] | | | | | |
+ | | | IOAS |<---|(HWPT_PAGING)|<---| HWPT_NESTED |<--| DEVICE | |
+ | | |______| |_____________| |_____________| |________| |
+ | | | | | | |
+ |______|________|______________|__________________|_______________|_____|
+ | | | | |
+ ______v_____ | ______v_____ ______v_____ ___v__
+ | struct | | PFN | (paging) | | (nested) | |struct|
+ |iommu_device| |------>|iommu_domain|<----|iommu_domain|<----|device|
+ |____________| storage|____________| |____________| |______|
1. IOMMUFD_OBJ_IOAS is created via the IOMMU_IOAS_ALLOC uAPI. An iommufd can
hold multiple IOAS objects. IOAS is the most generic object and does not
@@ -94,21 +181,63 @@ creating the objects and links::
device. The driver must also set the driver_managed_dma flag and must not
touch the device until this operation succeeds.
-3. IOMMUFD_OBJ_HW_PAGETABLE is created when an external driver calls the IOMMUFD
- kAPI to attach a bound device to an IOAS. Similarly the external driver uAPI
- allows userspace to initiate the attaching operation. If a compatible
- pagetable already exists then it is reused for the attachment. Otherwise a
- new pagetable object and iommu_domain is created. Successful completion of
- this operation sets up the linkages among IOAS, device and iommu_domain. Once
- this completes the device could do DMA.
-
- Every iommu_domain inside the IOAS is also represented to userspace as a
- HW_PAGETABLE object.
+3. IOMMUFD_OBJ_HWPT_PAGING can be created in two ways:
+
+ * IOMMUFD_OBJ_HWPT_PAGING is automatically created when an external driver
+ calls the IOMMUFD kAPI to attach a bound device to an IOAS. Similarly the
+ external driver uAPI allows userspace to initiate the attaching operation.
+ If a compatible member HWPT_PAGING object exists in the IOAS's HWPT_PAGING
+ list, then it will be reused. Otherwise a new HWPT_PAGING that represents
+ an iommu_domain to userspace will be created, and then added to the list.
+ Successful completion of this operation sets up the linkages among IOAS,
+ device and iommu_domain. Once this completes the device could do DMA.
+
+ * IOMMUFD_OBJ_HWPT_PAGING can be manually created via the IOMMU_HWPT_ALLOC
+ uAPI, provided an ioas_id via @pt_id to associate the new HWPT_PAGING to
+ the corresponding IOAS object. The benefit of this manual allocation is to
+ allow allocation flags (defined in enum iommufd_hwpt_alloc_flags), e.g. it
+ allocates a nesting parent HWPT_PAGING if the IOMMU_HWPT_ALLOC_NEST_PARENT
+ flag is set.
+
+4. IOMMUFD_OBJ_HWPT_NESTED can be only manually created via the IOMMU_HWPT_ALLOC
+ uAPI, provided an hwpt_id or a viommu_id of a vIOMMU object encapsulating a
+ nesting parent HWPT_PAGING via @pt_id to associate the new HWPT_NESTED object
+ to the corresponding HWPT_PAGING object. The associating HWPT_PAGING object
+ must be a nesting parent manually allocated via the same uAPI previously with
+ an IOMMU_HWPT_ALLOC_NEST_PARENT flag, otherwise the allocation will fail. The
+ allocation will be further validated by the IOMMU driver to ensure that the
+ nesting parent domain and the nested domain being allocated are compatible.
+ Successful completion of this operation sets up linkages among IOAS, device,
+ and iommu_domains. Once this completes the device could do DMA via a 2-stage
+ translation, a.k.a nested translation. Note that multiple HWPT_NESTED objects
+ can be allocated by (and then associated to) the same nesting parent.
.. note::
- Future IOMMUFD updates will provide an API to create and manipulate the
- HW_PAGETABLE directly.
+ Either a manual IOMMUFD_OBJ_HWPT_PAGING or an IOMMUFD_OBJ_HWPT_NESTED is
+ created via the same IOMMU_HWPT_ALLOC uAPI. The difference is at the type
+ of the object passed in via the @pt_id field of struct iommufd_hwpt_alloc.
+
+5. IOMMUFD_OBJ_VIOMMU can be only manually created via the IOMMU_VIOMMU_ALLOC
+ uAPI, provided a dev_id (for the device's physical IOMMU to back the vIOMMU)
+ and an hwpt_id (to associate the vIOMMU to a nesting parent HWPT_PAGING). The
+ iommufd core will link the vIOMMU object to the struct iommu_device that the
+ struct device is behind. And an IOMMU driver can implement a viommu_alloc op
+ to allocate its own vIOMMU data structure embedding the core-level structure
+ iommufd_viommu and some driver-specific data. If necessary, the driver can
+ also configure its HW virtualization feature for that vIOMMU (and thus for
+ the VM). Successful completion of this operation sets up the linkages between
+ the vIOMMU object and the HWPT_PAGING, then this vIOMMU object can be used
+ as a nesting parent object to allocate an HWPT_NESTED object described above.
+
+6. IOMMUFD_OBJ_VDEVICE can be only manually created via the IOMMU_VDEVICE_ALLOC
+ uAPI, provided a viommu_id for an iommufd_viommu object and a dev_id for an
+ iommufd_device object. The vDEVICE object will be the binding between these
+ two parent objects. Another @virt_id will be also set via the uAPI providing
+ the iommufd core an index to store the vDEVICE object to a vDEVICE array per
+ vIOMMU. If necessary, the IOMMU driver may choose to implement a vdevce_alloc
+ op to init its HW for virtualization feature related to a vDEVICE. Successful
+ completion of this operation sets up the linkages between vIOMMU and device.
A device can only bind to an iommufd due to DMA ownership claim and attach to at
most one IOAS object (no support of PASID yet).
@@ -120,7 +249,10 @@ User visible objects are backed by following datastructures:
- iommufd_ioas for IOMMUFD_OBJ_IOAS.
- iommufd_device for IOMMUFD_OBJ_DEVICE.
-- iommufd_hw_pagetable for IOMMUFD_OBJ_HW_PAGETABLE.
+- iommufd_hwpt_paging for IOMMUFD_OBJ_HWPT_PAGING.
+- iommufd_hwpt_nested for IOMMUFD_OBJ_HWPT_NESTED.
+- iommufd_viommu for IOMMUFD_OBJ_VIOMMU.
+- iommufd_vdevice for IOMMUFD_OBJ_VDEVICE.
Several terminologies when looking at these datastructures:
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 4c745a26226b..1f7e4c691d9e 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1218,6 +1218,17 @@ static bool iort_pci_rc_supports_ats(struct acpi_iort_node *node)
return pci_rc->ats_attribute & ACPI_IORT_ATS_SUPPORTED;
}
+static bool iort_pci_rc_supports_canwbs(struct acpi_iort_node *node)
+{
+ struct acpi_iort_memory_access *memory_access;
+ struct acpi_iort_root_complex *pci_rc;
+
+ pci_rc = (struct acpi_iort_root_complex *)node->node_data;
+ memory_access =
+ (struct acpi_iort_memory_access *)&pci_rc->memory_properties;
+ return memory_access->memory_flags & ACPI_IORT_MF_CANWBS;
+}
+
static int iort_iommu_xlate(struct device *dev, struct acpi_iort_node *node,
u32 streamid)
{
@@ -1335,6 +1346,8 @@ int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
fwspec = dev_iommu_fwspec_get(dev);
if (fwspec && iort_pci_rc_supports_ats(node))
fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS;
+ if (fwspec && iort_pci_rc_supports_canwbs(node))
+ fwspec->flags |= IOMMU_FWSPEC_PCI_RC_CANWBS;
} else {
node = iort_scan_node(ACPI_IORT_NODE_NAMED_COMPONENT,
iort_match_node_callback, dev);
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index b3aa1f5d5321..0c9bceb1653d 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -415,6 +415,15 @@ config ARM_SMMU_V3_SVA
Say Y here if your system supports SVA extensions such as PCIe PASID
and PRI.
+config ARM_SMMU_V3_IOMMUFD
+ bool "Enable IOMMUFD features for ARM SMMUv3 (EXPERIMENTAL)"
+ depends on IOMMUFD
+ help
+ Support for IOMMUFD features intended to support virtual machines
+ with accelerated virtual IOMMUs.
+
+ Say Y here if you are doing development and testing on this feature.
+
config ARM_SMMU_V3_KUNIT_TEST
tristate "KUnit tests for arm-smmu-v3 driver" if !KUNIT_ALL_TESTS
depends on KUNIT
diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile
index dc98c88b48c8..493a659cc66b 100644
--- a/drivers/iommu/arm/arm-smmu-v3/Makefile
+++ b/drivers/iommu/arm/arm-smmu-v3/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o
arm_smmu_v3-y := arm-smmu-v3.o
+arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_IOMMUFD) += arm-smmu-v3-iommufd.o
arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o
arm_smmu_v3-$(CONFIG_TEGRA241_CMDQV) += tegra241-cmdqv.o
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
new file mode 100644
index 000000000000..6cc14d82399f
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -0,0 +1,401 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+
+#include <uapi/linux/iommufd.h>
+
+#include "arm-smmu-v3.h"
+
+void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type)
+{
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct iommu_hw_info_arm_smmuv3 *info;
+ u32 __iomem *base_idr;
+ unsigned int i;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return ERR_PTR(-ENOMEM);
+
+ base_idr = master->smmu->base + ARM_SMMU_IDR0;
+ for (i = 0; i <= 5; i++)
+ info->idr[i] = readl_relaxed(base_idr + i);
+ info->iidr = readl_relaxed(master->smmu->base + ARM_SMMU_IIDR);
+ info->aidr = readl_relaxed(master->smmu->base + ARM_SMMU_AIDR);
+
+ *length = sizeof(*info);
+ *type = IOMMU_HW_INFO_TYPE_ARM_SMMUV3;
+
+ return info;
+}
+
+static void arm_smmu_make_nested_cd_table_ste(
+ struct arm_smmu_ste *target, struct arm_smmu_master *master,
+ struct arm_smmu_nested_domain *nested_domain, bool ats_enabled)
+{
+ arm_smmu_make_s2_domain_ste(
+ target, master, nested_domain->vsmmu->s2_parent, ats_enabled);
+
+ target->data[0] = cpu_to_le64(STRTAB_STE_0_V |
+ FIELD_PREP(STRTAB_STE_0_CFG,
+ STRTAB_STE_0_CFG_NESTED));
+ target->data[0] |= nested_domain->ste[0] &
+ ~cpu_to_le64(STRTAB_STE_0_CFG);
+ target->data[1] |= nested_domain->ste[1];
+}
+
+/*
+ * Create a physical STE from the virtual STE that userspace provided when it
+ * created the nested domain. Using the vSTE userspace can request:
+ * - Non-valid STE
+ * - Abort STE
+ * - Bypass STE (install the S2, no CD table)
+ * - CD table STE (install the S2 and the userspace CD table)
+ */
+static void arm_smmu_make_nested_domain_ste(
+ struct arm_smmu_ste *target, struct arm_smmu_master *master,
+ struct arm_smmu_nested_domain *nested_domain, bool ats_enabled)
+{
+ unsigned int cfg =
+ FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0]));
+
+ /*
+ * Userspace can request a non-valid STE through the nesting interface.
+ * We relay that into an abort physical STE with the intention that
+ * C_BAD_STE for this SID can be generated to userspace.
+ */
+ if (!(nested_domain->ste[0] & cpu_to_le64(STRTAB_STE_0_V)))
+ cfg = STRTAB_STE_0_CFG_ABORT;
+
+ switch (cfg) {
+ case STRTAB_STE_0_CFG_S1_TRANS:
+ arm_smmu_make_nested_cd_table_ste(target, master, nested_domain,
+ ats_enabled);
+ break;
+ case STRTAB_STE_0_CFG_BYPASS:
+ arm_smmu_make_s2_domain_ste(target, master,
+ nested_domain->vsmmu->s2_parent,
+ ats_enabled);
+ break;
+ case STRTAB_STE_0_CFG_ABORT:
+ default:
+ arm_smmu_make_abort_ste(target);
+ break;
+ }
+}
+
+static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct arm_smmu_nested_domain *nested_domain =
+ to_smmu_nested_domain(domain);
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct arm_smmu_attach_state state = {
+ .master = master,
+ .old_domain = iommu_get_domain_for_dev(dev),
+ .ssid = IOMMU_NO_PASID,
+ };
+ struct arm_smmu_ste ste;
+ int ret;
+
+ if (nested_domain->vsmmu->smmu != master->smmu)
+ return -EINVAL;
+ if (arm_smmu_ssids_in_use(&master->cd_table))
+ return -EBUSY;
+
+ mutex_lock(&arm_smmu_asid_lock);
+ /*
+ * The VM has to control the actual ATS state at the PCI device because
+ * we forward the invalidations directly from the VM. If the VM doesn't
+ * think ATS is on it will not generate ATC flushes and the ATC will
+ * become incoherent. Since we can't access the actual virtual PCI ATS
+ * config bit here base this off the EATS value in the STE. If the EATS
+ * is set then the VM must generate ATC flushes.
+ */
+ state.disable_ats = !nested_domain->enable_ats;
+ ret = arm_smmu_attach_prepare(&state, domain);
+ if (ret) {
+ mutex_unlock(&arm_smmu_asid_lock);
+ return ret;
+ }
+
+ arm_smmu_make_nested_domain_ste(&ste, master, nested_domain,
+ state.ats_enabled);
+ arm_smmu_install_ste_for_dev(master, &ste);
+ arm_smmu_attach_commit(&state);
+ mutex_unlock(&arm_smmu_asid_lock);
+ return 0;
+}
+
+static void arm_smmu_domain_nested_free(struct iommu_domain *domain)
+{
+ kfree(to_smmu_nested_domain(domain));
+}
+
+static const struct iommu_domain_ops arm_smmu_nested_ops = {
+ .attach_dev = arm_smmu_attach_dev_nested,
+ .free = arm_smmu_domain_nested_free,
+};
+
+static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg,
+ bool *enable_ats)
+{
+ unsigned int eats;
+ unsigned int cfg;
+
+ if (!(arg->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) {
+ memset(arg->ste, 0, sizeof(arg->ste));
+ return 0;
+ }
+
+ /* EIO is reserved for invalid STE data. */
+ if ((arg->ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) ||
+ (arg->ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED))
+ return -EIO;
+
+ cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(arg->ste[0]));
+ if (cfg != STRTAB_STE_0_CFG_ABORT && cfg != STRTAB_STE_0_CFG_BYPASS &&
+ cfg != STRTAB_STE_0_CFG_S1_TRANS)
+ return -EIO;
+
+ /*
+ * Only Full ATS or ATS UR is supported
+ * The EATS field will be set by arm_smmu_make_nested_domain_ste()
+ */
+ eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg->ste[1]));
+ arg->ste[1] &= ~cpu_to_le64(STRTAB_STE_1_EATS);
+ if (eats != STRTAB_STE_1_EATS_ABT && eats != STRTAB_STE_1_EATS_TRANS)
+ return -EIO;
+
+ if (cfg == STRTAB_STE_0_CFG_S1_TRANS)
+ *enable_ats = (eats == STRTAB_STE_1_EATS_TRANS);
+ return 0;
+}
+
+static struct iommu_domain *
+arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
+ const struct iommu_user_data *user_data)
+{
+ struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core);
+ const u32 SUPPORTED_FLAGS = IOMMU_HWPT_FAULT_ID_VALID;
+ struct arm_smmu_nested_domain *nested_domain;
+ struct iommu_hwpt_arm_smmuv3 arg;
+ bool enable_ats = false;
+ int ret;
+
+ /*
+ * Faults delivered to the nested domain are faults that originated by
+ * the S1 in the domain. The core code will match all PASIDs when
+ * delivering the fault due to user_pasid_table
+ */
+ if (flags & ~SUPPORTED_FLAGS)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ ret = iommu_copy_struct_from_user(&arg, user_data,
+ IOMMU_HWPT_DATA_ARM_SMMUV3, ste);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = arm_smmu_validate_vste(&arg, &enable_ats);
+ if (ret)
+ return ERR_PTR(ret);
+
+ nested_domain = kzalloc(sizeof(*nested_domain), GFP_KERNEL_ACCOUNT);
+ if (!nested_domain)
+ return ERR_PTR(-ENOMEM);
+
+ nested_domain->domain.type = IOMMU_DOMAIN_NESTED;
+ nested_domain->domain.ops = &arm_smmu_nested_ops;
+ nested_domain->enable_ats = enable_ats;
+ nested_domain->vsmmu = vsmmu;
+ nested_domain->ste[0] = arg.ste[0];
+ nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS);
+
+ return &nested_domain->domain;
+}
+
+static int arm_vsmmu_vsid_to_sid(struct arm_vsmmu *vsmmu, u32 vsid, u32 *sid)
+{
+ struct arm_smmu_master *master;
+ struct device *dev;
+ int ret = 0;
+
+ xa_lock(&vsmmu->core.vdevs);
+ dev = iommufd_viommu_find_dev(&vsmmu->core, (unsigned long)vsid);
+ if (!dev) {
+ ret = -EIO;
+ goto unlock;
+ }
+ master = dev_iommu_priv_get(dev);
+
+ /* At this moment, iommufd only supports PCI device that has one SID */
+ if (sid)
+ *sid = master->streams[0].id;
+unlock:
+ xa_unlock(&vsmmu->core.vdevs);
+ return ret;
+}
+
+/* This is basically iommu_viommu_arm_smmuv3_invalidate in u64 for conversion */
+struct arm_vsmmu_invalidation_cmd {
+ union {
+ u64 cmd[2];
+ struct iommu_viommu_arm_smmuv3_invalidate ucmd;
+ };
+};
+
+/*
+ * Convert, in place, the raw invalidation command into an internal format that
+ * can be passed to arm_smmu_cmdq_issue_cmdlist(). Internally commands are
+ * stored in CPU endian.
+ *
+ * Enforce the VMID or SID on the command.
+ */
+static int arm_vsmmu_convert_user_cmd(struct arm_vsmmu *vsmmu,
+ struct arm_vsmmu_invalidation_cmd *cmd)
+{
+ /* Commands are le64 stored in u64 */
+ cmd->cmd[0] = le64_to_cpu(cmd->ucmd.cmd[0]);
+ cmd->cmd[1] = le64_to_cpu(cmd->ucmd.cmd[1]);
+
+ switch (cmd->cmd[0] & CMDQ_0_OP) {
+ case CMDQ_OP_TLBI_NSNH_ALL:
+ /* Convert to NH_ALL */
+ cmd->cmd[0] = CMDQ_OP_TLBI_NH_ALL |
+ FIELD_PREP(CMDQ_TLBI_0_VMID, vsmmu->vmid);
+ cmd->cmd[1] = 0;
+ break;
+ case CMDQ_OP_TLBI_NH_VA:
+ case CMDQ_OP_TLBI_NH_VAA:
+ case CMDQ_OP_TLBI_NH_ALL:
+ case CMDQ_OP_TLBI_NH_ASID:
+ cmd->cmd[0] &= ~CMDQ_TLBI_0_VMID;
+ cmd->cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, vsmmu->vmid);
+ break;
+ case CMDQ_OP_ATC_INV:
+ case CMDQ_OP_CFGI_CD:
+ case CMDQ_OP_CFGI_CD_ALL: {
+ u32 sid, vsid = FIELD_GET(CMDQ_CFGI_0_SID, cmd->cmd[0]);
+
+ if (arm_vsmmu_vsid_to_sid(vsmmu, vsid, &sid))
+ return -EIO;
+ cmd->cmd[0] &= ~CMDQ_CFGI_0_SID;
+ cmd->cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, sid);
+ break;
+ }
+ default:
+ return -EIO;
+ }
+ return 0;
+}
+
+static int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu,
+ struct iommu_user_data_array *array)
+{
+ struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core);
+ struct arm_smmu_device *smmu = vsmmu->smmu;
+ struct arm_vsmmu_invalidation_cmd *last;
+ struct arm_vsmmu_invalidation_cmd *cmds;
+ struct arm_vsmmu_invalidation_cmd *cur;
+ struct arm_vsmmu_invalidation_cmd *end;
+ int ret;
+
+ cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL);
+ if (!cmds)
+ return -ENOMEM;
+ cur = cmds;
+ end = cmds + array->entry_num;
+
+ static_assert(sizeof(*cmds) == 2 * sizeof(u64));
+ ret = iommu_copy_struct_from_full_user_array(
+ cmds, sizeof(*cmds), array,
+ IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3);
+ if (ret)
+ goto out;
+
+ last = cmds;
+ while (cur != end) {
+ ret = arm_vsmmu_convert_user_cmd(vsmmu, cur);
+ if (ret)
+ goto out;
+
+ /* FIXME work in blocks of CMDQ_BATCH_ENTRIES and copy each block? */
+ cur++;
+ if (cur != end && (cur - last) != CMDQ_BATCH_ENTRIES - 1)
+ continue;
+
+ /* FIXME always uses the main cmdq rather than trying to group by type */
+ ret = arm_smmu_cmdq_issue_cmdlist(smmu, &smmu->cmdq, last->cmd,
+ cur - last, true);
+ if (ret) {
+ cur--;
+ goto out;
+ }
+ last = cur;
+ }
+out:
+ array->entry_num = cur - cmds;
+ kfree(cmds);
+ return ret;
+}
+
+static const struct iommufd_viommu_ops arm_vsmmu_ops = {
+ .alloc_domain_nested = arm_vsmmu_alloc_domain_nested,
+ .cache_invalidate = arm_vsmmu_cache_invalidate,
+};
+
+struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev,
+ struct iommu_domain *parent,
+ struct iommufd_ctx *ictx,
+ unsigned int viommu_type)
+{
+ struct arm_smmu_device *smmu =
+ iommu_get_iommu_dev(dev, struct arm_smmu_device, iommu);
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct arm_smmu_domain *s2_parent = to_smmu_domain(parent);
+ struct arm_vsmmu *vsmmu;
+
+ if (viommu_type != IOMMU_VIOMMU_TYPE_ARM_SMMUV3)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (!(smmu->features & ARM_SMMU_FEAT_NESTING))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (s2_parent->smmu != master->smmu)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * FORCE_SYNC is not set with FEAT_NESTING. Some study of the exact HW
+ * defect is needed to determine if arm_vsmmu_cache_invalidate() needs
+ * any change to remove this.
+ */
+ if (WARN_ON(smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ /*
+ * Must support some way to prevent the VM from bypassing the cache
+ * because VFIO currently does not do any cache maintenance. canwbs
+ * indicates the device is fully coherent and no cache maintenance is
+ * ever required, even for PCI No-Snoop. S2FWB means the S1 can't make
+ * things non-coherent using the memattr, but No-Snoop behavior is not
+ * effected.
+ */
+ if (!arm_smmu_master_canwbs(master) &&
+ !(smmu->features & ARM_SMMU_FEAT_S2FWB))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ vsmmu = iommufd_viommu_alloc(ictx, struct arm_vsmmu, core,
+ &arm_vsmmu_ops);
+ if (IS_ERR(vsmmu))
+ return ERR_CAST(vsmmu);
+
+ vsmmu->smmu = smmu;
+ vsmmu->s2_parent = s2_parent;
+ /* FIXME Move VMID allocation from the S2 domain allocation to here */
+ vsmmu->vmid = s2_parent->s2_cfg.vmid;
+
+ return &vsmmu->core;
+}
+
+MODULE_IMPORT_NS(IOMMUFD);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 353fea58cd31..10b4dbc8d027 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -295,6 +295,7 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
case CMDQ_OP_TLBI_NH_ASID:
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
fallthrough;
+ case CMDQ_OP_TLBI_NH_ALL:
case CMDQ_OP_TLBI_S12_VMALL:
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
break;
@@ -765,9 +766,9 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
* insert their own list of commands then all of the commands from one
* CPU will appear before any of the commands from the other CPU.
*/
-static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
- struct arm_smmu_cmdq *cmdq,
- u64 *cmds, int n, bool sync)
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq, u64 *cmds, int n,
+ bool sync)
{
u64 cmd_sync[CMDQ_ENT_DWORDS];
u32 prod;
@@ -1045,7 +1046,8 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits)
/* S2 translates */
if (cfg & BIT(1)) {
used_bits[1] |=
- cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG);
+ cpu_to_le64(STRTAB_STE_1_S2FWB | STRTAB_STE_1_EATS |
+ STRTAB_STE_1_SHCFG);
used_bits[2] |=
cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR |
STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI |
@@ -1549,7 +1551,6 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
}
}
-VISIBLE_IF_KUNIT
void arm_smmu_make_abort_ste(struct arm_smmu_ste *target)
{
memset(target, 0, sizeof(*target));
@@ -1632,7 +1633,6 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
}
EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_cdtable_ste);
-VISIBLE_IF_KUNIT
void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
struct arm_smmu_master *master,
struct arm_smmu_domain *smmu_domain,
@@ -1655,6 +1655,8 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
FIELD_PREP(STRTAB_STE_1_EATS,
ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0));
+ if (pgtbl_cfg->quirks & IO_PGTABLE_QUIRK_ARM_S2FWB)
+ target->data[1] |= cpu_to_le64(STRTAB_STE_1_S2FWB);
if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR)
target->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
STRTAB_STE_1_SHCFG_INCOMING));
@@ -2105,7 +2107,16 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
if (!master->ats_enabled)
continue;
- arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, &cmd);
+ if (master_domain->nested_ats_flush) {
+ /*
+ * If a S2 used as a nesting parent is changed we have
+ * no option but to completely flush the ATC.
+ */
+ arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd);
+ } else {
+ arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size,
+ &cmd);
+ }
for (i = 0; i < master->num_streams; i++) {
cmd.atc.sid = master->streams[i].id;
@@ -2232,6 +2243,15 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
}
__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
+ if (smmu_domain->nest_parent) {
+ /*
+ * When the S2 domain changes all the nested S1 ASIDs have to be
+ * flushed too.
+ */
+ cmd.opcode = CMDQ_OP_TLBI_NH_ALL;
+ arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd);
+ }
+
/*
* Unfortunately, this can't be leaf-only since we may have
* zapped an entire table.
@@ -2293,6 +2313,8 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap)
case IOMMU_CAP_CACHE_COHERENCY:
/* Assume that a coherent TCU implies coherent TBUs */
return master->smmu->features & ARM_SMMU_FEAT_COHERENCY;
+ case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
+ return arm_smmu_master_canwbs(master);
case IOMMU_CAP_NOEXEC:
case IOMMU_CAP_DEFERRED_FLUSH:
return true;
@@ -2303,6 +2325,26 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap)
}
}
+static bool arm_smmu_enforce_cache_coherency(struct iommu_domain *domain)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct arm_smmu_master_domain *master_domain;
+ unsigned long flags;
+ bool ret = true;
+
+ spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+ list_for_each_entry(master_domain, &smmu_domain->devices,
+ devices_elm) {
+ if (!arm_smmu_master_canwbs(master_domain->master)) {
+ ret = false;
+ break;
+ }
+ }
+ smmu_domain->enforce_cache_coherency = ret;
+ spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+ return ret;
+}
+
struct arm_smmu_domain *arm_smmu_domain_alloc(void)
{
struct arm_smmu_domain *smmu_domain;
@@ -2442,6 +2484,9 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain,
pgtbl_cfg.oas = smmu->oas;
fmt = ARM_64_LPAE_S2;
finalise_stage_fn = arm_smmu_domain_finalise_s2;
+ if ((smmu->features & ARM_SMMU_FEAT_S2FWB) &&
+ (flags & IOMMU_HWPT_ALLOC_NEST_PARENT))
+ pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_S2FWB;
break;
default:
return -EINVAL;
@@ -2483,8 +2528,8 @@ arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid)
}
}
-static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master,
- const struct arm_smmu_ste *target)
+void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master,
+ const struct arm_smmu_ste *target)
{
int i, j;
struct arm_smmu_device *smmu = master->smmu;
@@ -2595,7 +2640,7 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master)
static struct arm_smmu_master_domain *
arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain,
struct arm_smmu_master *master,
- ioasid_t ssid)
+ ioasid_t ssid, bool nested_ats_flush)
{
struct arm_smmu_master_domain *master_domain;
@@ -2604,7 +2649,8 @@ arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain,
list_for_each_entry(master_domain, &smmu_domain->devices,
devices_elm) {
if (master_domain->master == master &&
- master_domain->ssid == ssid)
+ master_domain->ssid == ssid &&
+ master_domain->nested_ats_flush == nested_ats_flush)
return master_domain;
}
return NULL;
@@ -2624,6 +2670,8 @@ to_smmu_domain_devices(struct iommu_domain *domain)
if ((domain->type & __IOMMU_DOMAIN_PAGING) ||
domain->type == IOMMU_DOMAIN_SVA)
return to_smmu_domain(domain);
+ if (domain->type == IOMMU_DOMAIN_NESTED)
+ return to_smmu_nested_domain(domain)->vsmmu->s2_parent;
return NULL;
}
@@ -2633,13 +2681,18 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master,
{
struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(domain);
struct arm_smmu_master_domain *master_domain;
+ bool nested_ats_flush = false;
unsigned long flags;
if (!smmu_domain)
return;
+ if (domain->type == IOMMU_DOMAIN_NESTED)
+ nested_ats_flush = to_smmu_nested_domain(domain)->enable_ats;
+
spin_lock_irqsave(&smmu_domain->devices_lock, flags);
- master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid);
+ master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid,
+ nested_ats_flush);
if (master_domain) {
list_del(&master_domain->devices_elm);
kfree(master_domain);
@@ -2649,16 +2702,6 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master,
spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
}
-struct arm_smmu_attach_state {
- /* Inputs */
- struct iommu_domain *old_domain;
- struct arm_smmu_master *master;
- bool cd_needs_ats;
- ioasid_t ssid;
- /* Resulting state */
- bool ats_enabled;
-};
-
/*
* Start the sequence to attach a domain to a master. The sequence contains three
* steps:
@@ -2679,8 +2722,8 @@ struct arm_smmu_attach_state {
* new_domain can be a non-paging domain. In this case ATS will not be enabled,
* and invalidations won't be tracked.
*/
-static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
- struct iommu_domain *new_domain)
+int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
+ struct iommu_domain *new_domain)
{
struct arm_smmu_master *master = state->master;
struct arm_smmu_master_domain *master_domain;
@@ -2706,7 +2749,8 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
* enabled if we have arm_smmu_domain, those always have page
* tables.
*/
- state->ats_enabled = arm_smmu_ats_supported(master);
+ state->ats_enabled = !state->disable_ats &&
+ arm_smmu_ats_supported(master);
}
if (smmu_domain) {
@@ -2715,6 +2759,9 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
return -ENOMEM;
master_domain->master = master;
master_domain->ssid = state->ssid;
+ if (new_domain->type == IOMMU_DOMAIN_NESTED)
+ master_domain->nested_ats_flush =
+ to_smmu_nested_domain(new_domain)->enable_ats;
/*
* During prepare we want the current smmu_domain and new
@@ -2731,6 +2778,14 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
* one of them.
*/
spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+ if (smmu_domain->enforce_cache_coherency &&
+ !arm_smmu_master_canwbs(master)) {
+ spin_unlock_irqrestore(&smmu_domain->devices_lock,
+ flags);
+ kfree(master_domain);
+ return -EINVAL;
+ }
+
if (state->ats_enabled)
atomic_inc(&smmu_domain->nr_ats_masters);
list_add(&master_domain->devices_elm, &smmu_domain->devices);
@@ -2754,7 +2809,7 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
* completes synchronizing the PCI device's ATC and finishes manipulating the
* smmu_domain->devices list.
*/
-static void arm_smmu_attach_commit(struct arm_smmu_attach_state *state)
+void arm_smmu_attach_commit(struct arm_smmu_attach_state *state)
{
struct arm_smmu_master *master = state->master;
@@ -3084,7 +3139,8 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
const struct iommu_user_data *user_data)
{
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
- const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_ALLOC_NEST_PARENT;
struct arm_smmu_domain *smmu_domain;
int ret;
@@ -3097,6 +3153,15 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
if (IS_ERR(smmu_domain))
return ERR_CAST(smmu_domain);
+ if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) {
+ if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING)) {
+ ret = -EOPNOTSUPP;
+ goto err_free;
+ }
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
+ smmu_domain->nest_parent = true;
+ }
+
smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
smmu_domain->domain.ops = arm_smmu_ops.default_domain_ops;
ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, flags);
@@ -3378,21 +3443,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
return group;
}
-static int arm_smmu_enable_nesting(struct iommu_domain *domain)
-{
- struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
- int ret = 0;
-
- mutex_lock(&smmu_domain->init_mutex);
- if (smmu_domain->smmu)
- ret = -EPERM;
- else
- smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
- mutex_unlock(&smmu_domain->init_mutex);
-
- return ret;
-}
-
static int arm_smmu_of_xlate(struct device *dev,
const struct of_phandle_args *args)
{
@@ -3491,6 +3541,7 @@ static struct iommu_ops arm_smmu_ops = {
.identity_domain = &arm_smmu_identity_domain,
.blocked_domain = &arm_smmu_blocked_domain,
.capable = arm_smmu_capable,
+ .hw_info = arm_smmu_hw_info,
.domain_alloc_paging = arm_smmu_domain_alloc_paging,
.domain_alloc_sva = arm_smmu_sva_domain_alloc,
.domain_alloc_user = arm_smmu_domain_alloc_user,
@@ -3504,17 +3555,19 @@ static struct iommu_ops arm_smmu_ops = {
.dev_disable_feat = arm_smmu_dev_disable_feature,
.page_response = arm_smmu_page_response,
.def_domain_type = arm_smmu_def_domain_type,
+ .viommu_alloc = arm_vsmmu_alloc,
+ .user_pasid_table = 1,
.pgsize_bitmap = -1UL, /* Restricted during device attach */
.owner = THIS_MODULE,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = arm_smmu_attach_dev,
+ .enforce_cache_coherency = arm_smmu_enforce_cache_coherency,
.set_dev_pasid = arm_smmu_s1_set_dev_pasid,
.map_pages = arm_smmu_map_pages,
.unmap_pages = arm_smmu_unmap_pages,
.flush_iotlb_all = arm_smmu_flush_iotlb_all,
.iotlb_sync = arm_smmu_iotlb_sync,
.iova_to_phys = arm_smmu_iova_to_phys,
- .enable_nesting = arm_smmu_enable_nesting,
.free = arm_smmu_domain_free_paging,
}
};
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 1e9952ca989f..af25f092303f 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -10,6 +10,7 @@
#include <linux/bitfield.h>
#include <linux/iommu.h>
+#include <linux/iommufd.h>
#include <linux/kernel.h>
#include <linux/mmzone.h>
#include <linux/sizes.h>
@@ -57,6 +58,7 @@ struct arm_smmu_device;
#define IDR1_SIDSIZE GENMASK(5, 0)
#define ARM_SMMU_IDR3 0xc
+#define IDR3_FWB (1 << 8)
#define IDR3_RIL (1 << 10)
#define ARM_SMMU_IDR5 0x14
@@ -81,6 +83,8 @@ struct arm_smmu_device;
#define IIDR_REVISION GENMASK(15, 12)
#define IIDR_IMPLEMENTER GENMASK(11, 0)
+#define ARM_SMMU_AIDR 0x1C
+
#define ARM_SMMU_CR0 0x20
#define CR0_ATSCHK (1 << 4)
#define CR0_CMDQEN (1 << 3)
@@ -241,6 +245,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
#define STRTAB_STE_0_CFG_BYPASS 4
#define STRTAB_STE_0_CFG_S1_TRANS 5
#define STRTAB_STE_0_CFG_S2_TRANS 6
+#define STRTAB_STE_0_CFG_NESTED 7
#define STRTAB_STE_0_S1FMT GENMASK_ULL(5, 4)
#define STRTAB_STE_0_S1FMT_LINEAR 0
@@ -261,6 +266,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
#define STRTAB_STE_1_S1COR GENMASK_ULL(5, 4)
#define STRTAB_STE_1_S1CSH GENMASK_ULL(7, 6)
+#define STRTAB_STE_1_S2FWB (1UL << 25)
#define STRTAB_STE_1_S1STALLD (1UL << 27)
#define STRTAB_STE_1_EATS GENMASK_ULL(29, 28)
@@ -292,6 +298,15 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
#define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4)
+/* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */
+#define STRTAB_STE_0_NESTING_ALLOWED \
+ cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \
+ STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX)
+#define STRTAB_STE_1_NESTING_ALLOWED \
+ cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | \
+ STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | \
+ STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS)
+
/*
* Context descriptors.
*
@@ -511,8 +526,10 @@ struct arm_smmu_cmdq_ent {
};
} cfgi;
+ #define CMDQ_OP_TLBI_NH_ALL 0x10
#define CMDQ_OP_TLBI_NH_ASID 0x11
#define CMDQ_OP_TLBI_NH_VA 0x12
+ #define CMDQ_OP_TLBI_NH_VAA 0x13
#define CMDQ_OP_TLBI_EL2_ALL 0x20
#define CMDQ_OP_TLBI_EL2_ASID 0x21
#define CMDQ_OP_TLBI_EL2_VA 0x22
@@ -726,6 +743,7 @@ struct arm_smmu_device {
#define ARM_SMMU_FEAT_ATTR_TYPES_OVR (1 << 20)
#define ARM_SMMU_FEAT_HA (1 << 21)
#define ARM_SMMU_FEAT_HD (1 << 22)
+#define ARM_SMMU_FEAT_S2FWB (1 << 23)
u32 features;
#define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
@@ -811,10 +829,20 @@ struct arm_smmu_domain {
/* List of struct arm_smmu_master_domain */
struct list_head devices;
spinlock_t devices_lock;
+ bool enforce_cache_coherency : 1;
+ bool nest_parent : 1;
struct mmu_notifier mmu_notifier;
};
+struct arm_smmu_nested_domain {
+ struct iommu_domain domain;
+ struct arm_vsmmu *vsmmu;
+ bool enable_ats : 1;
+
+ __le64 ste[2];
+};
+
/* The following are exposed for testing purposes. */
struct arm_smmu_entry_writer_ops;
struct arm_smmu_entry_writer {
@@ -827,21 +855,22 @@ struct arm_smmu_entry_writer_ops {
void (*sync)(struct arm_smmu_entry_writer *writer);
};
+void arm_smmu_make_abort_ste(struct arm_smmu_ste *target);
+void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
+ struct arm_smmu_master *master,
+ struct arm_smmu_domain *smmu_domain,
+ bool ats_enabled);
+
#if IS_ENABLED(CONFIG_KUNIT)
void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits);
void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *cur,
const __le64 *target);
void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits);
-void arm_smmu_make_abort_ste(struct arm_smmu_ste *target);
void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu,
struct arm_smmu_ste *target);
void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
struct arm_smmu_master *master, bool ats_enabled,
unsigned int s1dss);
-void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
- struct arm_smmu_master *master,
- struct arm_smmu_domain *smmu_domain,
- bool ats_enabled);
void arm_smmu_make_sva_cd(struct arm_smmu_cd *target,
struct arm_smmu_master *master, struct mm_struct *mm,
u16 asid);
@@ -851,6 +880,7 @@ struct arm_smmu_master_domain {
struct list_head devices_elm;
struct arm_smmu_master *master;
ioasid_t ssid;
+ bool nested_ats_flush : 1;
};
static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
@@ -858,6 +888,12 @@ static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
return container_of(dom, struct arm_smmu_domain, domain);
}
+static inline struct arm_smmu_nested_domain *
+to_smmu_nested_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct arm_smmu_nested_domain, domain);
+}
+
extern struct xarray arm_smmu_asid_xa;
extern struct mutex arm_smmu_asid_lock;
@@ -893,6 +929,33 @@ int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
int arm_smmu_cmdq_init(struct arm_smmu_device *smmu,
struct arm_smmu_cmdq *cmdq);
+static inline bool arm_smmu_master_canwbs(struct arm_smmu_master *master)
+{
+ return dev_iommu_fwspec_get(master->dev)->flags &
+ IOMMU_FWSPEC_PCI_RC_CANWBS;
+}
+
+struct arm_smmu_attach_state {
+ /* Inputs */
+ struct iommu_domain *old_domain;
+ struct arm_smmu_master *master;
+ bool cd_needs_ats;
+ bool disable_ats;
+ ioasid_t ssid;
+ /* Resulting state */
+ bool ats_enabled;
+};
+
+int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
+ struct iommu_domain *new_domain);
+void arm_smmu_attach_commit(struct arm_smmu_attach_state *state);
+void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master,
+ const struct arm_smmu_ste *target);
+
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq, u64 *cmds, int n,
+ bool sync);
+
#ifdef CONFIG_ARM_SMMU_V3_SVA
bool arm_smmu_sva_supported(struct arm_smmu_device *smmu);
bool arm_smmu_master_sva_supported(struct arm_smmu_master *master);
@@ -949,4 +1012,23 @@ tegra241_cmdqv_probe(struct arm_smmu_device *smmu)
return ERR_PTR(-ENODEV);
}
#endif /* CONFIG_TEGRA241_CMDQV */
+
+struct arm_vsmmu {
+ struct iommufd_viommu core;
+ struct arm_smmu_device *smmu;
+ struct arm_smmu_domain *s2_parent;
+ u16 vmid;
+};
+
+#if IS_ENABLED(CONFIG_ARM_SMMU_V3_IOMMUFD)
+void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type);
+struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev,
+ struct iommu_domain *parent,
+ struct iommufd_ctx *ictx,
+ unsigned int viommu_type);
+#else
+#define arm_smmu_hw_info NULL
+#define arm_vsmmu_alloc NULL
+#endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */
+
#endif /* _ARM_SMMU_V3_H */
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 8321962b3714..12b173eec454 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1558,21 +1558,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
return group;
}
-static int arm_smmu_enable_nesting(struct iommu_domain *domain)
-{
- struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
- int ret = 0;
-
- mutex_lock(&smmu_domain->init_mutex);
- if (smmu_domain->smmu)
- ret = -EPERM;
- else
- smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
- mutex_unlock(&smmu_domain->init_mutex);
-
- return ret;
-}
-
static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain,
unsigned long quirks)
{
@@ -1656,7 +1641,6 @@ static struct iommu_ops arm_smmu_ops = {
.flush_iotlb_all = arm_smmu_flush_iotlb_all,
.iotlb_sync = arm_smmu_iotlb_sync,
.iova_to_phys = arm_smmu_iova_to_phys,
- .enable_nesting = arm_smmu_enable_nesting,
.set_pgtable_quirks = arm_smmu_set_pgtable_quirks,
.free = arm_smmu_domain_free,
}
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 0e67f1721a3d..74f58c6ac30c 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -106,6 +106,18 @@
#define ARM_LPAE_PTE_HAP_FAULT (((arm_lpae_iopte)0) << 6)
#define ARM_LPAE_PTE_HAP_READ (((arm_lpae_iopte)1) << 6)
#define ARM_LPAE_PTE_HAP_WRITE (((arm_lpae_iopte)2) << 6)
+/*
+ * For !FWB these code to:
+ * 1111 = Normal outer write back cachable / Inner Write Back Cachable
+ * Permit S1 to override
+ * 0101 = Normal Non-cachable / Inner Non-cachable
+ * 0001 = Device / Device-nGnRE
+ * For S2FWB these code:
+ * 0110 Force Normal Write Back
+ * 0101 Normal* is forced Normal-NC, Device unchanged
+ * 0001 Force Device-nGnRE
+ */
+#define ARM_LPAE_PTE_MEMATTR_FWB_WB (((arm_lpae_iopte)0x6) << 2)
#define ARM_LPAE_PTE_MEMATTR_OIWB (((arm_lpae_iopte)0xf) << 2)
#define ARM_LPAE_PTE_MEMATTR_NC (((arm_lpae_iopte)0x5) << 2)
#define ARM_LPAE_PTE_MEMATTR_DEV (((arm_lpae_iopte)0x1) << 2)
@@ -458,12 +470,16 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
*/
if (data->iop.fmt == ARM_64_LPAE_S2 ||
data->iop.fmt == ARM_32_LPAE_S2) {
- if (prot & IOMMU_MMIO)
+ if (prot & IOMMU_MMIO) {
pte |= ARM_LPAE_PTE_MEMATTR_DEV;
- else if (prot & IOMMU_CACHE)
- pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
- else
+ } else if (prot & IOMMU_CACHE) {
+ if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_S2FWB)
+ pte |= ARM_LPAE_PTE_MEMATTR_FWB_WB;
+ else
+ pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
+ } else {
pte |= ARM_LPAE_PTE_MEMATTR_NC;
+ }
} else {
if (prot & IOMMU_MMIO)
pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV
@@ -1035,8 +1051,7 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
struct arm_lpae_io_pgtable *data;
typeof(&cfg->arm_lpae_s2_cfg.vtcr) vtcr = &cfg->arm_lpae_s2_cfg.vtcr;
- /* The NS quirk doesn't apply at stage 2 */
- if (cfg->quirks)
+ if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_S2FWB))
return NULL;
data = arm_lpae_alloc_pgtable(cfg);
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 83c8e617a2c5..dbd70d5a4702 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2723,16 +2723,6 @@ static int __init iommu_init(void)
}
core_initcall(iommu_init);
-int iommu_enable_nesting(struct iommu_domain *domain)
-{
- if (domain->type != IOMMU_DOMAIN_UNMANAGED)
- return -EINVAL;
- if (!domain->ops->enable_nesting)
- return -EINVAL;
- return domain->ops->enable_nesting(domain);
-}
-EXPORT_SYMBOL_GPL(iommu_enable_nesting);
-
int iommu_set_pgtable_quirks(struct iommu_domain *domain,
unsigned long quirk)
{
diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig
index 76656fe0470d..0a07f9449fd9 100644
--- a/drivers/iommu/iommufd/Kconfig
+++ b/drivers/iommu/iommufd/Kconfig
@@ -1,4 +1,8 @@
# SPDX-License-Identifier: GPL-2.0-only
+config IOMMUFD_DRIVER_CORE
+ tristate
+ default (IOMMUFD_DRIVER || IOMMUFD) if IOMMUFD!=n
+
config IOMMUFD
tristate "IOMMU Userspace API"
select INTERVAL_TREE
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index cf4605962bea..cb784da6cddc 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -7,9 +7,13 @@ iommufd-y := \
ioas.o \
main.o \
pages.o \
- vfio_compat.o
+ vfio_compat.o \
+ viommu.o
iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
obj-$(CONFIG_IOMMUFD) += iommufd.o
obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o
+
+iommufd_driver-y := driver.o
+obj-$(CONFIG_IOMMUFD_DRIVER_CORE) += iommufd_driver.o
diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
new file mode 100644
index 000000000000..7b67fdf44134
--- /dev/null
+++ b/drivers/iommu/iommufd/driver.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "iommufd_private.h"
+
+struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
+ size_t size,
+ enum iommufd_object_type type)
+{
+ struct iommufd_object *obj;
+ int rc;
+
+ obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!obj)
+ return ERR_PTR(-ENOMEM);
+ obj->type = type;
+ /* Starts out bias'd by 1 until it is removed from the xarray */
+ refcount_set(&obj->shortterm_users, 1);
+ refcount_set(&obj->users, 1);
+
+ /*
+ * Reserve an ID in the xarray but do not publish the pointer yet since
+ * the caller hasn't initialized it yet. Once the pointer is published
+ * in the xarray and visible to other threads we can't reliably destroy
+ * it anymore, so the caller must complete all errorable operations
+ * before calling iommufd_object_finalize().
+ */
+ rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, xa_limit_31b,
+ GFP_KERNEL_ACCOUNT);
+ if (rc)
+ goto out_free;
+ return obj;
+out_free:
+ kfree(obj);
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_NS_GPL(_iommufd_object_alloc, IOMMUFD);
+
+/* Caller should xa_lock(&viommu->vdevs) to protect the return value */
+struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
+ unsigned long vdev_id)
+{
+ struct iommufd_vdevice *vdev;
+
+ lockdep_assert_held(&viommu->vdevs.xa_lock);
+
+ vdev = xa_load(&viommu->vdevs, vdev_id);
+ return vdev ? vdev->dev : NULL;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_dev, IOMMUFD);
+
+MODULE_DESCRIPTION("iommufd code shared with builtin modules");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c
index e590973ce5cf..053b0e30f55a 100644
--- a/drivers/iommu/iommufd/fault.c
+++ b/drivers/iommu/iommufd/fault.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
+#include <linux/pci-ats.h>
#include <linux/poll.h>
#include <uapi/linux/iommufd.h>
@@ -27,8 +28,12 @@ static int iommufd_fault_iopf_enable(struct iommufd_device *idev)
* resource between PF and VFs. There is no coordination for this
* shared capability. This waits for a vPRI reset to recover.
*/
- if (dev_is_pci(dev) && to_pci_dev(dev)->is_virtfn)
- return -EINVAL;
+ if (dev_is_pci(dev)) {
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ if (pdev->is_virtfn && pci_pri_supported(pdev))
+ return -EINVAL;
+ }
mutex_lock(&idev->iopf_lock);
/* Device iopf has already been on. */
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index d06bf6e6c19f..702057655a81 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -57,7 +57,10 @@ void iommufd_hwpt_nested_destroy(struct iommufd_object *obj)
container_of(obj, struct iommufd_hwpt_nested, common.obj);
__iommufd_hwpt_destroy(&hwpt_nested->common);
- refcount_dec(&hwpt_nested->parent->common.obj.users);
+ if (hwpt_nested->viommu)
+ refcount_dec(&hwpt_nested->viommu->obj.users);
+ else
+ refcount_dec(&hwpt_nested->parent->common.obj.users);
}
void iommufd_hwpt_nested_abort(struct iommufd_object *obj)
@@ -248,8 +251,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
}
hwpt->domain->owner = ops;
- if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED ||
- !hwpt->domain->ops->cache_invalidate_user)) {
+ if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
rc = -EINVAL;
goto out_abort;
}
@@ -260,6 +262,58 @@ out_abort:
return ERR_PTR(rc);
}
+/**
+ * iommufd_viommu_alloc_hwpt_nested() - Get a hwpt_nested for a vIOMMU
+ * @viommu: vIOMMU ojbect to associate the hwpt_nested/domain with
+ * @flags: Flags from userspace
+ * @user_data: user_data pointer. Must be valid
+ *
+ * Allocate a new IOMMU_DOMAIN_NESTED for a vIOMMU and return it as a NESTED
+ * hw_pagetable.
+ */
+static struct iommufd_hwpt_nested *
+iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags,
+ const struct iommu_user_data *user_data)
+{
+ struct iommufd_hwpt_nested *hwpt_nested;
+ struct iommufd_hw_pagetable *hwpt;
+ int rc;
+
+ if (!user_data->len)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (!viommu->ops || !viommu->ops->alloc_domain_nested)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ hwpt_nested = __iommufd_object_alloc(
+ viommu->ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj);
+ if (IS_ERR(hwpt_nested))
+ return ERR_CAST(hwpt_nested);
+ hwpt = &hwpt_nested->common;
+
+ hwpt_nested->viommu = viommu;
+ refcount_inc(&viommu->obj.users);
+ hwpt_nested->parent = viommu->hwpt;
+
+ hwpt->domain =
+ viommu->ops->alloc_domain_nested(viommu, flags, user_data);
+ if (IS_ERR(hwpt->domain)) {
+ rc = PTR_ERR(hwpt->domain);
+ hwpt->domain = NULL;
+ goto out_abort;
+ }
+ hwpt->domain->owner = viommu->iommu_dev->ops;
+
+ if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
+ rc = -EINVAL;
+ goto out_abort;
+ }
+ return hwpt_nested;
+
+out_abort:
+ iommufd_object_abort_and_destroy(viommu->ictx, &hwpt->obj);
+ return ERR_PTR(rc);
+}
+
int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
{
struct iommu_hwpt_alloc *cmd = ucmd->cmd;
@@ -316,6 +370,22 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
goto out_unlock;
}
hwpt = &hwpt_nested->common;
+ } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) {
+ struct iommufd_hwpt_nested *hwpt_nested;
+ struct iommufd_viommu *viommu;
+
+ viommu = container_of(pt_obj, struct iommufd_viommu, obj);
+ if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+ hwpt_nested = iommufd_viommu_alloc_hwpt_nested(
+ viommu, cmd->flags, &user_data);
+ if (IS_ERR(hwpt_nested)) {
+ rc = PTR_ERR(hwpt_nested);
+ goto out_unlock;
+ }
+ hwpt = &hwpt_nested->common;
} else {
rc = -EINVAL;
goto out_put_pt;
@@ -412,7 +482,7 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
.entry_len = cmd->entry_len,
.entry_num = cmd->entry_num,
};
- struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_object *pt_obj;
u32 done_num = 0;
int rc;
@@ -426,17 +496,40 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
goto out;
}
- hwpt = iommufd_get_hwpt_nested(ucmd, cmd->hwpt_id);
- if (IS_ERR(hwpt)) {
- rc = PTR_ERR(hwpt);
+ pt_obj = iommufd_get_object(ucmd->ictx, cmd->hwpt_id, IOMMUFD_OBJ_ANY);
+ if (IS_ERR(pt_obj)) {
+ rc = PTR_ERR(pt_obj);
goto out;
}
+ if (pt_obj->type == IOMMUFD_OBJ_HWPT_NESTED) {
+ struct iommufd_hw_pagetable *hwpt =
+ container_of(pt_obj, struct iommufd_hw_pagetable, obj);
+
+ if (!hwpt->domain->ops ||
+ !hwpt->domain->ops->cache_invalidate_user) {
+ rc = -EOPNOTSUPP;
+ goto out_put_pt;
+ }
+ rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain,
+ &data_array);
+ } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) {
+ struct iommufd_viommu *viommu =
+ container_of(pt_obj, struct iommufd_viommu, obj);
+
+ if (!viommu->ops || !viommu->ops->cache_invalidate) {
+ rc = -EOPNOTSUPP;
+ goto out_put_pt;
+ }
+ rc = viommu->ops->cache_invalidate(viommu, &data_array);
+ } else {
+ rc = -EINVAL;
+ goto out_put_pt;
+ }
- rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain,
- &data_array);
done_num = data_array.entry_num;
- iommufd_put_object(ucmd->ictx, &hwpt->obj);
+out_put_pt:
+ iommufd_put_object(ucmd->ictx, pt_obj);
out:
cmd->entry_num = done_num;
if (iommufd_ucmd_respond(ucmd, sizeof(*cmd)))
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 4bf7ccd39d46..8a790e597e12 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -107,9 +107,9 @@ static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
* Does not return a 0 IOVA even if it is valid.
*/
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
- unsigned long uptr, unsigned long length)
+ unsigned long addr, unsigned long length)
{
- unsigned long page_offset = uptr % PAGE_SIZE;
+ unsigned long page_offset = addr % PAGE_SIZE;
struct interval_tree_double_span_iter used_span;
struct interval_tree_span_iter allowed_span;
unsigned long max_alignment = PAGE_SIZE;
@@ -122,15 +122,15 @@ static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
return -EOVERFLOW;
/*
- * Keep alignment present in the uptr when building the IOVA, this
+ * Keep alignment present in addr when building the IOVA, which
* increases the chance we can map a THP.
*/
- if (!uptr)
+ if (!addr)
iova_alignment = roundup_pow_of_two(length);
else
iova_alignment = min_t(unsigned long,
roundup_pow_of_two(length),
- 1UL << __ffs64(uptr));
+ 1UL << __ffs64(addr));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
max_alignment = HPAGE_SIZE;
@@ -248,6 +248,7 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
int iommu_prot, unsigned int flags)
{
struct iopt_pages_list *elm;
+ unsigned long start;
unsigned long iova;
int rc = 0;
@@ -267,9 +268,15 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
/* Use the first entry to guess the ideal IOVA alignment */
elm = list_first_entry(pages_list, struct iopt_pages_list,
next);
- rc = iopt_alloc_iova(
- iopt, dst_iova,
- (uintptr_t)elm->pages->uptr + elm->start_byte, length);
+ switch (elm->pages->type) {
+ case IOPT_ADDRESS_USER:
+ start = elm->start_byte + (uintptr_t)elm->pages->uptr;
+ break;
+ case IOPT_ADDRESS_FILE:
+ start = elm->start_byte + elm->pages->start;
+ break;
+ }
+ rc = iopt_alloc_iova(iopt, dst_iova, start, length);
if (rc)
goto out_unlock;
if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
@@ -384,6 +391,34 @@ out_unlock_domains:
return rc;
}
+static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+ struct iopt_pages *pages, unsigned long *iova,
+ unsigned long length, unsigned long start_byte,
+ int iommu_prot, unsigned int flags)
+{
+ struct iopt_pages_list elm = {};
+ LIST_HEAD(pages_list);
+ int rc;
+
+ elm.pages = pages;
+ elm.start_byte = start_byte;
+ if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
+ elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
+ elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
+ elm.length = length;
+ list_add(&elm.next, &pages_list);
+
+ rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
+ if (rc) {
+ if (elm.area)
+ iopt_abort_area(elm.area);
+ if (elm.pages)
+ iopt_put_pages(elm.pages);
+ return rc;
+ }
+ return 0;
+}
+
/**
* iopt_map_user_pages() - Map a user VA to an iova in the io page table
* @ictx: iommufd_ctx the iopt is part of
@@ -408,29 +443,41 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
unsigned long length, int iommu_prot,
unsigned int flags)
{
- struct iopt_pages_list elm = {};
- LIST_HEAD(pages_list);
- int rc;
+ struct iopt_pages *pages;
- elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
- if (IS_ERR(elm.pages))
- return PTR_ERR(elm.pages);
- if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
- elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
- elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
- elm.start_byte = uptr - elm.pages->uptr;
- elm.length = length;
- list_add(&elm.next, &pages_list);
+ pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
- rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
- if (rc) {
- if (elm.area)
- iopt_abort_area(elm.area);
- if (elm.pages)
- iopt_put_pages(elm.pages);
- return rc;
- }
- return 0;
+ return iopt_map_common(ictx, iopt, pages, iova, length,
+ uptr - pages->uptr, iommu_prot, flags);
+}
+
+/**
+ * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
+ * @ictx: iommufd_ctx the iopt is part of
+ * @iopt: io_pagetable to act on
+ * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
+ * the chosen iova on output. Otherwise is the iova to map to on input
+ * @file: file to map
+ * @start: map file starting at this byte offset
+ * @length: Number of bytes to map
+ * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
+ * @flags: IOPT_ALLOC_IOVA or zero
+ */
+int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+ unsigned long *iova, struct file *file,
+ unsigned long start, unsigned long length,
+ int iommu_prot, unsigned int flags)
+{
+ struct iopt_pages *pages;
+
+ pages = iopt_alloc_file_pages(file, start, length,
+ iommu_prot & IOMMU_WRITE);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+ return iopt_map_common(ictx, iopt, pages, iova, length,
+ start - pages->start, iommu_prot, flags);
}
struct iova_bitmap_fn_arg {
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index c61d74471684..10c928a9a463 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -173,6 +173,12 @@ enum {
IOPT_PAGES_ACCOUNT_NONE = 0,
IOPT_PAGES_ACCOUNT_USER = 1,
IOPT_PAGES_ACCOUNT_MM = 2,
+ IOPT_PAGES_ACCOUNT_MODE_NUM = 3,
+};
+
+enum iopt_address_type {
+ IOPT_ADDRESS_USER = 0,
+ IOPT_ADDRESS_FILE = 1,
};
/*
@@ -195,7 +201,14 @@ struct iopt_pages {
struct task_struct *source_task;
struct mm_struct *source_mm;
struct user_struct *source_user;
- void __user *uptr;
+ enum iopt_address_type type;
+ union {
+ void __user *uptr; /* IOPT_ADDRESS_USER */
+ struct { /* IOPT_ADDRESS_FILE */
+ struct file *file;
+ unsigned long start;
+ };
+ };
bool writable:1;
u8 account_mode;
@@ -206,8 +219,10 @@ struct iopt_pages {
struct rb_root_cached domains_itree;
};
-struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
- bool writable);
+struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
+ unsigned long length, bool writable);
+struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start,
+ unsigned long length, bool writable);
void iopt_release_pages(struct kref *kref);
static inline void iopt_put_pages(struct iopt_pages *pages)
{
@@ -238,4 +253,9 @@ struct iopt_pages_access {
unsigned int users;
};
+struct pfn_reader_user;
+
+int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages,
+ bool inc, struct pfn_reader_user *user);
+
#endif
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 2c4b2bb11e78..1542c5fd10a8 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -2,6 +2,7 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
*/
+#include <linux/file.h>
#include <linux/interval_tree.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
@@ -51,7 +52,10 @@ int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd)
rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
if (rc)
goto out_table;
+
+ down_read(&ucmd->ictx->ioas_creation_lock);
iommufd_object_finalize(ucmd->ictx, &ioas->obj);
+ up_read(&ucmd->ictx->ioas_creation_lock);
return 0;
out_table:
@@ -197,6 +201,52 @@ static int conv_iommu_prot(u32 map_flags)
return iommu_prot;
}
+int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_map_file *cmd = ucmd->cmd;
+ unsigned long iova = cmd->iova;
+ struct iommufd_ioas *ioas;
+ unsigned int flags = 0;
+ struct file *file;
+ int rc;
+
+ if (cmd->flags &
+ ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE))
+ return -EOPNOTSUPP;
+
+ if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
+ return -EOVERFLOW;
+
+ if (!(cmd->flags &
+ (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)))
+ return -EINVAL;
+
+ ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
+ flags = IOPT_ALLOC_IOVA;
+
+ file = fget(cmd->fd);
+ if (!file)
+ return -EBADF;
+
+ rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file,
+ cmd->start, cmd->length,
+ conv_iommu_prot(cmd->flags), flags);
+ if (rc)
+ goto out_put;
+
+ cmd->iova = iova;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_put:
+ iommufd_put_object(ucmd->ictx, &ioas->obj);
+ fput(file);
+ return rc;
+}
+
int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
{
struct iommu_ioas_map *cmd = ucmd->cmd;
@@ -327,6 +377,215 @@ out_put:
return rc;
}
+static void iommufd_release_all_iova_rwsem(struct iommufd_ctx *ictx,
+ struct xarray *ioas_list)
+{
+ struct iommufd_ioas *ioas;
+ unsigned long index;
+
+ xa_for_each(ioas_list, index, ioas) {
+ up_write(&ioas->iopt.iova_rwsem);
+ refcount_dec(&ioas->obj.users);
+ }
+ up_write(&ictx->ioas_creation_lock);
+ xa_destroy(ioas_list);
+}
+
+static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
+ struct xarray *ioas_list)
+{
+ struct iommufd_object *obj;
+ unsigned long index;
+ int rc;
+
+ /*
+ * This is very ugly, it is done instead of adding a lock around
+ * pages->source_mm, which is a performance path for mdev, we just
+ * obtain the write side of all the iova_rwsems which also protects the
+ * pages->source_*. Due to copies we can't know which IOAS could read
+ * from the pages, so we just lock everything. This is the only place
+ * locks are nested and they are uniformly taken in ID order.
+ *
+ * ioas_creation_lock prevents new IOAS from being installed in the
+ * xarray while we do this, and also prevents more than one thread from
+ * holding nested locks.
+ */
+ down_write(&ictx->ioas_creation_lock);
+ xa_lock(&ictx->objects);
+ xa_for_each(&ictx->objects, index, obj) {
+ struct iommufd_ioas *ioas;
+
+ if (!obj || obj->type != IOMMUFD_OBJ_IOAS)
+ continue;
+
+ if (!refcount_inc_not_zero(&obj->users))
+ continue;
+
+ xa_unlock(&ictx->objects);
+
+ ioas = container_of(obj, struct iommufd_ioas, obj);
+ down_write_nest_lock(&ioas->iopt.iova_rwsem,
+ &ictx->ioas_creation_lock);
+
+ rc = xa_err(xa_store(ioas_list, index, ioas, GFP_KERNEL));
+ if (rc) {
+ iommufd_release_all_iova_rwsem(ictx, ioas_list);
+ return rc;
+ }
+
+ xa_lock(&ictx->objects);
+ }
+ xa_unlock(&ictx->objects);
+ return 0;
+}
+
+static bool need_charge_update(struct iopt_pages *pages)
+{
+ switch (pages->account_mode) {
+ case IOPT_PAGES_ACCOUNT_NONE:
+ return false;
+ case IOPT_PAGES_ACCOUNT_MM:
+ return pages->source_mm != current->mm;
+ case IOPT_PAGES_ACCOUNT_USER:
+ /*
+ * Update when mm changes because it also accounts
+ * in mm->pinned_vm.
+ */
+ return (pages->source_user != current_user()) ||
+ (pages->source_mm != current->mm);
+ }
+ return true;
+}
+
+static int charge_current(unsigned long *npinned)
+{
+ struct iopt_pages tmp = {
+ .source_mm = current->mm,
+ .source_task = current->group_leader,
+ .source_user = current_user(),
+ };
+ unsigned int account_mode;
+ int rc;
+
+ for (account_mode = 0; account_mode != IOPT_PAGES_ACCOUNT_MODE_NUM;
+ account_mode++) {
+ if (!npinned[account_mode])
+ continue;
+
+ tmp.account_mode = account_mode;
+ rc = iopt_pages_update_pinned(&tmp, npinned[account_mode], true,
+ NULL);
+ if (rc)
+ goto err_undo;
+ }
+ return 0;
+
+err_undo:
+ while (account_mode != 0) {
+ account_mode--;
+ if (!npinned[account_mode])
+ continue;
+ tmp.account_mode = account_mode;
+ iopt_pages_update_pinned(&tmp, npinned[account_mode], false,
+ NULL);
+ }
+ return rc;
+}
+
+static void change_mm(struct iopt_pages *pages)
+{
+ struct task_struct *old_task = pages->source_task;
+ struct user_struct *old_user = pages->source_user;
+ struct mm_struct *old_mm = pages->source_mm;
+
+ pages->source_mm = current->mm;
+ mmgrab(pages->source_mm);
+ mmdrop(old_mm);
+
+ pages->source_task = current->group_leader;
+ get_task_struct(pages->source_task);
+ put_task_struct(old_task);
+
+ pages->source_user = get_uid(current_user());
+ free_uid(old_user);
+}
+
+#define for_each_ioas_area(_xa, _index, _ioas, _area) \
+ xa_for_each((_xa), (_index), (_ioas)) \
+ for (_area = iopt_area_iter_first(&_ioas->iopt, 0, ULONG_MAX); \
+ _area; \
+ _area = iopt_area_iter_next(_area, 0, ULONG_MAX))
+
+int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_change_process *cmd = ucmd->cmd;
+ struct iommufd_ctx *ictx = ucmd->ictx;
+ unsigned long all_npinned[IOPT_PAGES_ACCOUNT_MODE_NUM] = {};
+ struct iommufd_ioas *ioas;
+ struct iopt_area *area;
+ struct iopt_pages *pages;
+ struct xarray ioas_list;
+ unsigned long index;
+ int rc;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ xa_init(&ioas_list);
+ rc = iommufd_take_all_iova_rwsem(ictx, &ioas_list);
+ if (rc)
+ return rc;
+
+ for_each_ioas_area(&ioas_list, index, ioas, area) {
+ if (area->pages->type != IOPT_ADDRESS_FILE) {
+ rc = -EINVAL;
+ goto out;
+ }
+ }
+
+ /*
+ * Count last_pinned pages, then clear it to avoid double counting
+ * if the same iopt_pages is visited multiple times in this loop.
+ * Since we are under all the locks, npinned == last_npinned, so we
+ * can easily restore last_npinned before we return.
+ */
+ for_each_ioas_area(&ioas_list, index, ioas, area) {
+ pages = area->pages;
+
+ if (need_charge_update(pages)) {
+ all_npinned[pages->account_mode] += pages->last_npinned;
+ pages->last_npinned = 0;
+ }
+ }
+
+ rc = charge_current(all_npinned);
+
+ if (rc) {
+ /* Charge failed. Fix last_npinned and bail. */
+ for_each_ioas_area(&ioas_list, index, ioas, area)
+ area->pages->last_npinned = area->pages->npinned;
+ goto out;
+ }
+
+ for_each_ioas_area(&ioas_list, index, ioas, area) {
+ pages = area->pages;
+
+ /* Uncharge the old one (which also restores last_npinned) */
+ if (need_charge_update(pages)) {
+ int r = iopt_pages_update_pinned(pages, pages->npinned,
+ false, NULL);
+
+ if (WARN_ON(r))
+ rc = r;
+ }
+ change_mm(pages);
+ }
+
+out:
+ iommufd_release_all_iova_rwsem(ictx, &ioas_list);
+ return rc;
+}
+
int iommufd_option_rlimit_mode(struct iommu_option *cmd,
struct iommufd_ctx *ictx)
{
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index f1d865e6fab6..b6d706cf2c66 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -5,8 +5,8 @@
#define __IOMMUFD_PRIVATE_H
#include <linux/iommu.h>
+#include <linux/iommufd.h>
#include <linux/iova_bitmap.h>
-#include <linux/refcount.h>
#include <linux/rwsem.h>
#include <linux/uaccess.h>
#include <linux/xarray.h>
@@ -24,6 +24,7 @@ struct iommufd_ctx {
struct xarray objects;
struct xarray groups;
wait_queue_head_t destroy_wait;
+ struct rw_semaphore ioas_creation_lock;
u8 account_mode;
/* Compatibility with VFIO no iommu */
@@ -69,6 +70,10 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
unsigned long *iova, void __user *uptr,
unsigned long length, int iommu_prot,
unsigned int flags);
+int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+ unsigned long *iova, struct file *file,
+ unsigned long start, unsigned long length,
+ int iommu_prot, unsigned int flags);
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
unsigned long length, unsigned long *dst_iova,
int iommu_prot, unsigned int flags);
@@ -122,29 +127,6 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
return 0;
}
-enum iommufd_object_type {
- IOMMUFD_OBJ_NONE,
- IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
- IOMMUFD_OBJ_DEVICE,
- IOMMUFD_OBJ_HWPT_PAGING,
- IOMMUFD_OBJ_HWPT_NESTED,
- IOMMUFD_OBJ_IOAS,
- IOMMUFD_OBJ_ACCESS,
- IOMMUFD_OBJ_FAULT,
-#ifdef CONFIG_IOMMUFD_TEST
- IOMMUFD_OBJ_SELFTEST,
-#endif
- IOMMUFD_OBJ_MAX,
-};
-
-/* Base struct for all objects with a userspace ID handle. */
-struct iommufd_object {
- refcount_t shortterm_users;
- refcount_t users;
- enum iommufd_object_type type;
- unsigned int id;
-};
-
static inline bool iommufd_lock_obj(struct iommufd_object *obj)
{
if (!refcount_inc_not_zero(&obj->users))
@@ -225,10 +207,6 @@ iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx,
iommufd_object_remove(ictx, obj, obj->id, 0);
}
-struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
- size_t size,
- enum iommufd_object_type type);
-
#define __iommufd_object_alloc(ictx, ptr, type, obj) \
container_of(_iommufd_object_alloc( \
ictx, \
@@ -276,6 +254,8 @@ void iommufd_ioas_destroy(struct iommufd_object *obj);
int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd);
int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd);
int iommufd_ioas_map(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd);
int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
@@ -312,6 +292,7 @@ struct iommufd_hwpt_paging {
struct iommufd_hwpt_nested {
struct iommufd_hw_pagetable common;
struct iommufd_hwpt_paging *parent;
+ struct iommufd_viommu *viommu;
};
static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt)
@@ -528,6 +509,27 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev,
return iommu_group_replace_domain(idev->igroup->group, hwpt->domain);
}
+static inline struct iommufd_viommu *
+iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id)
+{
+ return container_of(iommufd_get_object(ucmd->ictx, id,
+ IOMMUFD_OBJ_VIOMMU),
+ struct iommufd_viommu, obj);
+}
+
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_viommu_destroy(struct iommufd_object *obj);
+int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_vdevice_destroy(struct iommufd_object *obj);
+
+struct iommufd_vdevice {
+ struct iommufd_object obj;
+ struct iommufd_ctx *ictx;
+ struct iommufd_viommu *viommu;
+ struct device *dev;
+ u64 id; /* per-vIOMMU virtual ID */
+};
+
#ifdef CONFIG_IOMMUFD_TEST
int iommufd_test(struct iommufd_ucmd *ucmd);
void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index f4bc23a92f9a..a6b7a163f636 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -23,6 +23,7 @@ enum {
IOMMU_TEST_OP_DIRTY,
IOMMU_TEST_OP_MD_CHECK_IOTLB,
IOMMU_TEST_OP_TRIGGER_IOPF,
+ IOMMU_TEST_OP_DEV_CHECK_CACHE,
};
enum {
@@ -54,6 +55,11 @@ enum {
MOCK_NESTED_DOMAIN_IOTLB_NUM = 4,
};
+enum {
+ MOCK_DEV_CACHE_ID_MAX = 3,
+ MOCK_DEV_CACHE_NUM = 4,
+};
+
struct iommu_test_cmd {
__u32 size;
__u32 op;
@@ -135,6 +141,10 @@ struct iommu_test_cmd {
__u32 perm;
__u64 addr;
} trigger_iopf;
+ struct {
+ __u32 id;
+ __u32 cache;
+ } check_dev_cache;
};
__u32 last;
};
@@ -152,6 +162,7 @@ struct iommu_test_hw_info {
/* Should not be equal to any defined value in enum iommu_hwpt_data_type */
#define IOMMU_HWPT_DATA_SELFTEST 0xdead
#define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef
+#define IOMMU_TEST_DEV_CACHE_DEFAULT 0xbaddad
/**
* struct iommu_hwpt_selftest
@@ -180,4 +191,25 @@ struct iommu_hwpt_invalidate_selftest {
__u32 iotlb_id;
};
+#define IOMMU_VIOMMU_TYPE_SELFTEST 0xdeadbeef
+
+/* Should not be equal to any defined value in enum iommu_viommu_invalidate_data_type */
+#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST 0xdeadbeef
+#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID 0xdadbeef
+
+/**
+ * struct iommu_viommu_invalidate_selftest - Invalidation data for Mock VIOMMU
+ * (IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST)
+ * @flags: Invalidate flags
+ * @cache_id: Invalidate cache entry index
+ *
+ * If IOMMU_TEST_INVALIDATE_ALL is set in @flags, @cache_id will be ignored
+ */
+struct iommu_viommu_invalidate_selftest {
+#define IOMMU_TEST_INVALIDATE_FLAG_ALL (1 << 0)
+ __u32 flags;
+ __u32 vdev_id;
+ __u32 cache_id;
+};
+
#endif
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index b5f5d27ee963..0a96cc8f27da 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -29,38 +29,6 @@ struct iommufd_object_ops {
static const struct iommufd_object_ops iommufd_object_ops[];
static struct miscdevice vfio_misc_dev;
-struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
- size_t size,
- enum iommufd_object_type type)
-{
- struct iommufd_object *obj;
- int rc;
-
- obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
- if (!obj)
- return ERR_PTR(-ENOMEM);
- obj->type = type;
- /* Starts out bias'd by 1 until it is removed from the xarray */
- refcount_set(&obj->shortterm_users, 1);
- refcount_set(&obj->users, 1);
-
- /*
- * Reserve an ID in the xarray but do not publish the pointer yet since
- * the caller hasn't initialized it yet. Once the pointer is published
- * in the xarray and visible to other threads we can't reliably destroy
- * it anymore, so the caller must complete all errorable operations
- * before calling iommufd_object_finalize().
- */
- rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY,
- xa_limit_31b, GFP_KERNEL_ACCOUNT);
- if (rc)
- goto out_free;
- return obj;
-out_free:
- kfree(obj);
- return ERR_PTR(rc);
-}
-
/*
* Allow concurrent access to the object.
*
@@ -73,20 +41,26 @@ out_free:
void iommufd_object_finalize(struct iommufd_ctx *ictx,
struct iommufd_object *obj)
{
+ XA_STATE(xas, &ictx->objects, obj->id);
void *old;
- old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL);
- /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */
- WARN_ON(old);
+ xa_lock(&ictx->objects);
+ old = xas_store(&xas, obj);
+ xa_unlock(&ictx->objects);
+ /* obj->id was returned from xa_alloc() so the xas_store() cannot fail */
+ WARN_ON(old != XA_ZERO_ENTRY);
}
/* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */
void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj)
{
+ XA_STATE(xas, &ictx->objects, obj->id);
void *old;
- old = xa_erase(&ictx->objects, obj->id);
- WARN_ON(old);
+ xa_lock(&ictx->objects);
+ old = xas_store(&xas, NULL);
+ xa_unlock(&ictx->objects);
+ WARN_ON(old != XA_ZERO_ENTRY);
kfree(obj);
}
@@ -248,6 +222,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n");
}
+ init_rwsem(&ictx->ioas_creation_lock);
xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
xa_init(&ictx->groups);
ictx->file = filp;
@@ -333,6 +308,8 @@ union ucmd_buffer {
struct iommu_ioas_unmap unmap;
struct iommu_option option;
struct iommu_vfio_ioas vfio_ioas;
+ struct iommu_viommu_alloc viommu;
+ struct iommu_vdevice_alloc vdev;
#ifdef CONFIG_IOMMUFD_TEST
struct iommu_test_cmd test;
#endif
@@ -372,18 +349,26 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
struct iommu_ioas_alloc, out_ioas_id),
IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
struct iommu_ioas_allow_iovas, allowed_iovas),
+ IOCTL_OP(IOMMU_IOAS_CHANGE_PROCESS, iommufd_ioas_change_process,
+ struct iommu_ioas_change_process, __reserved),
IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
src_iova),
IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
struct iommu_ioas_iova_ranges, out_iova_alignment),
IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
iova),
+ IOCTL_OP(IOMMU_IOAS_MAP_FILE, iommufd_ioas_map_file,
+ struct iommu_ioas_map_file, iova),
IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
length),
IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
val64),
IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
__reserved),
+ IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
+ struct iommu_viommu_alloc, out_viommu_id),
+ IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl,
+ struct iommu_vdevice_alloc, virt_id),
#ifdef CONFIG_IOMMUFD_TEST
IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
#endif
@@ -519,6 +504,12 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
[IOMMUFD_OBJ_FAULT] = {
.destroy = iommufd_fault_destroy,
},
+ [IOMMUFD_OBJ_VIOMMU] = {
+ .destroy = iommufd_viommu_destroy,
+ },
+ [IOMMUFD_OBJ_VDEVICE] = {
+ .destroy = iommufd_vdevice_destroy,
+ },
#ifdef CONFIG_IOMMUFD_TEST
[IOMMUFD_OBJ_SELFTEST] = {
.destroy = iommufd_selftest_destroy,
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 93d806c9c073..3427749bc5ce 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -45,6 +45,7 @@
* last_iova + 1 can overflow. An iopt_pages index will always be much less than
* ULONG_MAX so last_index + 1 cannot overflow.
*/
+#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
@@ -346,27 +347,41 @@ static void batch_destroy(struct pfn_batch *batch, void *backup)
kfree(batch->pfns);
}
-/* true if the pfn was added, false otherwise */
-static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn)
+static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn,
+ u32 nr)
{
const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns));
-
- if (batch->end &&
- pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] &&
- batch->npfns[batch->end - 1] != MAX_NPFNS) {
- batch->npfns[batch->end - 1]++;
- batch->total_pfns++;
- return true;
- }
- if (batch->end == batch->array_size)
+ unsigned int end = batch->end;
+
+ if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] &&
+ nr <= MAX_NPFNS - batch->npfns[end - 1]) {
+ batch->npfns[end - 1] += nr;
+ } else if (end < batch->array_size) {
+ batch->pfns[end] = pfn;
+ batch->npfns[end] = nr;
+ batch->end++;
+ } else {
return false;
- batch->total_pfns++;
- batch->pfns[batch->end] = pfn;
- batch->npfns[batch->end] = 1;
- batch->end++;
+ }
+
+ batch->total_pfns += nr;
return true;
}
+static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr)
+{
+ batch->npfns[batch->end - 1] -= nr;
+ if (batch->npfns[batch->end - 1] == 0)
+ batch->end--;
+ batch->total_pfns -= nr;
+}
+
+/* true if the pfn was added, false otherwise */
+static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn)
+{
+ return batch_add_pfn_num(batch, pfn, 1);
+}
+
/*
* Fill the batch with pfns from the domain. When the batch is full, or it
* reaches last_index, the function will return. The caller should use
@@ -622,6 +637,41 @@ static void batch_from_pages(struct pfn_batch *batch, struct page **pages,
break;
}
+static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p,
+ unsigned long *offset_p, unsigned long npages)
+{
+ int rc = 0;
+ struct folio **folios = *folios_p;
+ unsigned long offset = *offset_p;
+
+ while (npages) {
+ struct folio *folio = *folios;
+ unsigned long nr = folio_nr_pages(folio) - offset;
+ unsigned long pfn = page_to_pfn(folio_page(folio, offset));
+
+ nr = min(nr, npages);
+ npages -= nr;
+
+ if (!batch_add_pfn_num(batch, pfn, nr))
+ break;
+ if (nr > 1) {
+ rc = folio_add_pins(folio, nr - 1);
+ if (rc) {
+ batch_remove_pfn_num(batch, nr);
+ goto out;
+ }
+ }
+
+ folios++;
+ offset = 0;
+ }
+
+out:
+ *folios_p = folios;
+ *offset_p = offset;
+ return rc;
+}
+
static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
unsigned int first_page_off, size_t npages)
{
@@ -703,19 +753,32 @@ struct pfn_reader_user {
* neither
*/
int locked;
+
+ /* The following are only valid if file != NULL. */
+ struct file *file;
+ struct folio **ufolios;
+ size_t ufolios_len;
+ unsigned long ufolios_offset;
+ struct folio **ufolios_next;
};
static void pfn_reader_user_init(struct pfn_reader_user *user,
struct iopt_pages *pages)
{
user->upages = NULL;
+ user->upages_len = 0;
user->upages_start = 0;
user->upages_end = 0;
user->locked = -1;
-
user->gup_flags = FOLL_LONGTERM;
if (pages->writable)
user->gup_flags |= FOLL_WRITE;
+
+ user->file = (pages->type == IOPT_ADDRESS_FILE) ? pages->file : NULL;
+ user->ufolios = NULL;
+ user->ufolios_len = 0;
+ user->ufolios_next = NULL;
+ user->ufolios_offset = 0;
}
static void pfn_reader_user_destroy(struct pfn_reader_user *user,
@@ -724,13 +787,67 @@ static void pfn_reader_user_destroy(struct pfn_reader_user *user,
if (user->locked != -1) {
if (user->locked)
mmap_read_unlock(pages->source_mm);
- if (pages->source_mm != current->mm)
+ if (!user->file && pages->source_mm != current->mm)
mmput(pages->source_mm);
user->locked = -1;
}
kfree(user->upages);
user->upages = NULL;
+ kfree(user->ufolios);
+ user->ufolios = NULL;
+}
+
+static long pin_memfd_pages(struct pfn_reader_user *user, unsigned long start,
+ unsigned long npages)
+{
+ unsigned long i;
+ unsigned long offset;
+ unsigned long npages_out = 0;
+ struct page **upages = user->upages;
+ unsigned long end = start + (npages << PAGE_SHIFT) - 1;
+ long nfolios = user->ufolios_len / sizeof(*user->ufolios);
+
+ /*
+ * todo: memfd_pin_folios should return the last pinned offset so
+ * we can compute npages pinned, and avoid looping over folios here
+ * if upages == NULL.
+ */
+ nfolios = memfd_pin_folios(user->file, start, end, user->ufolios,
+ nfolios, &offset);
+ if (nfolios <= 0)
+ return nfolios;
+
+ offset >>= PAGE_SHIFT;
+ user->ufolios_next = user->ufolios;
+ user->ufolios_offset = offset;
+
+ for (i = 0; i < nfolios; i++) {
+ struct folio *folio = user->ufolios[i];
+ unsigned long nr = folio_nr_pages(folio);
+ unsigned long npin = min(nr - offset, npages);
+
+ npages -= npin;
+ npages_out += npin;
+
+ if (upages) {
+ if (npin == 1) {
+ *upages++ = folio_page(folio, offset);
+ } else {
+ int rc = folio_add_pins(folio, npin - 1);
+
+ if (rc)
+ return rc;
+
+ while (npin--)
+ *upages++ = folio_page(folio, offset++);
+ }
+ }
+
+ offset = 0;
+ }
+
+ return npages_out;
}
static int pfn_reader_user_pin(struct pfn_reader_user *user,
@@ -739,7 +856,9 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user,
unsigned long last_index)
{
bool remote_mm = pages->source_mm != current->mm;
- unsigned long npages;
+ unsigned long npages = last_index - start_index + 1;
+ unsigned long start;
+ unsigned long unum;
uintptr_t uptr;
long rc;
@@ -747,40 +866,50 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user,
WARN_ON(last_index < start_index))
return -EINVAL;
- if (!user->upages) {
+ if (!user->file && !user->upages) {
/* All undone in pfn_reader_destroy() */
- user->upages_len =
- (last_index - start_index + 1) * sizeof(*user->upages);
+ user->upages_len = npages * sizeof(*user->upages);
user->upages = temp_kmalloc(&user->upages_len, NULL, 0);
if (!user->upages)
return -ENOMEM;
}
+ if (user->file && !user->ufolios) {
+ user->ufolios_len = npages * sizeof(*user->ufolios);
+ user->ufolios = temp_kmalloc(&user->ufolios_len, NULL, 0);
+ if (!user->ufolios)
+ return -ENOMEM;
+ }
+
if (user->locked == -1) {
/*
* The majority of usages will run the map task within the mm
* providing the pages, so we can optimize into
* get_user_pages_fast()
*/
- if (remote_mm) {
+ if (!user->file && remote_mm) {
if (!mmget_not_zero(pages->source_mm))
return -EFAULT;
}
user->locked = 0;
}
- npages = min_t(unsigned long, last_index - start_index + 1,
- user->upages_len / sizeof(*user->upages));
-
+ unum = user->file ? user->ufolios_len / sizeof(*user->ufolios) :
+ user->upages_len / sizeof(*user->upages);
+ npages = min_t(unsigned long, npages, unum);
if (iommufd_should_fail())
return -EFAULT;
- uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
- if (!remote_mm)
+ if (user->file) {
+ start = pages->start + (start_index * PAGE_SIZE);
+ rc = pin_memfd_pages(user, start, npages);
+ } else if (!remote_mm) {
+ uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
rc = pin_user_pages_fast(uptr, npages, user->gup_flags,
user->upages);
- else {
+ } else {
+ uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
if (!user->locked) {
mmap_read_lock(pages->source_mm);
user->locked = 1;
@@ -838,7 +967,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages,
mmap_read_unlock(pages->source_mm);
user->locked = 0;
/* If we had the lock then we also have a get */
- } else if ((!user || !user->upages) &&
+
+ } else if ((!user || (!user->upages && !user->ufolios)) &&
pages->source_mm != current->mm) {
if (!mmget_not_zero(pages->source_mm))
return -EINVAL;
@@ -855,8 +985,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages,
return rc;
}
-static int do_update_pinned(struct iopt_pages *pages, unsigned long npages,
- bool inc, struct pfn_reader_user *user)
+int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages,
+ bool inc, struct pfn_reader_user *user)
{
int rc = 0;
@@ -890,8 +1020,8 @@ static void update_unpinned(struct iopt_pages *pages)
return;
if (pages->npinned == pages->last_npinned)
return;
- do_update_pinned(pages, pages->last_npinned - pages->npinned, false,
- NULL);
+ iopt_pages_update_pinned(pages, pages->last_npinned - pages->npinned,
+ false, NULL);
}
/*
@@ -921,7 +1051,7 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user,
npages = pages->npinned - pages->last_npinned;
inc = true;
}
- return do_update_pinned(pages, npages, inc, user);
+ return iopt_pages_update_pinned(pages, npages, inc, user);
}
/*
@@ -978,6 +1108,8 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns)
{
struct interval_tree_double_span_iter *span = &pfns->span;
unsigned long start_index = pfns->batch_end_index;
+ struct pfn_reader_user *user = &pfns->user;
+ unsigned long npages;
struct iopt_area *area;
int rc;
@@ -1015,11 +1147,17 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns)
return rc;
}
- batch_from_pages(&pfns->batch,
- pfns->user.upages +
- (start_index - pfns->user.upages_start),
- pfns->user.upages_end - start_index);
- return 0;
+ npages = user->upages_end - start_index;
+ start_index -= user->upages_start;
+ rc = 0;
+
+ if (!user->file)
+ batch_from_pages(&pfns->batch, user->upages + start_index,
+ npages);
+ else
+ rc = batch_from_folios(&pfns->batch, &user->ufolios_next,
+ &user->ufolios_offset, npages);
+ return rc;
}
static bool pfn_reader_done(struct pfn_reader *pfns)
@@ -1092,16 +1230,25 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
static void pfn_reader_release_pins(struct pfn_reader *pfns)
{
struct iopt_pages *pages = pfns->pages;
+ struct pfn_reader_user *user = &pfns->user;
- if (pfns->user.upages_end > pfns->batch_end_index) {
- size_t npages = pfns->user.upages_end - pfns->batch_end_index;
-
+ if (user->upages_end > pfns->batch_end_index) {
/* Any pages not transferred to the batch are just unpinned */
- unpin_user_pages(pfns->user.upages + (pfns->batch_end_index -
- pfns->user.upages_start),
- npages);
+
+ unsigned long npages = user->upages_end - pfns->batch_end_index;
+ unsigned long start_index = pfns->batch_end_index -
+ user->upages_start;
+
+ if (!user->file) {
+ unpin_user_pages(user->upages + start_index, npages);
+ } else {
+ long n = user->ufolios_len / sizeof(*user->ufolios);
+
+ unpin_folios(user->ufolios_next,
+ user->ufolios + n - user->ufolios_next);
+ }
iopt_pages_sub_npinned(pages, npages);
- pfns->user.upages_end = pfns->batch_end_index;
+ user->upages_end = pfns->batch_end_index;
}
if (pfns->batch_start_index != pfns->batch_end_index) {
pfn_reader_unpin(pfns);
@@ -1139,11 +1286,11 @@ static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages,
return 0;
}
-struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
- bool writable)
+static struct iopt_pages *iopt_alloc_pages(unsigned long start_byte,
+ unsigned long length,
+ bool writable)
{
struct iopt_pages *pages;
- unsigned long end;
/*
* The iommu API uses size_t as the length, and protect the DIV_ROUND_UP
@@ -1152,9 +1299,6 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
if (length > SIZE_MAX - PAGE_SIZE || length == 0)
return ERR_PTR(-EINVAL);
- if (check_add_overflow((unsigned long)uptr, length, &end))
- return ERR_PTR(-EOVERFLOW);
-
pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT);
if (!pages)
return ERR_PTR(-ENOMEM);
@@ -1164,8 +1308,7 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
mutex_init(&pages->mutex);
pages->source_mm = current->mm;
mmgrab(pages->source_mm);
- pages->uptr = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE);
- pages->npages = DIV_ROUND_UP(length + (uptr - pages->uptr), PAGE_SIZE);
+ pages->npages = DIV_ROUND_UP(length + start_byte, PAGE_SIZE);
pages->access_itree = RB_ROOT_CACHED;
pages->domains_itree = RB_ROOT_CACHED;
pages->writable = writable;
@@ -1179,6 +1322,45 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
return pages;
}
+struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
+ unsigned long length, bool writable)
+{
+ struct iopt_pages *pages;
+ unsigned long end;
+ void __user *uptr_down =
+ (void __user *) ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE);
+
+ if (check_add_overflow((unsigned long)uptr, length, &end))
+ return ERR_PTR(-EOVERFLOW);
+
+ pages = iopt_alloc_pages(uptr - uptr_down, length, writable);
+ if (IS_ERR(pages))
+ return pages;
+ pages->uptr = uptr_down;
+ pages->type = IOPT_ADDRESS_USER;
+ return pages;
+}
+
+struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start,
+ unsigned long length, bool writable)
+
+{
+ struct iopt_pages *pages;
+ unsigned long start_down = ALIGN_DOWN(start, PAGE_SIZE);
+ unsigned long end;
+
+ if (length && check_add_overflow(start, length - 1, &end))
+ return ERR_PTR(-EOVERFLOW);
+
+ pages = iopt_alloc_pages(start - start_down, length, writable);
+ if (IS_ERR(pages))
+ return pages;
+ pages->file = get_file(file);
+ pages->start = start_down;
+ pages->type = IOPT_ADDRESS_FILE;
+ return pages;
+}
+
void iopt_release_pages(struct kref *kref)
{
struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref);
@@ -1191,6 +1373,8 @@ void iopt_release_pages(struct kref *kref)
mutex_destroy(&pages->mutex);
put_task_struct(pages->source_task);
free_uid(pages->source_user);
+ if (pages->type == IOPT_ADDRESS_FILE)
+ fput(pages->file);
kfree(pages);
}
@@ -1630,11 +1814,11 @@ static int iopt_pages_fill_from_domain(struct iopt_pages *pages,
return 0;
}
-static int iopt_pages_fill_from_mm(struct iopt_pages *pages,
- struct pfn_reader_user *user,
- unsigned long start_index,
- unsigned long last_index,
- struct page **out_pages)
+static int iopt_pages_fill(struct iopt_pages *pages,
+ struct pfn_reader_user *user,
+ unsigned long start_index,
+ unsigned long last_index,
+ struct page **out_pages)
{
unsigned long cur_index = start_index;
int rc;
@@ -1708,8 +1892,8 @@ int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index,
/* hole */
cur_pages = out_pages + (span.start_hole - start_index);
- rc = iopt_pages_fill_from_mm(pages, &user, span.start_hole,
- span.last_hole, cur_pages);
+ rc = iopt_pages_fill(pages, &user, span.start_hole,
+ span.last_hole, cur_pages);
if (rc)
goto out_clean_xa;
rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole,
@@ -1789,6 +1973,10 @@ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index,
struct page *page = NULL;
int rc;
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(pages->type != IOPT_ADDRESS_USER))
+ return -EINVAL;
+
if (!mmget_not_zero(pages->source_mm))
return iopt_pages_rw_slow(pages, index, index, offset, data,
length, flags);
@@ -1844,6 +2032,15 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable)
return -EPERM;
+ if (pages->type == IOPT_ADDRESS_FILE)
+ return iopt_pages_rw_slow(pages, start_index, last_index,
+ start_byte % PAGE_SIZE, data, length,
+ flags);
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(pages->type != IOPT_ADDRESS_USER))
+ return -EINVAL;
+
if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) {
if (start_index == last_index)
return iopt_pages_rw_page(pages, start_index,
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 540437be168a..2f9de177dffc 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -126,12 +126,35 @@ struct mock_iommu_domain {
struct xarray pfns;
};
+static inline struct mock_iommu_domain *
+to_mock_domain(struct iommu_domain *domain)
+{
+ return container_of(domain, struct mock_iommu_domain, domain);
+}
+
struct mock_iommu_domain_nested {
struct iommu_domain domain;
+ struct mock_viommu *mock_viommu;
struct mock_iommu_domain *parent;
u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM];
};
+static inline struct mock_iommu_domain_nested *
+to_mock_nested(struct iommu_domain *domain)
+{
+ return container_of(domain, struct mock_iommu_domain_nested, domain);
+}
+
+struct mock_viommu {
+ struct iommufd_viommu core;
+ struct mock_iommu_domain *s2_parent;
+};
+
+static inline struct mock_viommu *to_mock_viommu(struct iommufd_viommu *viommu)
+{
+ return container_of(viommu, struct mock_viommu, core);
+}
+
enum selftest_obj_type {
TYPE_IDEV,
};
@@ -140,8 +163,14 @@ struct mock_dev {
struct device dev;
unsigned long flags;
int id;
+ u32 cache[MOCK_DEV_CACHE_NUM];
};
+static inline struct mock_dev *to_mock_dev(struct device *dev)
+{
+ return container_of(dev, struct mock_dev, dev);
+}
+
struct selftest_obj {
struct iommufd_object obj;
enum selftest_obj_type type;
@@ -155,10 +184,15 @@ struct selftest_obj {
};
};
+static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj)
+{
+ return container_of(obj, struct selftest_obj, obj);
+}
+
static int mock_domain_nop_attach(struct iommu_domain *domain,
struct device *dev)
{
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+ struct mock_dev *mdev = to_mock_dev(dev);
if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY))
return -EINVAL;
@@ -193,8 +227,7 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type)
static int mock_domain_set_dirty_tracking(struct iommu_domain *domain,
bool enable)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
+ struct mock_iommu_domain *mock = to_mock_domain(domain);
unsigned long flags = mock->flags;
if (enable && !domain->dirty_ops)
@@ -243,8 +276,7 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain,
unsigned long flags,
struct iommu_dirty_bitmap *dirty)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
+ struct mock_iommu_domain *mock = to_mock_domain(domain);
unsigned long end = iova + size;
void *ent;
@@ -281,7 +313,7 @@ static const struct iommu_dirty_ops dirty_ops = {
static struct iommu_domain *mock_domain_alloc_paging(struct device *dev)
{
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+ struct mock_dev *mdev = to_mock_dev(dev);
struct mock_iommu_domain *mock;
mock = kzalloc(sizeof(*mock), GFP_KERNEL);
@@ -298,76 +330,85 @@ static struct iommu_domain *mock_domain_alloc_paging(struct device *dev)
return &mock->domain;
}
-static struct iommu_domain *
-__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent,
- const struct iommu_hwpt_selftest *user_cfg)
+static struct mock_iommu_domain_nested *
+__mock_domain_alloc_nested(const struct iommu_user_data *user_data)
{
struct mock_iommu_domain_nested *mock_nested;
- int i;
+ struct iommu_hwpt_selftest user_cfg;
+ int rc, i;
+
+ if (user_data->type != IOMMU_HWPT_DATA_SELFTEST)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ rc = iommu_copy_struct_from_user(&user_cfg, user_data,
+ IOMMU_HWPT_DATA_SELFTEST, iotlb);
+ if (rc)
+ return ERR_PTR(rc);
mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL);
if (!mock_nested)
return ERR_PTR(-ENOMEM);
- mock_nested->parent = mock_parent;
mock_nested->domain.ops = &domain_nested_ops;
mock_nested->domain.type = IOMMU_DOMAIN_NESTED;
for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++)
- mock_nested->iotlb[i] = user_cfg->iotlb;
- return &mock_nested->domain;
+ mock_nested->iotlb[i] = user_cfg.iotlb;
+ return mock_nested;
}
static struct iommu_domain *
-mock_domain_alloc_user(struct device *dev, u32 flags,
- struct iommu_domain *parent,
- const struct iommu_user_data *user_data)
+mock_domain_alloc_nested(struct iommu_domain *parent, u32 flags,
+ const struct iommu_user_data *user_data)
{
+ struct mock_iommu_domain_nested *mock_nested;
struct mock_iommu_domain *mock_parent;
- struct iommu_hwpt_selftest user_cfg;
- int rc;
-
- /* must be mock_domain */
- if (!parent) {
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
- bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
- bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
- struct iommu_domain *domain;
-
- if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT |
- IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
- return ERR_PTR(-EOPNOTSUPP);
- if (user_data || (has_dirty_flag && no_dirty_ops))
- return ERR_PTR(-EOPNOTSUPP);
- domain = mock_domain_alloc_paging(dev);
- if (!domain)
- return ERR_PTR(-ENOMEM);
- if (has_dirty_flag)
- container_of(domain, struct mock_iommu_domain, domain)
- ->domain.dirty_ops = &dirty_ops;
- return domain;
- }
- /* must be mock_domain_nested */
- if (user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags)
+ if (flags)
return ERR_PTR(-EOPNOTSUPP);
if (!parent || parent->ops != mock_ops.default_domain_ops)
return ERR_PTR(-EINVAL);
- mock_parent = container_of(parent, struct mock_iommu_domain, domain);
+ mock_parent = to_mock_domain(parent);
if (!mock_parent)
return ERR_PTR(-EINVAL);
- rc = iommu_copy_struct_from_user(&user_cfg, user_data,
- IOMMU_HWPT_DATA_SELFTEST, iotlb);
- if (rc)
- return ERR_PTR(rc);
+ mock_nested = __mock_domain_alloc_nested(user_data);
+ if (IS_ERR(mock_nested))
+ return ERR_CAST(mock_nested);
+ mock_nested->parent = mock_parent;
+ return &mock_nested->domain;
+}
+
+static struct iommu_domain *
+mock_domain_alloc_user(struct device *dev, u32 flags,
+ struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+{
+ bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_ALLOC_NEST_PARENT;
+ bool no_dirty_ops = to_mock_dev(dev)->flags &
+ MOCK_FLAGS_DEVICE_NO_DIRTY;
+ struct iommu_domain *domain;
- return __mock_domain_alloc_nested(mock_parent, &user_cfg);
+ if (parent)
+ return mock_domain_alloc_nested(parent, flags, user_data);
+
+ if (user_data)
+ return ERR_PTR(-EOPNOTSUPP);
+ if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ domain = mock_domain_alloc_paging(dev);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+ if (has_dirty_flag)
+ domain->dirty_ops = &dirty_ops;
+ return domain;
}
static void mock_domain_free(struct iommu_domain *domain)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
+ struct mock_iommu_domain *mock = to_mock_domain(domain);
WARN_ON(!xa_empty(&mock->pfns));
kfree(mock);
@@ -378,8 +419,7 @@ static int mock_domain_map_pages(struct iommu_domain *domain,
size_t pgsize, size_t pgcount, int prot,
gfp_t gfp, size_t *mapped)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
+ struct mock_iommu_domain *mock = to_mock_domain(domain);
unsigned long flags = MOCK_PFN_START_IOVA;
unsigned long start_iova = iova;
@@ -430,8 +470,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
size_t pgcount,
struct iommu_iotlb_gather *iotlb_gather)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
+ struct mock_iommu_domain *mock = to_mock_domain(domain);
bool first = true;
size_t ret = 0;
void *ent;
@@ -479,8 +518,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
dma_addr_t iova)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
+ struct mock_iommu_domain *mock = to_mock_domain(domain);
void *ent;
WARN_ON(iova % MOCK_IO_PAGE_SIZE);
@@ -491,7 +529,7 @@ static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
{
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+ struct mock_dev *mdev = to_mock_dev(dev);
switch (cap) {
case IOMMU_CAP_CACHE_COHERENCY:
@@ -507,14 +545,17 @@ static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
static struct iopf_queue *mock_iommu_iopf_queue;
-static struct iommu_device mock_iommu_device = {
-};
+static struct mock_iommu_device {
+ struct iommu_device iommu_dev;
+ struct completion complete;
+ refcount_t users;
+} mock_iommu;
static struct iommu_device *mock_probe_device(struct device *dev)
{
if (dev->bus != &iommufd_mock_bus_type.bus)
return ERR_PTR(-ENODEV);
- return &mock_iommu_device;
+ return &mock_iommu.iommu_dev;
}
static void mock_domain_page_response(struct device *dev, struct iopf_fault *evt,
@@ -540,6 +581,132 @@ static int mock_dev_disable_feat(struct device *dev, enum iommu_dev_features fea
return 0;
}
+static void mock_viommu_destroy(struct iommufd_viommu *viommu)
+{
+ struct mock_iommu_device *mock_iommu = container_of(
+ viommu->iommu_dev, struct mock_iommu_device, iommu_dev);
+
+ if (refcount_dec_and_test(&mock_iommu->users))
+ complete(&mock_iommu->complete);
+
+ /* iommufd core frees mock_viommu and viommu */
+}
+
+static struct iommu_domain *
+mock_viommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
+ const struct iommu_user_data *user_data)
+{
+ struct mock_viommu *mock_viommu = to_mock_viommu(viommu);
+ struct mock_iommu_domain_nested *mock_nested;
+
+ if (flags & ~IOMMU_HWPT_FAULT_ID_VALID)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ mock_nested = __mock_domain_alloc_nested(user_data);
+ if (IS_ERR(mock_nested))
+ return ERR_CAST(mock_nested);
+ mock_nested->mock_viommu = mock_viommu;
+ mock_nested->parent = mock_viommu->s2_parent;
+ return &mock_nested->domain;
+}
+
+static int mock_viommu_cache_invalidate(struct iommufd_viommu *viommu,
+ struct iommu_user_data_array *array)
+{
+ struct iommu_viommu_invalidate_selftest *cmds;
+ struct iommu_viommu_invalidate_selftest *cur;
+ struct iommu_viommu_invalidate_selftest *end;
+ int rc;
+
+ /* A zero-length array is allowed to validate the array type */
+ if (array->entry_num == 0 &&
+ array->type == IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST) {
+ array->entry_num = 0;
+ return 0;
+ }
+
+ cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL);
+ if (!cmds)
+ return -ENOMEM;
+ cur = cmds;
+ end = cmds + array->entry_num;
+
+ static_assert(sizeof(*cmds) == 3 * sizeof(u32));
+ rc = iommu_copy_struct_from_full_user_array(
+ cmds, sizeof(*cmds), array,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST);
+ if (rc)
+ goto out;
+
+ while (cur != end) {
+ struct mock_dev *mdev;
+ struct device *dev;
+ int i;
+
+ if (cur->flags & ~IOMMU_TEST_INVALIDATE_FLAG_ALL) {
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (cur->cache_id > MOCK_DEV_CACHE_ID_MAX) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ xa_lock(&viommu->vdevs);
+ dev = iommufd_viommu_find_dev(viommu,
+ (unsigned long)cur->vdev_id);
+ if (!dev) {
+ xa_unlock(&viommu->vdevs);
+ rc = -EINVAL;
+ goto out;
+ }
+ mdev = container_of(dev, struct mock_dev, dev);
+
+ if (cur->flags & IOMMU_TEST_INVALIDATE_FLAG_ALL) {
+ /* Invalidate all cache entries and ignore cache_id */
+ for (i = 0; i < MOCK_DEV_CACHE_NUM; i++)
+ mdev->cache[i] = 0;
+ } else {
+ mdev->cache[cur->cache_id] = 0;
+ }
+ xa_unlock(&viommu->vdevs);
+
+ cur++;
+ }
+out:
+ array->entry_num = cur - cmds;
+ kfree(cmds);
+ return rc;
+}
+
+static struct iommufd_viommu_ops mock_viommu_ops = {
+ .destroy = mock_viommu_destroy,
+ .alloc_domain_nested = mock_viommu_alloc_domain_nested,
+ .cache_invalidate = mock_viommu_cache_invalidate,
+};
+
+static struct iommufd_viommu *mock_viommu_alloc(struct device *dev,
+ struct iommu_domain *domain,
+ struct iommufd_ctx *ictx,
+ unsigned int viommu_type)
+{
+ struct mock_iommu_device *mock_iommu =
+ iommu_get_iommu_dev(dev, struct mock_iommu_device, iommu_dev);
+ struct mock_viommu *mock_viommu;
+
+ if (viommu_type != IOMMU_VIOMMU_TYPE_SELFTEST)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ mock_viommu = iommufd_viommu_alloc(ictx, struct mock_viommu, core,
+ &mock_viommu_ops);
+ if (IS_ERR(mock_viommu))
+ return ERR_CAST(mock_viommu);
+
+ refcount_inc(&mock_iommu->users);
+ return &mock_viommu->core;
+}
+
static const struct iommu_ops mock_ops = {
/*
* IOMMU_DOMAIN_BLOCKED cannot be returned from def_domain_type()
@@ -559,6 +726,7 @@ static const struct iommu_ops mock_ops = {
.dev_enable_feat = mock_dev_enable_feat,
.dev_disable_feat = mock_dev_disable_feat,
.user_pasid_table = true,
+ .viommu_alloc = mock_viommu_alloc,
.default_domain_ops =
&(struct iommu_domain_ops){
.free = mock_domain_free,
@@ -571,18 +739,14 @@ static const struct iommu_ops mock_ops = {
static void mock_domain_free_nested(struct iommu_domain *domain)
{
- struct mock_iommu_domain_nested *mock_nested =
- container_of(domain, struct mock_iommu_domain_nested, domain);
-
- kfree(mock_nested);
+ kfree(to_mock_nested(domain));
}
static int
mock_domain_cache_invalidate_user(struct iommu_domain *domain,
struct iommu_user_data_array *array)
{
- struct mock_iommu_domain_nested *mock_nested =
- container_of(domain, struct mock_iommu_domain_nested, domain);
+ struct mock_iommu_domain_nested *mock_nested = to_mock_nested(domain);
struct iommu_hwpt_invalidate_selftest inv;
u32 processed = 0;
int i = 0, j;
@@ -657,7 +821,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
iommufd_put_object(ucmd->ictx, &hwpt->obj);
return ERR_PTR(-EINVAL);
}
- *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain);
+ *mock = to_mock_domain(hwpt->domain);
return hwpt;
}
@@ -675,14 +839,13 @@ get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id,
iommufd_put_object(ucmd->ictx, &hwpt->obj);
return ERR_PTR(-EINVAL);
}
- *mock_nested = container_of(hwpt->domain,
- struct mock_iommu_domain_nested, domain);
+ *mock_nested = to_mock_nested(hwpt->domain);
return hwpt;
}
static void mock_dev_release(struct device *dev)
{
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+ struct mock_dev *mdev = to_mock_dev(dev);
ida_free(&mock_dev_ida, mdev->id);
kfree(mdev);
@@ -691,7 +854,7 @@ static void mock_dev_release(struct device *dev)
static struct mock_dev *mock_dev_create(unsigned long dev_flags)
{
struct mock_dev *mdev;
- int rc;
+ int rc, i;
if (dev_flags &
~(MOCK_FLAGS_DEVICE_NO_DIRTY | MOCK_FLAGS_DEVICE_HUGE_IOVA))
@@ -705,6 +868,8 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags)
mdev->flags = dev_flags;
mdev->dev.release = mock_dev_release;
mdev->dev.bus = &iommufd_mock_bus_type.bus;
+ for (i = 0; i < MOCK_DEV_CACHE_NUM; i++)
+ mdev->cache[i] = IOMMU_TEST_DEV_CACHE_DEFAULT;
rc = ida_alloc(&mock_dev_ida, GFP_KERNEL);
if (rc < 0)
@@ -813,7 +978,7 @@ static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd,
if (IS_ERR(dev_obj))
return PTR_ERR(dev_obj);
- sobj = container_of(dev_obj, struct selftest_obj, obj);
+ sobj = to_selftest_obj(dev_obj);
if (sobj->type != TYPE_IDEV) {
rc = -EINVAL;
goto out_dev_obj;
@@ -951,8 +1116,7 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd,
if (IS_ERR(hwpt))
return PTR_ERR(hwpt);
- mock_nested = container_of(hwpt->domain,
- struct mock_iommu_domain_nested, domain);
+ mock_nested = to_mock_nested(hwpt->domain);
if (iotlb_id > MOCK_NESTED_DOMAIN_IOTLB_ID_MAX ||
mock_nested->iotlb[iotlb_id] != iotlb)
@@ -961,6 +1125,24 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd,
return rc;
}
+static int iommufd_test_dev_check_cache(struct iommufd_ucmd *ucmd, u32 idev_id,
+ unsigned int cache_id, u32 cache)
+{
+ struct iommufd_device *idev;
+ struct mock_dev *mdev;
+ int rc = 0;
+
+ idev = iommufd_get_device(ucmd, idev_id);
+ if (IS_ERR(idev))
+ return PTR_ERR(idev);
+ mdev = container_of(idev->dev, struct mock_dev, dev);
+
+ if (cache_id > MOCK_DEV_CACHE_ID_MAX || mdev->cache[cache_id] != cache)
+ rc = -EINVAL;
+ iommufd_put_object(ucmd->ictx, &idev->obj);
+ return rc;
+}
+
struct selftest_access {
struct iommufd_access *access;
struct file *file;
@@ -1431,7 +1613,7 @@ static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd,
void iommufd_selftest_destroy(struct iommufd_object *obj)
{
- struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj);
+ struct selftest_obj *sobj = to_selftest_obj(obj);
switch (sobj->type) {
case TYPE_IDEV:
@@ -1470,6 +1652,10 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
return iommufd_test_md_check_iotlb(ucmd, cmd->id,
cmd->check_iotlb.id,
cmd->check_iotlb.iotlb);
+ case IOMMU_TEST_OP_DEV_CHECK_CACHE:
+ return iommufd_test_dev_check_cache(ucmd, cmd->id,
+ cmd->check_dev_cache.id,
+ cmd->check_dev_cache.cache);
case IOMMU_TEST_OP_CREATE_ACCESS:
return iommufd_test_create_access(ucmd, cmd->id,
cmd->create_access.flags);
@@ -1536,24 +1722,27 @@ int __init iommufd_test_init(void)
if (rc)
goto err_platform;
- rc = iommu_device_sysfs_add(&mock_iommu_device,
+ rc = iommu_device_sysfs_add(&mock_iommu.iommu_dev,
&selftest_iommu_dev->dev, NULL, "%s",
dev_name(&selftest_iommu_dev->dev));
if (rc)
goto err_bus;
- rc = iommu_device_register_bus(&mock_iommu_device, &mock_ops,
+ rc = iommu_device_register_bus(&mock_iommu.iommu_dev, &mock_ops,
&iommufd_mock_bus_type.bus,
&iommufd_mock_bus_type.nb);
if (rc)
goto err_sysfs;
+ refcount_set(&mock_iommu.users, 1);
+ init_completion(&mock_iommu.complete);
+
mock_iommu_iopf_queue = iopf_queue_alloc("mock-iopfq");
return 0;
err_sysfs:
- iommu_device_sysfs_remove(&mock_iommu_device);
+ iommu_device_sysfs_remove(&mock_iommu.iommu_dev);
err_bus:
bus_unregister(&iommufd_mock_bus_type.bus);
err_platform:
@@ -1563,6 +1752,22 @@ err_dbgfs:
return rc;
}
+static void iommufd_test_wait_for_users(void)
+{
+ if (refcount_dec_and_test(&mock_iommu.users))
+ return;
+ /*
+ * Time out waiting for iommu device user count to become 0.
+ *
+ * Note that this is just making an example here, since the selftest is
+ * built into the iommufd module, i.e. it only unplugs the iommu device
+ * when unloading the module. So, it is expected that this WARN_ON will
+ * not trigger, as long as any iommufd FDs are open.
+ */
+ WARN_ON(!wait_for_completion_timeout(&mock_iommu.complete,
+ msecs_to_jiffies(10000)));
+}
+
void iommufd_test_exit(void)
{
if (mock_iommu_iopf_queue) {
@@ -1570,8 +1775,9 @@ void iommufd_test_exit(void)
mock_iommu_iopf_queue = NULL;
}
- iommu_device_sysfs_remove(&mock_iommu_device);
- iommu_device_unregister_bus(&mock_iommu_device,
+ iommufd_test_wait_for_users();
+ iommu_device_sysfs_remove(&mock_iommu.iommu_dev);
+ iommu_device_unregister_bus(&mock_iommu.iommu_dev,
&iommufd_mock_bus_type.bus,
&iommufd_mock_bus_type.nb);
bus_unregister(&iommufd_mock_bus_type.bus);
diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c
index a3ad5f0b6c59..514aacd64009 100644
--- a/drivers/iommu/iommufd/vfio_compat.c
+++ b/drivers/iommu/iommufd/vfio_compat.c
@@ -291,12 +291,7 @@ static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
case VFIO_DMA_CC_IOMMU:
return iommufd_vfio_cc_iommu(ictx);
- /*
- * This is obsolete, and to be removed from VFIO. It was an incomplete
- * idea that got merged.
- * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
- */
- case VFIO_TYPE1_NESTING_IOMMU:
+ case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
return 0;
/*
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
new file mode 100644
index 000000000000..69b88e8c7c26
--- /dev/null
+++ b/drivers/iommu/iommufd/viommu.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "iommufd_private.h"
+
+void iommufd_viommu_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_viommu *viommu =
+ container_of(obj, struct iommufd_viommu, obj);
+
+ if (viommu->ops && viommu->ops->destroy)
+ viommu->ops->destroy(viommu);
+ refcount_dec(&viommu->hwpt->common.obj.users);
+ xa_destroy(&viommu->vdevs);
+}
+
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_viommu_alloc *cmd = ucmd->cmd;
+ struct iommufd_hwpt_paging *hwpt_paging;
+ struct iommufd_viommu *viommu;
+ struct iommufd_device *idev;
+ const struct iommu_ops *ops;
+ int rc;
+
+ if (cmd->flags || cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT)
+ return -EOPNOTSUPP;
+
+ idev = iommufd_get_device(ucmd, cmd->dev_id);
+ if (IS_ERR(idev))
+ return PTR_ERR(idev);
+
+ ops = dev_iommu_ops(idev->dev);
+ if (!ops->viommu_alloc) {
+ rc = -EOPNOTSUPP;
+ goto out_put_idev;
+ }
+
+ hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+ if (IS_ERR(hwpt_paging)) {
+ rc = PTR_ERR(hwpt_paging);
+ goto out_put_idev;
+ }
+
+ if (!hwpt_paging->nest_parent) {
+ rc = -EINVAL;
+ goto out_put_hwpt;
+ }
+
+ viommu = ops->viommu_alloc(idev->dev, hwpt_paging->common.domain,
+ ucmd->ictx, cmd->type);
+ if (IS_ERR(viommu)) {
+ rc = PTR_ERR(viommu);
+ goto out_put_hwpt;
+ }
+
+ xa_init(&viommu->vdevs);
+ viommu->type = cmd->type;
+ viommu->ictx = ucmd->ictx;
+ viommu->hwpt = hwpt_paging;
+ refcount_inc(&viommu->hwpt->common.obj.users);
+ /*
+ * It is the most likely case that a physical IOMMU is unpluggable. A
+ * pluggable IOMMU instance (if exists) is responsible for refcounting
+ * on its own.
+ */
+ viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev);
+
+ cmd->out_viommu_id = viommu->obj.id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_abort;
+ iommufd_object_finalize(ucmd->ictx, &viommu->obj);
+ goto out_put_hwpt;
+
+out_abort:
+ iommufd_object_abort_and_destroy(ucmd->ictx, &viommu->obj);
+out_put_hwpt:
+ iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj);
+out_put_idev:
+ iommufd_put_object(ucmd->ictx, &idev->obj);
+ return rc;
+}
+
+void iommufd_vdevice_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_vdevice *vdev =
+ container_of(obj, struct iommufd_vdevice, obj);
+ struct iommufd_viommu *viommu = vdev->viommu;
+
+ /* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
+ xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL);
+ refcount_dec(&viommu->obj.users);
+ put_device(vdev->dev);
+}
+
+int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_vdevice_alloc *cmd = ucmd->cmd;
+ struct iommufd_vdevice *vdev, *curr;
+ struct iommufd_viommu *viommu;
+ struct iommufd_device *idev;
+ u64 virt_id = cmd->virt_id;
+ int rc = 0;
+
+ /* virt_id indexes an xarray */
+ if (virt_id > ULONG_MAX)
+ return -EINVAL;
+
+ viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
+ if (IS_ERR(viommu))
+ return PTR_ERR(viommu);
+
+ idev = iommufd_get_device(ucmd, cmd->dev_id);
+ if (IS_ERR(idev)) {
+ rc = PTR_ERR(idev);
+ goto out_put_viommu;
+ }
+
+ if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) {
+ rc = -EINVAL;
+ goto out_put_idev;
+ }
+
+ vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE);
+ if (IS_ERR(vdev)) {
+ rc = PTR_ERR(vdev);
+ goto out_put_idev;
+ }
+
+ vdev->id = virt_id;
+ vdev->dev = idev->dev;
+ get_device(idev->dev);
+ vdev->viommu = viommu;
+ refcount_inc(&viommu->obj.users);
+
+ curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL);
+ if (curr) {
+ rc = xa_err(curr) ?: -EEXIST;
+ goto out_abort;
+ }
+
+ cmd->out_vdevice_id = vdev->obj.id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_abort;
+ iommufd_object_finalize(ucmd->ictx, &vdev->obj);
+ goto out_put_idev;
+
+out_abort:
+ iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
+out_put_idev:
+ iommufd_put_object(ucmd->ictx, &idev->obj);
+out_put_viommu:
+ iommufd_put_object(ucmd->ictx, &viommu->obj);
+ return rc;
+}
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index bf391b40e576..50ebc9593c9d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,7 +72,6 @@ struct vfio_iommu {
uint64_t pgsize_bitmap;
uint64_t num_non_pinned_groups;
bool v2;
- bool nesting;
bool dirty_page_tracking;
struct list_head emulated_iommu_groups;
};
@@ -2195,12 +2194,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
goto out_free_domain;
}
- if (iommu->nesting) {
- ret = iommu_enable_nesting(domain->domain);
- if (ret)
- goto out_domain;
- }
-
ret = iommu_attach_group(domain->domain, group->iommu_group);
if (ret)
goto out_domain;
@@ -2541,9 +2534,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
switch (arg) {
case VFIO_TYPE1_IOMMU:
break;
- case VFIO_TYPE1_NESTING_IOMMU:
- iommu->nesting = true;
- fallthrough;
+ case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
case VFIO_TYPE1v2_IOMMU:
iommu->v2 = true;
break;
@@ -2638,7 +2629,6 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
switch (arg) {
case VFIO_TYPE1_IOMMU:
case VFIO_TYPE1v2_IOMMU:
- case VFIO_TYPE1_NESTING_IOMMU:
case VFIO_UNMAP_ALL:
return 1;
case VFIO_UPDATE_VADDR:
diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index d3858eebc255..2e917a8f8bca 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -453,7 +453,7 @@ struct acpi_table_ccel {
* IORT - IO Remapping Table
*
* Conforms to "IO Remapping Table System Software on ARM Platforms",
- * Document number: ARM DEN 0049E.e, Sep 2022
+ * Document number: ARM DEN 0049E.f, Apr 2024
*
******************************************************************************/
@@ -524,6 +524,7 @@ struct acpi_iort_memory_access {
#define ACPI_IORT_MF_COHERENCY (1)
#define ACPI_IORT_MF_ATTRIBUTES (1<<1)
+#define ACPI_IORT_MF_CANWBS (1<<2)
/*
* IORT node specific subtables
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index b1ecfc3cd5bc..ce86b09ae80f 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -87,6 +87,7 @@ struct io_pgtable_cfg {
* attributes set in the TCR for a non-coherent page-table walker.
*
* IO_PGTABLE_QUIRK_ARM_HD: Enables dirty tracking in stage 1 pagetable.
+ * IO_PGTABLE_QUIRK_ARM_S2FWB: Use the FWB format for the MemAttrs bits
*/
#define IO_PGTABLE_QUIRK_ARM_NS BIT(0)
#define IO_PGTABLE_QUIRK_NO_PERMS BIT(1)
@@ -95,6 +96,7 @@ struct io_pgtable_cfg {
#define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5)
#define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6)
#define IO_PGTABLE_QUIRK_ARM_HD BIT(7)
+ #define IO_PGTABLE_QUIRK_ARM_S2FWB BIT(8)
unsigned long quirks;
unsigned long pgsize_bitmap;
unsigned int ias;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index bd722f473635..8b02adbd14f7 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -42,6 +42,8 @@ struct notifier_block;
struct iommu_sva;
struct iommu_dma_cookie;
struct iommu_fault_param;
+struct iommufd_ctx;
+struct iommufd_viommu;
#define IOMMU_FAULT_PERM_READ (1 << 0) /* read */
#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */
@@ -491,7 +493,9 @@ static inline int __iommu_copy_struct_from_user_array(
* @index: Index to the location in the array to copy user data from
* @min_last: The last member of the data structure @kdst points in the
* initial version.
- * Return 0 for success, otherwise -error.
+ *
+ * Copy a single entry from a user array. Return 0 for success, otherwise
+ * -error.
*/
#define iommu_copy_struct_from_user_array(kdst, user_array, data_type, index, \
min_last) \
@@ -500,6 +504,50 @@ static inline int __iommu_copy_struct_from_user_array(
offsetofend(typeof(*(kdst)), min_last))
/**
+ * iommu_copy_struct_from_full_user_array - Copy iommu driver specific user
+ * space data from an iommu_user_data_array
+ * @kdst: Pointer to an iommu driver specific user data that is defined in
+ * include/uapi/linux/iommufd.h
+ * @kdst_entry_size: sizeof(*kdst)
+ * @user_array: Pointer to a struct iommu_user_data_array for a user space
+ * array
+ * @data_type: The data type of the @kdst. Must match with @user_array->type
+ *
+ * Copy the entire user array. kdst must have room for kdst_entry_size *
+ * user_array->entry_num bytes. Return 0 for success, otherwise -error.
+ */
+static inline int
+iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size,
+ struct iommu_user_data_array *user_array,
+ unsigned int data_type)
+{
+ unsigned int i;
+ int ret;
+
+ if (user_array->type != data_type)
+ return -EINVAL;
+ if (!user_array->entry_num)
+ return -EINVAL;
+ if (likely(user_array->entry_len == kdst_entry_size)) {
+ if (copy_from_user(kdst, user_array->uptr,
+ user_array->entry_num *
+ user_array->entry_len))
+ return -EFAULT;
+ }
+
+ /* Copy item by item */
+ for (i = 0; i != user_array->entry_num; i++) {
+ ret = copy_struct_from_user(
+ kdst + kdst_entry_size * i, kdst_entry_size,
+ user_array->uptr + user_array->entry_len * i,
+ user_array->entry_len);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
* struct iommu_ops - iommu ops and capabilities
* @capable: check capability
* @hw_info: report iommu hardware information. The data buffer returned by this
@@ -542,6 +590,14 @@ static inline int __iommu_copy_struct_from_user_array(
* @remove_dev_pasid: Remove any translation configurations of a specific
* pasid, so that any DMA transactions with this pasid
* will be blocked by the hardware.
+ * @viommu_alloc: Allocate an iommufd_viommu on a physical IOMMU instance behind
+ * the @dev, as the set of virtualization resources shared/passed
+ * to user space IOMMU instance. And associate it with a nesting
+ * @parent_domain. The @viommu_type must be defined in the header
+ * include/uapi/linux/iommufd.h
+ * It is required to call iommufd_viommu_alloc() helper for
+ * a bundled allocation of the core and the driver structures,
+ * using the given @ictx pointer.
* @pgsize_bitmap: bitmap of all possible supported page sizes
* @owner: Driver module providing these ops
* @identity_domain: An always available, always attachable identity
@@ -591,6 +647,10 @@ struct iommu_ops {
void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid,
struct iommu_domain *domain);
+ struct iommufd_viommu *(*viommu_alloc)(
+ struct device *dev, struct iommu_domain *parent_domain,
+ struct iommufd_ctx *ictx, unsigned int viommu_type);
+
const struct iommu_domain_ops *default_domain_ops;
unsigned long pgsize_bitmap;
struct module *owner;
@@ -635,7 +695,6 @@ struct iommu_ops {
* @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE,
* including no-snoop TLPs on PCIe or other platform
* specific mechanisms.
- * @enable_nesting: Enable nesting
* @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*)
* @free: Release the domain after use.
*/
@@ -663,7 +722,6 @@ struct iommu_domain_ops {
dma_addr_t iova);
bool (*enforce_cache_coherency)(struct iommu_domain *domain);
- int (*enable_nesting)(struct iommu_domain *domain);
int (*set_pgtable_quirks)(struct iommu_domain *domain,
unsigned long quirks);
@@ -844,7 +902,6 @@ extern void iommu_group_put(struct iommu_group *group);
extern int iommu_group_id(struct iommu_group *group);
extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
-int iommu_enable_nesting(struct iommu_domain *domain);
int iommu_set_pgtable_quirks(struct iommu_domain *domain,
unsigned long quirks);
@@ -994,6 +1051,8 @@ struct iommu_fwspec {
/* ATS is supported */
#define IOMMU_FWSPEC_PCI_RC_ATS (1 << 0)
+/* CANWBS is supported */
+#define IOMMU_FWSPEC_PCI_RC_CANWBS (1 << 1)
/*
* An iommu attach handle represents a relationship between an iommu domain
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 30f832a60ccb..11110c749200 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -8,16 +8,46 @@
#include <linux/err.h>
#include <linux/errno.h>
+#include <linux/refcount.h>
#include <linux/types.h>
+#include <linux/xarray.h>
struct device;
struct file;
struct iommu_group;
+struct iommu_user_data;
+struct iommu_user_data_array;
struct iommufd_access;
struct iommufd_ctx;
struct iommufd_device;
+struct iommufd_viommu_ops;
struct page;
+enum iommufd_object_type {
+ IOMMUFD_OBJ_NONE,
+ IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
+ IOMMUFD_OBJ_DEVICE,
+ IOMMUFD_OBJ_HWPT_PAGING,
+ IOMMUFD_OBJ_HWPT_NESTED,
+ IOMMUFD_OBJ_IOAS,
+ IOMMUFD_OBJ_ACCESS,
+ IOMMUFD_OBJ_FAULT,
+ IOMMUFD_OBJ_VIOMMU,
+ IOMMUFD_OBJ_VDEVICE,
+#ifdef CONFIG_IOMMUFD_TEST
+ IOMMUFD_OBJ_SELFTEST,
+#endif
+ IOMMUFD_OBJ_MAX,
+};
+
+/* Base struct for all objects with a userspace ID handle. */
+struct iommufd_object {
+ refcount_t shortterm_users;
+ refcount_t users;
+ enum iommufd_object_type type;
+ unsigned int id;
+};
+
struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
struct device *dev, u32 *id);
void iommufd_device_unbind(struct iommufd_device *idev);
@@ -54,6 +84,45 @@ void iommufd_access_detach(struct iommufd_access *access);
void iommufd_ctx_get(struct iommufd_ctx *ictx);
+struct iommufd_viommu {
+ struct iommufd_object obj;
+ struct iommufd_ctx *ictx;
+ struct iommu_device *iommu_dev;
+ struct iommufd_hwpt_paging *hwpt;
+
+ const struct iommufd_viommu_ops *ops;
+
+ struct xarray vdevs;
+
+ unsigned int type;
+};
+
+/**
+ * struct iommufd_viommu_ops - vIOMMU specific operations
+ * @destroy: Clean up all driver-specific parts of an iommufd_viommu. The memory
+ * of the vIOMMU will be free-ed by iommufd core after calling this op
+ * @alloc_domain_nested: Allocate a IOMMU_DOMAIN_NESTED on a vIOMMU that holds a
+ * nesting parent domain (IOMMU_DOMAIN_PAGING). @user_data
+ * must be defined in include/uapi/linux/iommufd.h.
+ * It must fully initialize the new iommu_domain before
+ * returning. Upon failure, ERR_PTR must be returned.
+ * @cache_invalidate: Flush hardware cache used by a vIOMMU. It can be used for
+ * any IOMMU hardware specific cache: TLB and device cache.
+ * The @array passes in the cache invalidation requests, in
+ * form of a driver data structure. A driver must update the
+ * array->entry_num to report the number of handled requests.
+ * The data structure of the array entry must be defined in
+ * include/uapi/linux/iommufd.h
+ */
+struct iommufd_viommu_ops {
+ void (*destroy)(struct iommufd_viommu *viommu);
+ struct iommu_domain *(*alloc_domain_nested)(
+ struct iommufd_viommu *viommu, u32 flags,
+ const struct iommu_user_data *user_data);
+ int (*cache_invalidate)(struct iommufd_viommu *viommu,
+ struct iommu_user_data_array *array);
+};
+
#if IS_ENABLED(CONFIG_IOMMUFD)
struct iommufd_ctx *iommufd_ctx_from_file(struct file *file);
struct iommufd_ctx *iommufd_ctx_from_fd(int fd);
@@ -111,4 +180,43 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
return -EOPNOTSUPP;
}
#endif /* CONFIG_IOMMUFD */
+
+#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER_CORE)
+struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
+ size_t size,
+ enum iommufd_object_type type);
+struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
+ unsigned long vdev_id);
+#else /* !CONFIG_IOMMUFD_DRIVER_CORE */
+static inline struct iommufd_object *
+_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size,
+ enum iommufd_object_type type)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline struct device *
+iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id)
+{
+ return NULL;
+}
+#endif /* CONFIG_IOMMUFD_DRIVER_CORE */
+
+/*
+ * Helpers for IOMMU driver to allocate driver structures that will be freed by
+ * the iommufd core. The free op will be called prior to freeing the memory.
+ */
+#define iommufd_viommu_alloc(ictx, drv_struct, member, viommu_ops) \
+ ({ \
+ drv_struct *ret; \
+ \
+ static_assert(__same_type(struct iommufd_viommu, \
+ ((drv_struct *)NULL)->member)); \
+ static_assert(offsetof(drv_struct, member.obj) == 0); \
+ ret = (drv_struct *)_iommufd_object_alloc( \
+ ictx, sizeof(drv_struct), IOMMUFD_OBJ_VIOMMU); \
+ if (!IS_ERR(ret)) \
+ ret->member.ops = viommu_ops; \
+ ret; \
+ })
#endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index feb5c8021bef..673771f34674 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2536,6 +2536,7 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
struct folio **folios, unsigned int max_folios,
pgoff_t *offset);
+int folio_add_pins(struct folio *folio, unsigned int pins);
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 72010f71c5e4..4ae8b1ee0444 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -51,6 +51,10 @@ enum {
IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c,
IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d,
IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e,
+ IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f,
+ IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
+ IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
+ IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
};
/**
@@ -214,6 +218,30 @@ struct iommu_ioas_map {
#define IOMMU_IOAS_MAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP)
/**
+ * struct iommu_ioas_map_file - ioctl(IOMMU_IOAS_MAP_FILE)
+ * @size: sizeof(struct iommu_ioas_map_file)
+ * @flags: same as for iommu_ioas_map
+ * @ioas_id: same as for iommu_ioas_map
+ * @fd: the memfd to map
+ * @start: byte offset from start of file to map from
+ * @length: same as for iommu_ioas_map
+ * @iova: same as for iommu_ioas_map
+ *
+ * Set an IOVA mapping from a memfd file. All other arguments and semantics
+ * match those of IOMMU_IOAS_MAP.
+ */
+struct iommu_ioas_map_file {
+ __u32 size;
+ __u32 flags;
+ __u32 ioas_id;
+ __s32 fd;
+ __aligned_u64 start;
+ __aligned_u64 length;
+ __aligned_u64 iova;
+};
+#define IOMMU_IOAS_MAP_FILE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP_FILE)
+
+/**
* struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY)
* @size: sizeof(struct iommu_ioas_copy)
* @flags: Combination of enum iommufd_ioas_map_flags
@@ -395,13 +423,35 @@ struct iommu_hwpt_vtd_s1 {
};
/**
+ * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 nested STE
+ * (IOMMU_HWPT_DATA_ARM_SMMUV3)
+ *
+ * @ste: The first two double words of the user space Stream Table Entry for
+ * the translation. Must be little-endian.
+ * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec)
+ * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax
+ * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD
+ *
+ * -EIO will be returned if @ste is not legal or contains any non-allowed field.
+ * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass
+ * nested domain will translate the same as the nesting parent. The S1 will
+ * install a Context Descriptor Table pointing at userspace memory translated
+ * by the nesting parent.
+ */
+struct iommu_hwpt_arm_smmuv3 {
+ __aligned_le64 ste[2];
+};
+
+/**
* enum iommu_hwpt_data_type - IOMMU HWPT Data Type
* @IOMMU_HWPT_DATA_NONE: no data
* @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table
+ * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table
*/
enum iommu_hwpt_data_type {
IOMMU_HWPT_DATA_NONE = 0,
IOMMU_HWPT_DATA_VTD_S1 = 1,
+ IOMMU_HWPT_DATA_ARM_SMMUV3 = 2,
};
/**
@@ -409,7 +459,7 @@ enum iommu_hwpt_data_type {
* @size: sizeof(struct iommu_hwpt_alloc)
* @flags: Combination of enum iommufd_hwpt_alloc_flags
* @dev_id: The device to allocate this HWPT for
- * @pt_id: The IOAS or HWPT to connect this HWPT to
+ * @pt_id: The IOAS or HWPT or vIOMMU to connect this HWPT to
* @out_hwpt_id: The ID of the new HWPT
* @__reserved: Must be 0
* @data_type: One of enum iommu_hwpt_data_type
@@ -428,11 +478,13 @@ enum iommu_hwpt_data_type {
* IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a
* nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags.
*
- * A user-managed nested HWPT will be created from a given parent HWPT via
- * @pt_id, in which the parent HWPT must be allocated previously via the
- * same ioctl from a given IOAS (@pt_id). In this case, the @data_type
- * must be set to a pre-defined type corresponding to an I/O page table
- * type supported by the underlying IOMMU hardware.
+ * A user-managed nested HWPT will be created from a given vIOMMU (wrapping a
+ * parent HWPT) or a parent HWPT via @pt_id, in which the parent HWPT must be
+ * allocated previously via the same ioctl from a given IOAS (@pt_id). In this
+ * case, the @data_type must be set to a pre-defined type corresponding to an
+ * I/O page table type supported by the underlying IOMMU hardware. The device
+ * via @dev_id and the vIOMMU via @pt_id must be associated to the same IOMMU
+ * instance.
*
* If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and
* @data_uptr should be zero. Otherwise, both @data_len and @data_uptr
@@ -485,14 +537,49 @@ struct iommu_hw_info_vtd {
};
/**
+ * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information
+ * (IOMMU_HW_INFO_TYPE_ARM_SMMUV3)
+ *
+ * @flags: Must be set to 0
+ * @__reserved: Must be 0
+ * @idr: Implemented features for ARM SMMU Non-secure programming interface
+ * @iidr: Information about the implementation and implementer of ARM SMMU,
+ * and architecture version supported
+ * @aidr: ARM SMMU architecture version
+ *
+ * For the details of @idr, @iidr and @aidr, please refer to the chapters
+ * from 6.3.1 to 6.3.6 in the SMMUv3 Spec.
+ *
+ * User space should read the underlying ARM SMMUv3 hardware information for
+ * the list of supported features.
+ *
+ * Note that these values reflect the raw HW capability, without any insight if
+ * any required kernel driver support is present. Bits may be set indicating the
+ * HW has functionality that is lacking kernel software support, such as BTM. If
+ * a VMM is using this information to construct emulated copies of these
+ * registers it should only forward bits that it knows it can support.
+ *
+ * In future, presence of required kernel support will be indicated in flags.
+ */
+struct iommu_hw_info_arm_smmuv3 {
+ __u32 flags;
+ __u32 __reserved;
+ __u32 idr[6];
+ __u32 iidr;
+ __u32 aidr;
+};
+
+/**
* enum iommu_hw_info_type - IOMMU Hardware Info Types
* @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware
* info
* @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type
+ * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type
*/
enum iommu_hw_info_type {
IOMMU_HW_INFO_TYPE_NONE = 0,
IOMMU_HW_INFO_TYPE_INTEL_VTD = 1,
+ IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2,
};
/**
@@ -627,9 +714,11 @@ struct iommu_hwpt_get_dirty_bitmap {
* enum iommu_hwpt_invalidate_data_type - IOMMU HWPT Cache Invalidation
* Data Type
* @IOMMU_HWPT_INVALIDATE_DATA_VTD_S1: Invalidation data for VTD_S1
+ * @IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3: Invalidation data for ARM SMMUv3
*/
enum iommu_hwpt_invalidate_data_type {
IOMMU_HWPT_INVALIDATE_DATA_VTD_S1 = 0,
+ IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3 = 1,
};
/**
@@ -669,9 +758,31 @@ struct iommu_hwpt_vtd_s1_invalidate {
};
/**
+ * struct iommu_viommu_arm_smmuv3_invalidate - ARM SMMUv3 cahce invalidation
+ * (IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3)
+ * @cmd: 128-bit cache invalidation command that runs in SMMU CMDQ.
+ * Must be little-endian.
+ *
+ * Supported command list only when passing in a vIOMMU via @hwpt_id:
+ * CMDQ_OP_TLBI_NSNH_ALL
+ * CMDQ_OP_TLBI_NH_VA
+ * CMDQ_OP_TLBI_NH_VAA
+ * CMDQ_OP_TLBI_NH_ALL
+ * CMDQ_OP_TLBI_NH_ASID
+ * CMDQ_OP_ATC_INV
+ * CMDQ_OP_CFGI_CD
+ * CMDQ_OP_CFGI_CD_ALL
+ *
+ * -EIO will be returned if the command is not supported.
+ */
+struct iommu_viommu_arm_smmuv3_invalidate {
+ __aligned_le64 cmd[2];
+};
+
+/**
* struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
* @size: sizeof(struct iommu_hwpt_invalidate)
- * @hwpt_id: ID of a nested HWPT for cache invalidation
+ * @hwpt_id: ID of a nested HWPT or a vIOMMU, for cache invalidation
* @data_uptr: User pointer to an array of driver-specific cache invalidation
* data.
* @data_type: One of enum iommu_hwpt_invalidate_data_type, defining the data
@@ -682,8 +793,11 @@ struct iommu_hwpt_vtd_s1_invalidate {
* Output the number of requests successfully handled by kernel.
* @__reserved: Must be 0.
*
- * Invalidate the iommu cache for user-managed page table. Modifications on a
- * user-managed page table should be followed by this operation to sync cache.
+ * Invalidate iommu cache for user-managed page table or vIOMMU. Modifications
+ * on a user-managed page table should be followed by this operation, if a HWPT
+ * is passed in via @hwpt_id. Other caches, such as device cache or descriptor
+ * cache can be flushed if a vIOMMU is passed in via the @hwpt_id field.
+ *
* Each ioctl can support one or more cache invalidation requests in the array
* that has a total size of @entry_len * @entry_num.
*
@@ -797,4 +911,88 @@ struct iommu_fault_alloc {
__u32 out_fault_fd;
};
#define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC)
+
+/**
+ * enum iommu_viommu_type - Virtual IOMMU Type
+ * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use
+ * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type
+ */
+enum iommu_viommu_type {
+ IOMMU_VIOMMU_TYPE_DEFAULT = 0,
+ IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1,
+};
+
+/**
+ * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC)
+ * @size: sizeof(struct iommu_viommu_alloc)
+ * @flags: Must be 0
+ * @type: Type of the virtual IOMMU. Must be defined in enum iommu_viommu_type
+ * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU
+ * @hwpt_id: ID of a nesting parent HWPT to associate to
+ * @out_viommu_id: Output virtual IOMMU ID for the allocated object
+ *
+ * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's
+ * virtualization support that is a security-isolated slice of the real IOMMU HW
+ * that is unique to a specific VM. Operations global to the IOMMU are connected
+ * to the vIOMMU, such as:
+ * - Security namespace for guest owned ID, e.g. guest-controlled cache tags
+ * - Non-device-affiliated event reporting, e.g. invalidation queue errors
+ * - Access to a sharable nesting parent pagetable across physical IOMMUs
+ * - Virtualization of various platforms IDs, e.g. RIDs and others
+ * - Delivery of paravirtualized invalidation
+ * - Direct assigned invalidation queues
+ * - Direct assigned interrupts
+ */
+struct iommu_viommu_alloc {
+ __u32 size;
+ __u32 flags;
+ __u32 type;
+ __u32 dev_id;
+ __u32 hwpt_id;
+ __u32 out_viommu_id;
+};
+#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC)
+
+/**
+ * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC)
+ * @size: sizeof(struct iommu_vdevice_alloc)
+ * @viommu_id: vIOMMU ID to associate with the virtual device
+ * @dev_id: The physical device to allocate a virtual instance on the vIOMMU
+ * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTORY
+ * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID
+ * of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table
+ *
+ * Allocate a virtual device instance (for a physical device) against a vIOMMU.
+ * This instance holds the device's information (related to its vIOMMU) in a VM.
+ */
+struct iommu_vdevice_alloc {
+ __u32 size;
+ __u32 viommu_id;
+ __u32 dev_id;
+ __u32 out_vdevice_id;
+ __aligned_u64 virt_id;
+};
+#define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC)
+
+/**
+ * struct iommu_ioas_change_process - ioctl(VFIO_IOAS_CHANGE_PROCESS)
+ * @size: sizeof(struct iommu_ioas_change_process)
+ * @__reserved: Must be 0
+ *
+ * This transfers pinned memory counts for every memory map in every IOAS
+ * in the context to the current process. This only supports maps created
+ * with IOMMU_IOAS_MAP_FILE, and returns EINVAL if other maps are present.
+ * If the ioctl returns a failure status, then nothing is changed.
+ *
+ * This API is useful for transferring operation of a device from one process
+ * to another, such as during userland live update.
+ */
+struct iommu_ioas_change_process {
+ __u32 size;
+ __u32 __reserved;
+};
+
+#define IOMMU_IOAS_CHANGE_PROCESS \
+ _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)
+
#endif
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 2b68e6cdf190..c8dbf8219c4f 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -35,7 +35,7 @@
#define VFIO_EEH 5
/* Two-stage IOMMU */
-#define VFIO_TYPE1_NESTING_IOMMU 6 /* Implies v2 */
+#define __VFIO_RESERVED_TYPE1_NESTING_IOMMU 6 /* Implies v2 */
#define VFIO_SPAPR_TCE_v2_IOMMU 7
diff --git a/mm/gup.c b/mm/gup.c
index ad0c8922dac3..2b84eead9baa 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -3760,3 +3760,27 @@ err:
return ret;
}
EXPORT_SYMBOL_GPL(memfd_pin_folios);
+
+/**
+ * folio_add_pins() - add pins to an already-pinned folio
+ * @folio: the folio to add more pins to
+ * @pins: number of pins to add
+ *
+ * Try to add more pins to an already-pinned folio. The semantics
+ * of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot
+ * be changed.
+ *
+ * This function is helpful when having obtained a pin on a large folio
+ * using memfd_pin_folios(), but wanting to logically unpin parts
+ * (e.g., individual pages) of the folio later, for example, using
+ * unpin_user_page_range_dirty_lock().
+ *
+ * This is not the right interface to initially pin a folio.
+ */
+int folio_add_pins(struct folio *folio, unsigned int pins)
+{
+ VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio));
+
+ return try_grab_folio(folio, pins, FOLL_PIN);
+}
+EXPORT_SYMBOL_GPL(folio_add_pins);
diff --git a/tools/testing/selftests/iommu/Makefile b/tools/testing/selftests/iommu/Makefile
index fd6477911f24..84abeb2f0949 100644
--- a/tools/testing/selftests/iommu/Makefile
+++ b/tools/testing/selftests/iommu/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -Wall -O2 -Wno-unused-function
CFLAGS += $(KHDR_INCLUDES)
+LDLIBS += -lcap
TEST_GEN_PROGS :=
TEST_GEN_PROGS += iommufd
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 4927b9add5ad..a1b2b657999d 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */
+#include <asm/unistd.h>
#include <stdlib.h>
+#include <sys/capability.h>
#include <sys/mman.h>
#include <sys/eventfd.h>
@@ -49,6 +51,9 @@ static __attribute__((constructor)) void setup_sizes(void)
vrc = mmap(buffer, BUFFER_SIZE, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
assert(vrc == buffer);
+
+ mfd_buffer = memfd_mmap(BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
+ &mfd);
}
FIXTURE(iommufd)
@@ -128,6 +133,11 @@ TEST_F(iommufd, cmd_length)
TEST_LENGTH(iommu_ioas_unmap, IOMMU_IOAS_UNMAP, length);
TEST_LENGTH(iommu_option, IOMMU_OPTION, val64);
TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS, __reserved);
+ TEST_LENGTH(iommu_ioas_map_file, IOMMU_IOAS_MAP_FILE, iova);
+ TEST_LENGTH(iommu_viommu_alloc, IOMMU_VIOMMU_ALLOC, out_viommu_id);
+ TEST_LENGTH(iommu_vdevice_alloc, IOMMU_VDEVICE_ALLOC, virt_id);
+ TEST_LENGTH(iommu_ioas_change_process, IOMMU_IOAS_CHANGE_PROCESS,
+ __reserved);
#undef TEST_LENGTH
}
@@ -186,6 +196,144 @@ TEST_F(iommufd, global_options)
EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_OPTION, &cmd));
}
+static void drop_cap_ipc_lock(struct __test_metadata *_metadata)
+{
+ cap_t caps;
+ cap_value_t cap_list[1] = { CAP_IPC_LOCK };
+
+ caps = cap_get_proc();
+ ASSERT_NE(caps, NULL);
+ ASSERT_NE(-1,
+ cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR));
+ ASSERT_NE(-1, cap_set_proc(caps));
+ cap_free(caps);
+}
+
+static long get_proc_status_value(pid_t pid, const char *var)
+{
+ FILE *fp;
+ char buf[80], tag[80];
+ long val = -1;
+
+ snprintf(buf, sizeof(buf), "/proc/%d/status", pid);
+ fp = fopen(buf, "r");
+ if (!fp)
+ return val;
+
+ while (fgets(buf, sizeof(buf), fp))
+ if (fscanf(fp, "%s %ld\n", tag, &val) == 2 && !strcmp(tag, var))
+ break;
+
+ fclose(fp);
+ return val;
+}
+
+static long get_vm_pinned(pid_t pid)
+{
+ return get_proc_status_value(pid, "VmPin:");
+}
+
+static long get_vm_locked(pid_t pid)
+{
+ return get_proc_status_value(pid, "VmLck:");
+}
+
+FIXTURE(change_process)
+{
+ int fd;
+ uint32_t ioas_id;
+};
+
+FIXTURE_VARIANT(change_process)
+{
+ int accounting;
+};
+
+FIXTURE_SETUP(change_process)
+{
+ self->fd = open("/dev/iommu", O_RDWR);
+ ASSERT_NE(-1, self->fd);
+
+ drop_cap_ipc_lock(_metadata);
+ if (variant->accounting != IOPT_PAGES_ACCOUNT_NONE) {
+ struct iommu_option set_limit_cmd = {
+ .size = sizeof(set_limit_cmd),
+ .option_id = IOMMU_OPTION_RLIMIT_MODE,
+ .op = IOMMU_OPTION_OP_SET,
+ .val64 = (variant->accounting == IOPT_PAGES_ACCOUNT_MM),
+ };
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &set_limit_cmd));
+ }
+
+ test_ioctl_ioas_alloc(&self->ioas_id);
+ test_cmd_mock_domain(self->ioas_id, NULL, NULL, NULL);
+}
+
+FIXTURE_TEARDOWN(change_process)
+{
+ teardown_iommufd(self->fd, _metadata);
+}
+
+FIXTURE_VARIANT_ADD(change_process, account_none)
+{
+ .accounting = IOPT_PAGES_ACCOUNT_NONE,
+};
+
+FIXTURE_VARIANT_ADD(change_process, account_user)
+{
+ .accounting = IOPT_PAGES_ACCOUNT_USER,
+};
+
+FIXTURE_VARIANT_ADD(change_process, account_mm)
+{
+ .accounting = IOPT_PAGES_ACCOUNT_MM,
+};
+
+TEST_F(change_process, basic)
+{
+ pid_t parent = getpid();
+ pid_t child;
+ __u64 iova;
+ struct iommu_ioas_change_process cmd = {
+ .size = sizeof(cmd),
+ };
+
+ /* Expect failure if non-file maps exist */
+ test_ioctl_ioas_map(buffer, PAGE_SIZE, &iova);
+ EXPECT_ERRNO(EINVAL, ioctl(self->fd, IOMMU_IOAS_CHANGE_PROCESS, &cmd));
+ test_ioctl_ioas_unmap(iova, PAGE_SIZE);
+
+ /* Change process works in current process. */
+ test_ioctl_ioas_map_file(mfd, 0, PAGE_SIZE, &iova);
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_CHANGE_PROCESS, &cmd));
+
+ /* Change process works in another process */
+ child = fork();
+ if (!child) {
+ int nlock = PAGE_SIZE / 1024;
+
+ /* Parent accounts for locked memory before */
+ ASSERT_EQ(nlock, get_vm_pinned(parent));
+ if (variant->accounting == IOPT_PAGES_ACCOUNT_MM)
+ ASSERT_EQ(nlock, get_vm_locked(parent));
+ ASSERT_EQ(0, get_vm_pinned(getpid()));
+ ASSERT_EQ(0, get_vm_locked(getpid()));
+
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_CHANGE_PROCESS, &cmd));
+
+ /* Child accounts for locked memory after */
+ ASSERT_EQ(0, get_vm_pinned(parent));
+ ASSERT_EQ(0, get_vm_locked(parent));
+ ASSERT_EQ(nlock, get_vm_pinned(getpid()));
+ if (variant->accounting == IOPT_PAGES_ACCOUNT_MM)
+ ASSERT_EQ(nlock, get_vm_locked(getpid()));
+
+ exit(0);
+ }
+ ASSERT_NE(-1, child);
+ ASSERT_EQ(child, waitpid(child, NULL, 0));
+}
+
FIXTURE(iommufd_ioas)
{
int fd;
@@ -220,6 +368,8 @@ FIXTURE_SETUP(iommufd_ioas)
for (i = 0; i != variant->mock_domains; i++) {
test_cmd_mock_domain(self->ioas_id, &self->stdev_id,
&self->hwpt_id, &self->device_id);
+ test_cmd_dev_check_cache_all(self->device_id,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
self->base_iova = MOCK_APERTURE_START;
}
}
@@ -360,9 +510,9 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
EXPECT_ERRNO(EBUSY,
_test_ioctl_destroy(self->fd, parent_hwpt_id));
- /* hwpt_invalidate only supports a user-managed hwpt (nested) */
+ /* hwpt_invalidate does not support a parent hwpt */
num_inv = 1;
- test_err_hwpt_invalidate(ENOENT, parent_hwpt_id, inv_reqs,
+ test_err_hwpt_invalidate(EINVAL, parent_hwpt_id, inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
assert(!num_inv);
@@ -1372,6 +1522,7 @@ FIXTURE_VARIANT(iommufd_mock_domain)
{
unsigned int mock_domains;
bool hugepages;
+ bool file;
};
FIXTURE_SETUP(iommufd_mock_domain)
@@ -1384,9 +1535,12 @@ FIXTURE_SETUP(iommufd_mock_domain)
ASSERT_GE(ARRAY_SIZE(self->hwpt_ids), variant->mock_domains);
- for (i = 0; i != variant->mock_domains; i++)
+ for (i = 0; i != variant->mock_domains; i++) {
test_cmd_mock_domain(self->ioas_id, &self->stdev_ids[i],
&self->hwpt_ids[i], &self->idev_ids[i]);
+ test_cmd_dev_check_cache_all(self->idev_ids[0],
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+ }
self->hwpt_id = self->hwpt_ids[0];
self->mmap_flags = MAP_SHARED | MAP_ANONYMOUS;
@@ -1410,26 +1564,45 @@ FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain)
{
.mock_domains = 1,
.hugepages = false,
+ .file = false,
};
FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains)
{
.mock_domains = 2,
.hugepages = false,
+ .file = false,
};
FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain_hugepage)
{
.mock_domains = 1,
.hugepages = true,
+ .file = false,
};
FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains_hugepage)
{
.mock_domains = 2,
.hugepages = true,
+ .file = false,
};
+FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain_file)
+{
+ .mock_domains = 1,
+ .hugepages = false,
+ .file = true,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain_file_hugepage)
+{
+ .mock_domains = 1,
+ .hugepages = true,
+ .file = true,
+};
+
+
/* Have the kernel check that the user pages made it to the iommu_domain */
#define check_mock_iova(_ptr, _iova, _length) \
({ \
@@ -1455,7 +1628,10 @@ FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains_hugepage)
} \
})
-TEST_F(iommufd_mock_domain, basic)
+static void
+test_basic_mmap(struct __test_metadata *_metadata,
+ struct _test_data_iommufd_mock_domain *self,
+ const struct _fixture_variant_iommufd_mock_domain *variant)
{
size_t buf_size = self->mmap_buf_size;
uint8_t *buf;
@@ -1478,6 +1654,40 @@ TEST_F(iommufd_mock_domain, basic)
test_err_ioctl_ioas_map(EFAULT, buf, buf_size, &iova);
}
+static void
+test_basic_file(struct __test_metadata *_metadata,
+ struct _test_data_iommufd_mock_domain *self,
+ const struct _fixture_variant_iommufd_mock_domain *variant)
+{
+ size_t buf_size = self->mmap_buf_size;
+ uint8_t *buf;
+ __u64 iova;
+ int mfd_tmp;
+ int prot = PROT_READ | PROT_WRITE;
+
+ /* Simple one page map */
+ test_ioctl_ioas_map_file(mfd, 0, PAGE_SIZE, &iova);
+ check_mock_iova(mfd_buffer, iova, PAGE_SIZE);
+
+ buf = memfd_mmap(buf_size, prot, MAP_SHARED, &mfd_tmp);
+ ASSERT_NE(MAP_FAILED, buf);
+
+ test_err_ioctl_ioas_map_file(EINVAL, mfd_tmp, 0, buf_size + 1, &iova);
+
+ ASSERT_EQ(0, ftruncate(mfd_tmp, 0));
+ test_err_ioctl_ioas_map_file(EINVAL, mfd_tmp, 0, buf_size, &iova);
+
+ close(mfd_tmp);
+}
+
+TEST_F(iommufd_mock_domain, basic)
+{
+ if (variant->file)
+ test_basic_file(_metadata, self, variant);
+ else
+ test_basic_mmap(_metadata, self, variant);
+}
+
TEST_F(iommufd_mock_domain, ro_unshare)
{
uint8_t *buf;
@@ -1513,9 +1723,13 @@ TEST_F(iommufd_mock_domain, all_aligns)
unsigned int start;
unsigned int end;
uint8_t *buf;
+ int prot = PROT_READ | PROT_WRITE;
+ int mfd;
- buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, self->mmap_flags, -1,
- 0);
+ if (variant->file)
+ buf = memfd_mmap(buf_size, prot, MAP_SHARED, &mfd);
+ else
+ buf = mmap(0, buf_size, prot, self->mmap_flags, -1, 0);
ASSERT_NE(MAP_FAILED, buf);
check_refs(buf, buf_size, 0);
@@ -1532,7 +1746,12 @@ TEST_F(iommufd_mock_domain, all_aligns)
size_t length = end - start;
__u64 iova;
- test_ioctl_ioas_map(buf + start, length, &iova);
+ if (variant->file) {
+ test_ioctl_ioas_map_file(mfd, start, length,
+ &iova);
+ } else {
+ test_ioctl_ioas_map(buf + start, length, &iova);
+ }
check_mock_iova(buf + start, iova, length);
check_refs(buf + start / PAGE_SIZE * PAGE_SIZE,
end / PAGE_SIZE * PAGE_SIZE -
@@ -1544,6 +1763,8 @@ TEST_F(iommufd_mock_domain, all_aligns)
}
check_refs(buf, buf_size, 0);
ASSERT_EQ(0, munmap(buf, buf_size));
+ if (variant->file)
+ close(mfd);
}
TEST_F(iommufd_mock_domain, all_aligns_copy)
@@ -1554,9 +1775,13 @@ TEST_F(iommufd_mock_domain, all_aligns_copy)
unsigned int start;
unsigned int end;
uint8_t *buf;
+ int prot = PROT_READ | PROT_WRITE;
+ int mfd;
- buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, self->mmap_flags, -1,
- 0);
+ if (variant->file)
+ buf = memfd_mmap(buf_size, prot, MAP_SHARED, &mfd);
+ else
+ buf = mmap(0, buf_size, prot, self->mmap_flags, -1, 0);
ASSERT_NE(MAP_FAILED, buf);
check_refs(buf, buf_size, 0);
@@ -1575,7 +1800,12 @@ TEST_F(iommufd_mock_domain, all_aligns_copy)
uint32_t mock_stdev_id;
__u64 iova;
- test_ioctl_ioas_map(buf + start, length, &iova);
+ if (variant->file) {
+ test_ioctl_ioas_map_file(mfd, start, length,
+ &iova);
+ } else {
+ test_ioctl_ioas_map(buf + start, length, &iova);
+ }
/* Add and destroy a domain while the area exists */
old_id = self->hwpt_ids[1];
@@ -1596,15 +1826,18 @@ TEST_F(iommufd_mock_domain, all_aligns_copy)
}
check_refs(buf, buf_size, 0);
ASSERT_EQ(0, munmap(buf, buf_size));
+ if (variant->file)
+ close(mfd);
}
TEST_F(iommufd_mock_domain, user_copy)
{
+ void *buf = variant->file ? mfd_buffer : buffer;
struct iommu_test_cmd access_cmd = {
.size = sizeof(access_cmd),
.op = IOMMU_TEST_OP_ACCESS_PAGES,
.access_pages = { .length = BUFFER_SIZE,
- .uptr = (uintptr_t)buffer },
+ .uptr = (uintptr_t)buf },
};
struct iommu_ioas_copy copy_cmd = {
.size = sizeof(copy_cmd),
@@ -1623,9 +1856,13 @@ TEST_F(iommufd_mock_domain, user_copy)
/* Pin the pages in an IOAS with no domains then copy to an IOAS with domains */
test_ioctl_ioas_alloc(&ioas_id);
- test_ioctl_ioas_map_id(ioas_id, buffer, BUFFER_SIZE,
- &copy_cmd.src_iova);
-
+ if (variant->file) {
+ test_ioctl_ioas_map_id_file(ioas_id, mfd, 0, BUFFER_SIZE,
+ &copy_cmd.src_iova);
+ } else {
+ test_ioctl_ioas_map_id(ioas_id, buf, BUFFER_SIZE,
+ &copy_cmd.src_iova);
+ }
test_cmd_create_access(ioas_id, &access_cmd.id,
MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES);
@@ -1635,12 +1872,17 @@ TEST_F(iommufd_mock_domain, user_copy)
&access_cmd));
copy_cmd.src_ioas_id = ioas_id;
ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, &copy_cmd));
- check_mock_iova(buffer, MOCK_APERTURE_START, BUFFER_SIZE);
+ check_mock_iova(buf, MOCK_APERTURE_START, BUFFER_SIZE);
/* Now replace the ioas with a new one */
test_ioctl_ioas_alloc(&new_ioas_id);
- test_ioctl_ioas_map_id(new_ioas_id, buffer, BUFFER_SIZE,
- &copy_cmd.src_iova);
+ if (variant->file) {
+ test_ioctl_ioas_map_id_file(new_ioas_id, mfd, 0, BUFFER_SIZE,
+ &copy_cmd.src_iova);
+ } else {
+ test_ioctl_ioas_map_id(new_ioas_id, buf, BUFFER_SIZE,
+ &copy_cmd.src_iova);
+ }
test_cmd_access_replace_ioas(access_cmd.id, new_ioas_id);
/* Destroy the old ioas and cleanup copied mapping */
@@ -1654,7 +1896,7 @@ TEST_F(iommufd_mock_domain, user_copy)
&access_cmd));
copy_cmd.src_ioas_id = new_ioas_id;
ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, &copy_cmd));
- check_mock_iova(buffer, MOCK_APERTURE_START, BUFFER_SIZE);
+ check_mock_iova(buf, MOCK_APERTURE_START, BUFFER_SIZE);
test_cmd_destroy_access_pages(
access_cmd.id, access_cmd.access_pages.out_access_pages_id);
@@ -2386,4 +2628,332 @@ TEST_F(vfio_compat_mock_domain, huge_map)
}
}
+FIXTURE(iommufd_viommu)
+{
+ int fd;
+ uint32_t ioas_id;
+ uint32_t stdev_id;
+ uint32_t hwpt_id;
+ uint32_t nested_hwpt_id;
+ uint32_t device_id;
+ uint32_t viommu_id;
+};
+
+FIXTURE_VARIANT(iommufd_viommu)
+{
+ unsigned int viommu;
+};
+
+FIXTURE_SETUP(iommufd_viommu)
+{
+ self->fd = open("/dev/iommu", O_RDWR);
+ ASSERT_NE(-1, self->fd);
+ test_ioctl_ioas_alloc(&self->ioas_id);
+ test_ioctl_set_default_memory_limit();
+
+ if (variant->viommu) {
+ struct iommu_hwpt_selftest data = {
+ .iotlb = IOMMU_TEST_IOTLB_DEFAULT,
+ };
+
+ test_cmd_mock_domain(self->ioas_id, &self->stdev_id, NULL,
+ &self->device_id);
+
+ /* Allocate a nesting parent hwpt */
+ test_cmd_hwpt_alloc(self->device_id, self->ioas_id,
+ IOMMU_HWPT_ALLOC_NEST_PARENT,
+ &self->hwpt_id);
+
+ /* Allocate a vIOMMU taking refcount of the parent hwpt */
+ test_cmd_viommu_alloc(self->device_id, self->hwpt_id,
+ IOMMU_VIOMMU_TYPE_SELFTEST,
+ &self->viommu_id);
+
+ /* Allocate a regular nested hwpt */
+ test_cmd_hwpt_alloc_nested(self->device_id, self->viommu_id, 0,
+ &self->nested_hwpt_id,
+ IOMMU_HWPT_DATA_SELFTEST, &data,
+ sizeof(data));
+ }
+}
+
+FIXTURE_TEARDOWN(iommufd_viommu)
+{
+ teardown_iommufd(self->fd, _metadata);
+}
+
+FIXTURE_VARIANT_ADD(iommufd_viommu, no_viommu)
+{
+ .viommu = 0,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_viommu, mock_viommu)
+{
+ .viommu = 1,
+};
+
+TEST_F(iommufd_viommu, viommu_auto_destroy)
+{
+}
+
+TEST_F(iommufd_viommu, viommu_negative_tests)
+{
+ uint32_t device_id = self->device_id;
+ uint32_t ioas_id = self->ioas_id;
+ uint32_t hwpt_id;
+
+ if (self->device_id) {
+ /* Negative test -- invalid hwpt (hwpt_id=0) */
+ test_err_viommu_alloc(ENOENT, device_id, 0,
+ IOMMU_VIOMMU_TYPE_SELFTEST, NULL);
+
+ /* Negative test -- not a nesting parent hwpt */
+ test_cmd_hwpt_alloc(device_id, ioas_id, 0, &hwpt_id);
+ test_err_viommu_alloc(EINVAL, device_id, hwpt_id,
+ IOMMU_VIOMMU_TYPE_SELFTEST, NULL);
+ test_ioctl_destroy(hwpt_id);
+
+ /* Negative test -- unsupported viommu type */
+ test_err_viommu_alloc(EOPNOTSUPP, device_id, self->hwpt_id,
+ 0xdead, NULL);
+ EXPECT_ERRNO(EBUSY,
+ _test_ioctl_destroy(self->fd, self->hwpt_id));
+ EXPECT_ERRNO(EBUSY,
+ _test_ioctl_destroy(self->fd, self->viommu_id));
+ } else {
+ test_err_viommu_alloc(ENOENT, self->device_id, self->hwpt_id,
+ IOMMU_VIOMMU_TYPE_SELFTEST, NULL);
+ }
+}
+
+TEST_F(iommufd_viommu, viommu_alloc_nested_iopf)
+{
+ struct iommu_hwpt_selftest data = {
+ .iotlb = IOMMU_TEST_IOTLB_DEFAULT,
+ };
+ uint32_t viommu_id = self->viommu_id;
+ uint32_t dev_id = self->device_id;
+ uint32_t iopf_hwpt_id;
+ uint32_t fault_id;
+ uint32_t fault_fd;
+
+ if (self->device_id) {
+ test_ioctl_fault_alloc(&fault_id, &fault_fd);
+ test_err_hwpt_alloc_iopf(
+ ENOENT, dev_id, viommu_id, UINT32_MAX,
+ IOMMU_HWPT_FAULT_ID_VALID, &iopf_hwpt_id,
+ IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data));
+ test_err_hwpt_alloc_iopf(
+ EOPNOTSUPP, dev_id, viommu_id, fault_id,
+ IOMMU_HWPT_FAULT_ID_VALID | (1 << 31), &iopf_hwpt_id,
+ IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data));
+ test_cmd_hwpt_alloc_iopf(
+ dev_id, viommu_id, fault_id, IOMMU_HWPT_FAULT_ID_VALID,
+ &iopf_hwpt_id, IOMMU_HWPT_DATA_SELFTEST, &data,
+ sizeof(data));
+
+ test_cmd_mock_domain_replace(self->stdev_id, iopf_hwpt_id);
+ EXPECT_ERRNO(EBUSY,
+ _test_ioctl_destroy(self->fd, iopf_hwpt_id));
+ test_cmd_trigger_iopf(dev_id, fault_fd);
+
+ test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id);
+ test_ioctl_destroy(iopf_hwpt_id);
+ close(fault_fd);
+ test_ioctl_destroy(fault_id);
+ }
+}
+
+TEST_F(iommufd_viommu, vdevice_alloc)
+{
+ uint32_t viommu_id = self->viommu_id;
+ uint32_t dev_id = self->device_id;
+ uint32_t vdev_id = 0;
+
+ if (dev_id) {
+ /* Set vdev_id to 0x99, unset it, and set to 0x88 */
+ test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id);
+ test_err_vdevice_alloc(EEXIST, viommu_id, dev_id, 0x99,
+ &vdev_id);
+ test_ioctl_destroy(vdev_id);
+ test_cmd_vdevice_alloc(viommu_id, dev_id, 0x88, &vdev_id);
+ test_ioctl_destroy(vdev_id);
+ } else {
+ test_err_vdevice_alloc(ENOENT, viommu_id, dev_id, 0x99, NULL);
+ }
+}
+
+TEST_F(iommufd_viommu, vdevice_cache)
+{
+ struct iommu_viommu_invalidate_selftest inv_reqs[2] = {};
+ uint32_t viommu_id = self->viommu_id;
+ uint32_t dev_id = self->device_id;
+ uint32_t vdev_id = 0;
+ uint32_t num_inv;
+
+ if (dev_id) {
+ test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id);
+
+ test_cmd_dev_check_cache_all(dev_id,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+
+ /* Check data_type by passing zero-length array */
+ num_inv = 0;
+ test_cmd_viommu_invalidate(viommu_id, inv_reqs,
+ sizeof(*inv_reqs), &num_inv);
+ assert(!num_inv);
+
+ /* Negative test: Invalid data_type */
+ num_inv = 1;
+ test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID,
+ sizeof(*inv_reqs), &num_inv);
+ assert(!num_inv);
+
+ /* Negative test: structure size sanity */
+ num_inv = 1;
+ test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ sizeof(*inv_reqs) + 1, &num_inv);
+ assert(!num_inv);
+
+ num_inv = 1;
+ test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ 1, &num_inv);
+ assert(!num_inv);
+
+ /* Negative test: invalid flag is passed */
+ num_inv = 1;
+ inv_reqs[0].flags = 0xffffffff;
+ inv_reqs[0].vdev_id = 0x99;
+ test_err_viommu_invalidate(EOPNOTSUPP, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ sizeof(*inv_reqs), &num_inv);
+ assert(!num_inv);
+
+ /* Negative test: invalid data_uptr when array is not empty */
+ num_inv = 1;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x99;
+ test_err_viommu_invalidate(EINVAL, viommu_id, NULL,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ sizeof(*inv_reqs), &num_inv);
+ assert(!num_inv);
+
+ /* Negative test: invalid entry_len when array is not empty */
+ num_inv = 1;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x99;
+ test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ 0, &num_inv);
+ assert(!num_inv);
+
+ /* Negative test: invalid cache_id */
+ num_inv = 1;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x99;
+ inv_reqs[0].cache_id = MOCK_DEV_CACHE_ID_MAX + 1;
+ test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ sizeof(*inv_reqs), &num_inv);
+ assert(!num_inv);
+
+ /* Negative test: invalid vdev_id */
+ num_inv = 1;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x9;
+ inv_reqs[0].cache_id = 0;
+ test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ sizeof(*inv_reqs), &num_inv);
+ assert(!num_inv);
+
+ /*
+ * Invalidate the 1st cache entry but fail the 2nd request
+ * due to invalid flags configuration in the 2nd request.
+ */
+ num_inv = 2;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x99;
+ inv_reqs[0].cache_id = 0;
+ inv_reqs[1].flags = 0xffffffff;
+ inv_reqs[1].vdev_id = 0x99;
+ inv_reqs[1].cache_id = 1;
+ test_err_viommu_invalidate(EOPNOTSUPP, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ sizeof(*inv_reqs), &num_inv);
+ assert(num_inv == 1);
+ test_cmd_dev_check_cache(dev_id, 0, 0);
+ test_cmd_dev_check_cache(dev_id, 1,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+ test_cmd_dev_check_cache(dev_id, 2,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+ test_cmd_dev_check_cache(dev_id, 3,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+
+ /*
+ * Invalidate the 1st cache entry but fail the 2nd request
+ * due to invalid cache_id configuration in the 2nd request.
+ */
+ num_inv = 2;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x99;
+ inv_reqs[0].cache_id = 0;
+ inv_reqs[1].flags = 0;
+ inv_reqs[1].vdev_id = 0x99;
+ inv_reqs[1].cache_id = MOCK_DEV_CACHE_ID_MAX + 1;
+ test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ sizeof(*inv_reqs), &num_inv);
+ assert(num_inv == 1);
+ test_cmd_dev_check_cache(dev_id, 0, 0);
+ test_cmd_dev_check_cache(dev_id, 1,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+ test_cmd_dev_check_cache(dev_id, 2,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+ test_cmd_dev_check_cache(dev_id, 3,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+
+ /* Invalidate the 2nd cache entry and verify */
+ num_inv = 1;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x99;
+ inv_reqs[0].cache_id = 1;
+ test_cmd_viommu_invalidate(viommu_id, inv_reqs,
+ sizeof(*inv_reqs), &num_inv);
+ assert(num_inv == 1);
+ test_cmd_dev_check_cache(dev_id, 0, 0);
+ test_cmd_dev_check_cache(dev_id, 1, 0);
+ test_cmd_dev_check_cache(dev_id, 2,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+ test_cmd_dev_check_cache(dev_id, 3,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+
+ /* Invalidate the 3rd and 4th cache entries and verify */
+ num_inv = 2;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x99;
+ inv_reqs[0].cache_id = 2;
+ inv_reqs[1].flags = 0;
+ inv_reqs[1].vdev_id = 0x99;
+ inv_reqs[1].cache_id = 3;
+ test_cmd_viommu_invalidate(viommu_id, inv_reqs,
+ sizeof(*inv_reqs), &num_inv);
+ assert(num_inv == 2);
+ test_cmd_dev_check_cache_all(dev_id, 0);
+
+ /* Invalidate all cache entries for nested_dev_id[1] and verify */
+ num_inv = 1;
+ inv_reqs[0].vdev_id = 0x99;
+ inv_reqs[0].flags = IOMMU_TEST_INVALIDATE_FLAG_ALL;
+ test_cmd_viommu_invalidate(viommu_id, inv_reqs,
+ sizeof(*inv_reqs), &num_inv);
+ assert(num_inv == 1);
+ test_cmd_dev_check_cache_all(dev_id, 0);
+ test_ioctl_destroy(vdev_id);
+ }
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c
index c5d5e69452b0..22f6fd5f0f74 100644
--- a/tools/testing/selftests/iommu/iommufd_fail_nth.c
+++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c
@@ -47,6 +47,9 @@ static __attribute__((constructor)) void setup_buffer(void)
buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+
+ mfd_buffer = memfd_mmap(BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
+ &mfd);
}
/*
@@ -331,6 +334,42 @@ TEST_FAIL_NTH(basic_fail_nth, map_domain)
return 0;
}
+/* iopt_area_fill_domains() and iopt_area_fill_domain() */
+TEST_FAIL_NTH(basic_fail_nth, map_file_domain)
+{
+ uint32_t ioas_id;
+ __u32 stdev_id;
+ __u32 hwpt_id;
+ __u64 iova;
+
+ self->fd = open("/dev/iommu", O_RDWR);
+ if (self->fd == -1)
+ return -1;
+
+ if (_test_ioctl_ioas_alloc(self->fd, &ioas_id))
+ return -1;
+
+ if (_test_ioctl_set_temp_memory_limit(self->fd, 32))
+ return -1;
+
+ fail_nth_enable();
+
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+ return -1;
+
+ if (_test_ioctl_ioas_map_file(self->fd, ioas_id, mfd, 0, 262144, &iova,
+ IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE))
+ return -1;
+
+ if (_test_ioctl_destroy(self->fd, stdev_id))
+ return -1;
+
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+ return -1;
+ return 0;
+}
+
TEST_FAIL_NTH(basic_fail_nth, map_two_domains)
{
uint32_t ioas_id;
@@ -582,6 +621,8 @@ TEST_FAIL_NTH(basic_fail_nth, device)
uint32_t stdev_id;
uint32_t idev_id;
uint32_t hwpt_id;
+ uint32_t viommu_id;
+ uint32_t vdev_id;
__u64 iova;
self->fd = open("/dev/iommu", O_RDWR);
@@ -624,6 +665,19 @@ TEST_FAIL_NTH(basic_fail_nth, device)
if (_test_cmd_mock_domain_replace(self->fd, stdev_id, hwpt_id, NULL))
return -1;
+
+ if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0,
+ IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id,
+ IOMMU_HWPT_DATA_NONE, 0, 0))
+ return -1;
+
+ if (_test_cmd_viommu_alloc(self->fd, idev_id, hwpt_id,
+ IOMMU_VIOMMU_TYPE_SELFTEST, 0, &viommu_id))
+ return -1;
+
+ if (_test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, 0, &vdev_id))
+ return -1;
+
return 0;
}
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index 40f6f14ce136..d979f5b0efe8 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -22,6 +22,12 @@
#define BIT_MASK(nr) (1UL << ((nr) % __BITS_PER_LONG))
#define BIT_WORD(nr) ((nr) / __BITS_PER_LONG)
+enum {
+ IOPT_PAGES_ACCOUNT_NONE = 0,
+ IOPT_PAGES_ACCOUNT_USER = 1,
+ IOPT_PAGES_ACCOUNT_MM = 2,
+};
+
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
static inline void set_bit(unsigned int nr, unsigned long *addr)
@@ -40,12 +46,28 @@ static inline bool test_bit(unsigned int nr, unsigned long *addr)
static void *buffer;
static unsigned long BUFFER_SIZE;
+static void *mfd_buffer;
+static int mfd;
+
static unsigned long PAGE_SIZE;
#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
#define offsetofend(TYPE, MEMBER) \
(offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))
+static inline void *memfd_mmap(size_t length, int prot, int flags, int *mfd_p)
+{
+ int mfd_flags = (flags & MAP_HUGETLB) ? MFD_HUGETLB : 0;
+ int mfd = memfd_create("buffer", mfd_flags);
+
+ if (mfd <= 0)
+ return MAP_FAILED;
+ if (ftruncate(mfd, length))
+ return MAP_FAILED;
+ *mfd_p = mfd;
+ return mmap(0, length, prot, flags, mfd, 0);
+}
+
/*
* Have the kernel check the refcount on pages. I don't know why a freshly
* mmap'd anon non-compound page starts out with a ref of 3
@@ -234,6 +256,30 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, __u32 ft_i
test_cmd_hwpt_check_iotlb(hwpt_id, i, expected); \
})
+#define test_cmd_dev_check_cache(device_id, cache_id, expected) \
+ ({ \
+ struct iommu_test_cmd test_cmd = { \
+ .size = sizeof(test_cmd), \
+ .op = IOMMU_TEST_OP_DEV_CHECK_CACHE, \
+ .id = device_id, \
+ .check_dev_cache = { \
+ .id = cache_id, \
+ .cache = expected, \
+ }, \
+ }; \
+ ASSERT_EQ(0, ioctl(self->fd, \
+ _IOMMU_TEST_CMD( \
+ IOMMU_TEST_OP_DEV_CHECK_CACHE), \
+ &test_cmd)); \
+ })
+
+#define test_cmd_dev_check_cache_all(device_id, expected) \
+ ({ \
+ int c; \
+ for (c = 0; c < MOCK_DEV_CACHE_NUM; c++) \
+ test_cmd_dev_check_cache(device_id, c, expected); \
+ })
+
static int _test_cmd_hwpt_invalidate(int fd, __u32 hwpt_id, void *reqs,
uint32_t data_type, uint32_t lreq,
uint32_t *nreqs)
@@ -265,6 +311,38 @@ static int _test_cmd_hwpt_invalidate(int fd, __u32 hwpt_id, void *reqs,
data_type, lreq, nreqs)); \
})
+static int _test_cmd_viommu_invalidate(int fd, __u32 viommu_id, void *reqs,
+ uint32_t data_type, uint32_t lreq,
+ uint32_t *nreqs)
+{
+ struct iommu_hwpt_invalidate cmd = {
+ .size = sizeof(cmd),
+ .hwpt_id = viommu_id,
+ .data_type = data_type,
+ .data_uptr = (uint64_t)reqs,
+ .entry_len = lreq,
+ .entry_num = *nreqs,
+ };
+ int rc = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cmd);
+ *nreqs = cmd.entry_num;
+ return rc;
+}
+
+#define test_cmd_viommu_invalidate(viommu, reqs, lreq, nreqs) \
+ ({ \
+ ASSERT_EQ(0, \
+ _test_cmd_viommu_invalidate(self->fd, viommu, reqs, \
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, \
+ lreq, nreqs)); \
+ })
+#define test_err_viommu_invalidate(_errno, viommu_id, reqs, data_type, lreq, \
+ nreqs) \
+ ({ \
+ EXPECT_ERRNO(_errno, _test_cmd_viommu_invalidate( \
+ self->fd, viommu_id, reqs, \
+ data_type, lreq, nreqs)); \
+ })
+
static int _test_cmd_access_replace_ioas(int fd, __u32 access_id,
unsigned int ioas_id)
{
@@ -589,6 +667,47 @@ static int _test_ioctl_ioas_unmap(int fd, unsigned int ioas_id, uint64_t iova,
EXPECT_ERRNO(_errno, _test_ioctl_ioas_unmap(self->fd, self->ioas_id, \
iova, length, NULL))
+static int _test_ioctl_ioas_map_file(int fd, unsigned int ioas_id, int mfd,
+ size_t start, size_t length, __u64 *iova,
+ unsigned int flags)
+{
+ struct iommu_ioas_map_file cmd = {
+ .size = sizeof(cmd),
+ .flags = flags,
+ .ioas_id = ioas_id,
+ .fd = mfd,
+ .start = start,
+ .length = length,
+ };
+ int ret;
+
+ if (flags & IOMMU_IOAS_MAP_FIXED_IOVA)
+ cmd.iova = *iova;
+
+ ret = ioctl(fd, IOMMU_IOAS_MAP_FILE, &cmd);
+ *iova = cmd.iova;
+ return ret;
+}
+
+#define test_ioctl_ioas_map_file(mfd, start, length, iova_p) \
+ ASSERT_EQ(0, \
+ _test_ioctl_ioas_map_file( \
+ self->fd, self->ioas_id, mfd, start, length, iova_p, \
+ IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))
+
+#define test_err_ioctl_ioas_map_file(_errno, mfd, start, length, iova_p) \
+ EXPECT_ERRNO( \
+ _errno, \
+ _test_ioctl_ioas_map_file( \
+ self->fd, self->ioas_id, mfd, start, length, iova_p, \
+ IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))
+
+#define test_ioctl_ioas_map_id_file(ioas_id, mfd, start, length, iova_p) \
+ ASSERT_EQ(0, \
+ _test_ioctl_ioas_map_file( \
+ self->fd, ioas_id, mfd, start, length, iova_p, \
+ IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))
+
static int _test_ioctl_set_temp_memory_limit(int fd, unsigned int limit)
{
struct iommu_test_cmd memlimit_cmd = {
@@ -762,3 +881,58 @@ static int _test_cmd_trigger_iopf(int fd, __u32 device_id, __u32 fault_fd)
#define test_cmd_trigger_iopf(device_id, fault_fd) \
ASSERT_EQ(0, _test_cmd_trigger_iopf(self->fd, device_id, fault_fd))
+
+static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id,
+ __u32 type, __u32 flags, __u32 *viommu_id)
+{
+ struct iommu_viommu_alloc cmd = {
+ .size = sizeof(cmd),
+ .flags = flags,
+ .type = type,
+ .dev_id = device_id,
+ .hwpt_id = hwpt_id,
+ };
+ int ret;
+
+ ret = ioctl(fd, IOMMU_VIOMMU_ALLOC, &cmd);
+ if (ret)
+ return ret;
+ if (viommu_id)
+ *viommu_id = cmd.out_viommu_id;
+ return 0;
+}
+
+#define test_cmd_viommu_alloc(device_id, hwpt_id, type, viommu_id) \
+ ASSERT_EQ(0, _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, \
+ type, 0, viommu_id))
+#define test_err_viommu_alloc(_errno, device_id, hwpt_id, type, viommu_id) \
+ EXPECT_ERRNO(_errno, \
+ _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, \
+ type, 0, viommu_id))
+
+static int _test_cmd_vdevice_alloc(int fd, __u32 viommu_id, __u32 idev_id,
+ __u64 virt_id, __u32 *vdev_id)
+{
+ struct iommu_vdevice_alloc cmd = {
+ .size = sizeof(cmd),
+ .dev_id = idev_id,
+ .viommu_id = viommu_id,
+ .virt_id = virt_id,
+ };
+ int ret;
+
+ ret = ioctl(fd, IOMMU_VDEVICE_ALLOC, &cmd);
+ if (ret)
+ return ret;
+ if (vdev_id)
+ *vdev_id = cmd.out_vdevice_id;
+ return 0;
+}
+
+#define test_cmd_vdevice_alloc(viommu_id, idev_id, virt_id, vdev_id) \
+ ASSERT_EQ(0, _test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, \
+ virt_id, vdev_id))
+#define test_err_vdevice_alloc(_errno, viommu_id, idev_id, virt_id, vdev_id) \
+ EXPECT_ERRNO(_errno, \
+ _test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, \
+ virt_id, vdev_id))