summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-11-21 12:40:50 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-11-21 12:40:50 -0800
commit341d041daae52cd5f014f68c1c7d9039db818fca (patch)
tree87630919e6b6b456c8b7d32c2c5d52de6264b247
parent51ae62a12c242e49229db23b96d03ecc15efc0d1 (diff)
parent6d026e6d48cd2a95407c8fdd8d6187b871401c23 (diff)
Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd
Pull iommufd updates from Jason Gunthorpe: "Several new features and uAPI for iommufd: - IOMMU_IOAS_MAP_FILE allows passing in a file descriptor as the backing memory for an iommu mapping. To date VFIO/iommufd have used VMA's and pin_user_pages(), this now allows using memfds and memfd_pin_folios(). Notably this creates a pure folio path from the memfd to the iommu page table where memory is never broken down to PAGE_SIZE. - IOMMU_IOAS_CHANGE_PROCESS moves the pinned page accounting between two processes. Combined with the above this allows iommufd to support a VMM re-start using exec() where something like qemu would exec() a new version of itself and fd pass the memfds/iommufd/etc to the new process. The memfd allows DMA access to the memory to continue while the new process is getting setup, and the CHANGE_PROCESS updates all the accounting. - Support for fault reporting to userspace on non-PRI HW, such as ARM stall-mode embedded devices. - IOMMU_VIOMMU_ALLOC introduces the concept of a HW/driver backed virtual iommu. This will be used by VMMs to access hardware features that are contained with in a VM. The first use is to inform the kernel of the virtual SID to physical SID mapping when issuing SID based invalidation on ARM. Further uses will tie HW features that are directly accessed by the VM, such as invalidation queue assignment and others. - IOMMU_VDEVICE_ALLOC informs the kernel about the mapping of virtual device to physical device within a VIOMMU. Minimially this is used to translate VM issued cache invalidation commands from virtual to physical device IDs. - Enhancements to IOMMU_HWPT_INVALIDATE and IOMMU_HWPT_ALLOC to work with the VIOMMU - ARM SMMuv3 support for nested translation. Using the VIOMMU and VDEVICE the driver can model this HW's behavior for nested translation. This includes a shared branch from Will" * tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (51 commits) iommu/arm-smmu-v3: Import IOMMUFD module namespace iommufd: IOMMU_IOAS_CHANGE_PROCESS selftest iommufd: Add IOMMU_IOAS_CHANGE_PROCESS iommufd: Lock all IOAS objects iommufd: Export do_update_pinned iommu/arm-smmu-v3: Support IOMMU_HWPT_INVALIDATE using a VIOMMU object iommu/arm-smmu-v3: Allow ATS for IOMMU_DOMAIN_NESTED iommu/arm-smmu-v3: Use S2FWB for NESTED domains iommu/arm-smmu-v3: Support IOMMU_DOMAIN_NESTED iommu/arm-smmu-v3: Support IOMMU_VIOMMU_ALLOC Documentation: userspace-api: iommufd: Update vDEVICE iommufd/selftest: Add vIOMMU coverage for IOMMU_HWPT_INVALIDATE ioctl iommufd/selftest: Add IOMMU_TEST_OP_DEV_CHECK_CACHE test command iommufd/selftest: Add mock_viommu_cache_invalidate iommufd/viommu: Add iommufd_viommu_find_dev helper iommu: Add iommu_copy_struct_from_full_user_array helper iommufd: Allow hwpt_id to carry viommu_id for IOMMU_HWPT_INVALIDATE iommu/viommu: Add cache_invalidate to iommufd_viommu_ops iommufd/selftest: Add IOMMU_VDEVICE_ALLOC test coverage iommufd/viommu: Add IOMMUFD_OBJ_VDEVICE and IOMMU_VDEVICE_ALLOC ioctl ...
-rw-r--r--Documentation/userspace-api/iommufd.rst226
-rw-r--r--drivers/acpi/arm64/iort.c13
-rw-r--r--drivers/iommu/Kconfig9
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/Makefile1
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c401
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c139
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h92
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu.c16
-rw-r--r--drivers/iommu/io-pgtable-arm.c27
-rw-r--r--drivers/iommu/iommu.c10
-rw-r--r--drivers/iommu/iommufd/Kconfig4
-rw-r--r--drivers/iommu/iommufd/Makefile6
-rw-r--r--drivers/iommu/iommufd/driver.c53
-rw-r--r--drivers/iommu/iommufd/fault.c9
-rw-r--r--drivers/iommu/iommufd/hw_pagetable.c113
-rw-r--r--drivers/iommu/iommufd/io_pagetable.c105
-rw-r--r--drivers/iommu/iommufd/io_pagetable.h26
-rw-r--r--drivers/iommu/iommufd/ioas.c259
-rw-r--r--drivers/iommu/iommufd/iommufd_private.h58
-rw-r--r--drivers/iommu/iommufd/iommufd_test.h32
-rw-r--r--drivers/iommu/iommufd/main.c65
-rw-r--r--drivers/iommu/iommufd/pages.c319
-rw-r--r--drivers/iommu/iommufd/selftest.c364
-rw-r--r--drivers/iommu/iommufd/vfio_compat.c7
-rw-r--r--drivers/iommu/iommufd/viommu.c157
-rw-r--r--drivers/vfio/vfio_iommu_type1.c12
-rw-r--r--include/acpi/actbl2.h3
-rw-r--r--include/linux/io-pgtable.h2
-rw-r--r--include/linux/iommu.h67
-rw-r--r--include/linux/iommufd.h108
-rw-r--r--include/linux/mm.h1
-rw-r--r--include/uapi/linux/iommufd.h216
-rw-r--r--include/uapi/linux/vfio.h2
-rw-r--r--mm/gup.c24
-rw-r--r--tools/testing/selftests/iommu/Makefile1
-rw-r--r--tools/testing/selftests/iommu/iommufd.c606
-rw-r--r--tools/testing/selftests/iommu/iommufd_fail_nth.c54
-rw-r--r--tools/testing/selftests/iommu/iommufd_utils.h174
38 files changed, 3354 insertions, 427 deletions
diff --git a/Documentation/userspace-api/iommufd.rst b/Documentation/userspace-api/iommufd.rst
index aa004faed5fd..70289d6815d2 100644
--- a/Documentation/userspace-api/iommufd.rst
+++ b/Documentation/userspace-api/iommufd.rst
@@ -41,46 +41,133 @@ Following IOMMUFD objects are exposed to userspace:
- IOMMUFD_OBJ_DEVICE, representing a device that is bound to iommufd by an
external driver.
-- IOMMUFD_OBJ_HW_PAGETABLE, representing an actual hardware I/O page table
- (i.e. a single struct iommu_domain) managed by the iommu driver.
-
- The IOAS has a list of HW_PAGETABLES that share the same IOVA mapping and
- it will synchronize its mapping with each member HW_PAGETABLE.
+- IOMMUFD_OBJ_HWPT_PAGING, representing an actual hardware I/O page table
+ (i.e. a single struct iommu_domain) managed by the iommu driver. "PAGING"
+ primarly indicates this type of HWPT should be linked to an IOAS. It also
+ indicates that it is backed by an iommu_domain with __IOMMU_DOMAIN_PAGING
+ feature flag. This can be either an UNMANAGED stage-1 domain for a device
+ running in the user space, or a nesting parent stage-2 domain for mappings
+ from guest-level physical addresses to host-level physical addresses.
+
+ The IOAS has a list of HWPT_PAGINGs that share the same IOVA mapping and
+ it will synchronize its mapping with each member HWPT_PAGING.
+
+- IOMMUFD_OBJ_HWPT_NESTED, representing an actual hardware I/O page table
+ (i.e. a single struct iommu_domain) managed by user space (e.g. guest OS).
+ "NESTED" indicates that this type of HWPT should be linked to an HWPT_PAGING.
+ It also indicates that it is backed by an iommu_domain that has a type of
+ IOMMU_DOMAIN_NESTED. This must be a stage-1 domain for a device running in
+ the user space (e.g. in a guest VM enabling the IOMMU nested translation
+ feature.) As such, it must be created with a given nesting parent stage-2
+ domain to associate to. This nested stage-1 page table managed by the user
+ space usually has mappings from guest-level I/O virtual addresses to guest-
+ level physical addresses.
+
+- IOMMUFD_OBJ_VIOMMU, representing a slice of the physical IOMMU instance,
+ passed to or shared with a VM. It may be some HW-accelerated virtualization
+ features and some SW resources used by the VM. For examples:
+
+ * Security namespace for guest owned ID, e.g. guest-controlled cache tags
+ * Non-device-affiliated event reporting, e.g. invalidation queue errors
+ * Access to a sharable nesting parent pagetable across physical IOMMUs
+ * Virtualization of various platforms IDs, e.g. RIDs and others
+ * Delivery of paravirtualized invalidation
+ * Direct assigned invalidation queues
+ * Direct assigned interrupts
+
+ Such a vIOMMU object generally has the access to a nesting parent pagetable
+ to support some HW-accelerated virtualization features. So, a vIOMMU object
+ must be created given a nesting parent HWPT_PAGING object, and then it would
+ encapsulate that HWPT_PAGING object. Therefore, a vIOMMU object can be used
+ to allocate an HWPT_NESTED object in place of the encapsulated HWPT_PAGING.
+
+ .. note::
+
+ The name "vIOMMU" isn't necessarily identical to a virtualized IOMMU in a
+ VM. A VM can have one giant virtualized IOMMU running on a machine having
+ multiple physical IOMMUs, in which case the VMM will dispatch the requests
+ or configurations from this single virtualized IOMMU instance to multiple
+ vIOMMU objects created for individual slices of different physical IOMMUs.
+ In other words, a vIOMMU object is always a representation of one physical
+ IOMMU, not necessarily of a virtualized IOMMU. For VMMs that want the full
+ virtualization features from physical IOMMUs, it is suggested to build the
+ same number of virtualized IOMMUs as the number of physical IOMMUs, so the
+ passed-through devices would be connected to their own virtualized IOMMUs
+ backed by corresponding vIOMMU objects, in which case a guest OS would do
+ the "dispatch" naturally instead of VMM trappings.
+
+- IOMMUFD_OBJ_VDEVICE, representing a virtual device for an IOMMUFD_OBJ_DEVICE
+ against an IOMMUFD_OBJ_VIOMMU. This virtual device holds the device's virtual
+ information or attributes (related to the vIOMMU) in a VM. An immediate vDATA
+ example can be the virtual ID of the device on a vIOMMU, which is a unique ID
+ that VMM assigns to the device for a translation channel/port of the vIOMMU,
+ e.g. vSID of ARM SMMUv3, vDeviceID of AMD IOMMU, and vRID of Intel VT-d to a
+ Context Table. Potential use cases of some advanced security information can
+ be forwarded via this object too, such as security level or realm information
+ in a Confidential Compute Architecture. A VMM should create a vDEVICE object
+ to forward all the device information in a VM, when it connects a device to a
+ vIOMMU, which is a separate ioctl call from attaching the same device to an
+ HWPT_PAGING that the vIOMMU holds.
All user-visible objects are destroyed via the IOMMU_DESTROY uAPI.
-The diagram below shows relationship between user-visible objects and kernel
+The diagrams below show relationships between user-visible objects and kernel
datastructures (external to iommufd), with numbers referred to operations
creating the objects and links::
- _________________________________________________________
- | iommufd |
- | [1] |
- | _________________ |
- | | | |
- | | | |
- | | | |
- | | | |
- | | | |
- | | | |
- | | | [3] [2] |
- | | | ____________ __________ |
- | | IOAS |<--| |<------| | |
- | | | |HW_PAGETABLE| | DEVICE | |
- | | | |____________| |__________| |
- | | | | | |
- | | | | | |
- | | | | | |
- | | | | | |
- | | | | | |
- | |_________________| | | |
- | | | | |
- |_________|___________________|___________________|_______|
- | | |
- | _____v______ _______v_____
- | PFN storage | | | |
- |------------>|iommu_domain| |struct device|
- |____________| |_____________|
+ _______________________________________________________________________
+ | iommufd (HWPT_PAGING only) |
+ | |
+ | [1] [3] [2] |
+ | ________________ _____________ ________ |
+ | | | | | | | |
+ | | IOAS |<---| HWPT_PAGING |<---------------------| DEVICE | |
+ | |________________| |_____________| |________| |
+ | | | | |
+ |_________|____________________|__________________________________|_____|
+ | | |
+ | ______v_____ ___v__
+ | PFN storage | (paging) | |struct|
+ |------------>|iommu_domain|<-----------------------|device|
+ |____________| |______|
+
+ _______________________________________________________________________
+ | iommufd (with HWPT_NESTED) |
+ | |
+ | [1] [3] [4] [2] |
+ | ________________ _____________ _____________ ________ |
+ | | | | | | | | | |
+ | | IOAS |<---| HWPT_PAGING |<---| HWPT_NESTED |<--| DEVICE | |
+ | |________________| |_____________| |_____________| |________| |
+ | | | | | |
+ |_________|____________________|__________________|_______________|_____|
+ | | | |
+ | ______v_____ ______v_____ ___v__
+ | PFN storage | (paging) | | (nested) | |struct|
+ |------------>|iommu_domain|<----|iommu_domain|<----|device|
+ |____________| |____________| |______|
+
+ _______________________________________________________________________
+ | iommufd (with vIOMMU/vDEVICE) |
+ | |
+ | [5] [6] |
+ | _____________ _____________ |
+ | | | | | |
+ | |----------------| vIOMMU |<---| vDEVICE |<----| |
+ | | | | |_____________| | |
+ | | | | | |
+ | | [1] | | [4] | [2] |
+ | | ______ | | _____________ _|______ |
+ | | | | | [3] | | | | | |
+ | | | IOAS |<---|(HWPT_PAGING)|<---| HWPT_NESTED |<--| DEVICE | |
+ | | |______| |_____________| |_____________| |________| |
+ | | | | | | |
+ |______|________|______________|__________________|_______________|_____|
+ | | | | |
+ ______v_____ | ______v_____ ______v_____ ___v__
+ | struct | | PFN | (paging) | | (nested) | |struct|
+ |iommu_device| |------>|iommu_domain|<----|iommu_domain|<----|device|
+ |____________| storage|____________| |____________| |______|
1. IOMMUFD_OBJ_IOAS is created via the IOMMU_IOAS_ALLOC uAPI. An iommufd can
hold multiple IOAS objects. IOAS is the most generic object and does not
@@ -94,21 +181,63 @@ creating the objects and links::
device. The driver must also set the driver_managed_dma flag and must not
touch the device until this operation succeeds.
-3. IOMMUFD_OBJ_HW_PAGETABLE is created when an external driver calls the IOMMUFD
- kAPI to attach a bound device to an IOAS. Similarly the external driver uAPI
- allows userspace to initiate the attaching operation. If a compatible
- pagetable already exists then it is reused for the attachment. Otherwise a
- new pagetable object and iommu_domain is created. Successful completion of
- this operation sets up the linkages among IOAS, device and iommu_domain. Once
- this completes the device could do DMA.
-
- Every iommu_domain inside the IOAS is also represented to userspace as a
- HW_PAGETABLE object.
+3. IOMMUFD_OBJ_HWPT_PAGING can be created in two ways:
+
+ * IOMMUFD_OBJ_HWPT_PAGING is automatically created when an external driver
+ calls the IOMMUFD kAPI to attach a bound device to an IOAS. Similarly the
+ external driver uAPI allows userspace to initiate the attaching operation.
+ If a compatible member HWPT_PAGING object exists in the IOAS's HWPT_PAGING
+ list, then it will be reused. Otherwise a new HWPT_PAGING that represents
+ an iommu_domain to userspace will be created, and then added to the list.
+ Successful completion of this operation sets up the linkages among IOAS,
+ device and iommu_domain. Once this completes the device could do DMA.
+
+ * IOMMUFD_OBJ_HWPT_PAGING can be manually created via the IOMMU_HWPT_ALLOC
+ uAPI, provided an ioas_id via @pt_id to associate the new HWPT_PAGING to
+ the corresponding IOAS object. The benefit of this manual allocation is to
+ allow allocation flags (defined in enum iommufd_hwpt_alloc_flags), e.g. it
+ allocates a nesting parent HWPT_PAGING if the IOMMU_HWPT_ALLOC_NEST_PARENT
+ flag is set.
+
+4. IOMMUFD_OBJ_HWPT_NESTED can be only manually created via the IOMMU_HWPT_ALLOC
+ uAPI, provided an hwpt_id or a viommu_id of a vIOMMU object encapsulating a
+ nesting parent HWPT_PAGING via @pt_id to associate the new HWPT_NESTED object
+ to the corresponding HWPT_PAGING object. The associating HWPT_PAGING object
+ must be a nesting parent manually allocated via the same uAPI previously with
+ an IOMMU_HWPT_ALLOC_NEST_PARENT flag, otherwise the allocation will fail. The
+ allocation will be further validated by the IOMMU driver to ensure that the
+ nesting parent domain and the nested domain being allocated are compatible.
+ Successful completion of this operation sets up linkages among IOAS, device,
+ and iommu_domains. Once this completes the device could do DMA via a 2-stage
+ translation, a.k.a nested translation. Note that multiple HWPT_NESTED objects
+ can be allocated by (and then associated to) the same nesting parent.
.. note::
- Future IOMMUFD updates will provide an API to create and manipulate the
- HW_PAGETABLE directly.
+ Either a manual IOMMUFD_OBJ_HWPT_PAGING or an IOMMUFD_OBJ_HWPT_NESTED is
+ created via the same IOMMU_HWPT_ALLOC uAPI. The difference is at the type
+ of the object passed in via the @pt_id field of struct iommufd_hwpt_alloc.
+
+5. IOMMUFD_OBJ_VIOMMU can be only manually created via the IOMMU_VIOMMU_ALLOC
+ uAPI, provided a dev_id (for the device's physical IOMMU to back the vIOMMU)
+ and an hwpt_id (to associate the vIOMMU to a nesting parent HWPT_PAGING). The
+ iommufd core will link the vIOMMU object to the struct iommu_device that the
+ struct device is behind. And an IOMMU driver can implement a viommu_alloc op
+ to allocate its own vIOMMU data structure embedding the core-level structure
+ iommufd_viommu and some driver-specific data. If necessary, the driver can
+ also configure its HW virtualization feature for that vIOMMU (and thus for
+ the VM). Successful completion of this operation sets up the linkages between
+ the vIOMMU object and the HWPT_PAGING, then this vIOMMU object can be used
+ as a nesting parent object to allocate an HWPT_NESTED object described above.
+
+6. IOMMUFD_OBJ_VDEVICE can be only manually created via the IOMMU_VDEVICE_ALLOC
+ uAPI, provided a viommu_id for an iommufd_viommu object and a dev_id for an
+ iommufd_device object. The vDEVICE object will be the binding between these
+ two parent objects. Another @virt_id will be also set via the uAPI providing
+ the iommufd core an index to store the vDEVICE object to a vDEVICE array per
+ vIOMMU. If necessary, the IOMMU driver may choose to implement a vdevce_alloc
+ op to init its HW for virtualization feature related to a vDEVICE. Successful
+ completion of this operation sets up the linkages between vIOMMU and device.
A device can only bind to an iommufd due to DMA ownership claim and attach to at
most one IOAS object (no support of PASID yet).
@@ -120,7 +249,10 @@ User visible objects are backed by following datastructures:
- iommufd_ioas for IOMMUFD_OBJ_IOAS.
- iommufd_device for IOMMUFD_OBJ_DEVICE.
-- iommufd_hw_pagetable for IOMMUFD_OBJ_HW_PAGETABLE.
+- iommufd_hwpt_paging for IOMMUFD_OBJ_HWPT_PAGING.
+- iommufd_hwpt_nested for IOMMUFD_OBJ_HWPT_NESTED.
+- iommufd_viommu for IOMMUFD_OBJ_VIOMMU.
+- iommufd_vdevice for IOMMUFD_OBJ_VDEVICE.
Several terminologies when looking at these datastructures:
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 4c745a26226b..1f7e4c691d9e 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1218,6 +1218,17 @@ static bool iort_pci_rc_supports_ats(struct acpi_iort_node *node)
return pci_rc->ats_attribute & ACPI_IORT_ATS_SUPPORTED;
}
+static bool iort_pci_rc_supports_canwbs(struct acpi_iort_node *node)
+{
+ struct acpi_iort_memory_access *memory_access;
+ struct acpi_iort_root_complex *pci_rc;
+
+ pci_rc = (struct acpi_iort_root_complex *)node->node_data;
+ memory_access =
+ (struct acpi_iort_memory_access *)&pci_rc->memory_properties;
+ return memory_access->memory_flags & ACPI_IORT_MF_CANWBS;
+}
+
static int iort_iommu_xlate(struct device *dev, struct acpi_iort_node *node,
u32 streamid)
{
@@ -1335,6 +1346,8 @@ int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
fwspec = dev_iommu_fwspec_get(dev);
if (fwspec && iort_pci_rc_supports_ats(node))
fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS;
+ if (fwspec && iort_pci_rc_supports_canwbs(node))
+ fwspec->flags |= IOMMU_FWSPEC_PCI_RC_CANWBS;
} else {
node = iort_scan_node(ACPI_IORT_NODE_NAMED_COMPONENT,
iort_match_node_callback, dev);
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index b3aa1f5d5321..0c9bceb1653d 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -415,6 +415,15 @@ config ARM_SMMU_V3_SVA
Say Y here if your system supports SVA extensions such as PCIe PASID
and PRI.
+config ARM_SMMU_V3_IOMMUFD
+ bool "Enable IOMMUFD features for ARM SMMUv3 (EXPERIMENTAL)"
+ depends on IOMMUFD
+ help
+ Support for IOMMUFD features intended to support virtual machines
+ with accelerated virtual IOMMUs.
+
+ Say Y here if you are doing development and testing on this feature.
+
config ARM_SMMU_V3_KUNIT_TEST
tristate "KUnit tests for arm-smmu-v3 driver" if !KUNIT_ALL_TESTS
depends on KUNIT
diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile
index dc98c88b48c8..493a659cc66b 100644
--- a/drivers/iommu/arm/arm-smmu-v3/Makefile
+++ b/drivers/iommu/arm/arm-smmu-v3/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o
arm_smmu_v3-y := arm-smmu-v3.o
+arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_IOMMUFD) += arm-smmu-v3-iommufd.o
arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o
arm_smmu_v3-$(CONFIG_TEGRA241_CMDQV) += tegra241-cmdqv.o
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
new file mode 100644
index 000000000000..6cc14d82399f
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -0,0 +1,401 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+
+#include <uapi/linux/iommufd.h>
+
+#include "arm-smmu-v3.h"
+
+void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type)
+{
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct iommu_hw_info_arm_smmuv3 *info;
+ u32 __iomem *base_idr;
+ unsigned int i;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return ERR_PTR(-ENOMEM);
+
+ base_idr = master->smmu->base + ARM_SMMU_IDR0;
+ for (i = 0; i <= 5; i++)
+ info->idr[i] = readl_relaxed(base_idr + i);
+ info->iidr = readl_relaxed(master->smmu->base + ARM_SMMU_IIDR);
+ info->aidr = readl_relaxed(master->smmu->base + ARM_SMMU_AIDR);
+
+ *length = sizeof(*info);
+ *type = IOMMU_HW_INFO_TYPE_ARM_SMMUV3;
+
+ return info;
+}
+
+static void arm_smmu_make_nested_cd_table_ste(
+ struct arm_smmu_ste *target, struct arm_smmu_master *master,
+ struct arm_smmu_nested_domain *nested_domain, bool ats_enabled)
+{
+ arm_smmu_make_s2_domain_ste(
+ target, master, nested_domain->vsmmu->s2_parent, ats_enabled);
+
+ target->data[0] = cpu_to_le64(STRTAB_STE_0_V |
+ FIELD_PREP(STRTAB_STE_0_CFG,
+ STRTAB_STE_0_CFG_NESTED));
+ target->data[0] |= nested_domain->ste[0] &
+ ~cpu_to_le64(STRTAB_STE_0_CFG);
+ target->data[1] |= nested_domain->ste[1];
+}
+
+/*
+ * Create a physical STE from the virtual STE that userspace provided when it
+ * created the nested domain. Using the vSTE userspace can request:
+ * - Non-valid STE
+ * - Abort STE
+ * - Bypass STE (install the S2, no CD table)
+ * - CD table STE (install the S2 and the userspace CD table)
+ */
+static void arm_smmu_make_nested_domain_ste(
+ struct arm_smmu_ste *target, struct arm_smmu_master *master,
+ struct arm_smmu_nested_domain *nested_domain, bool ats_enabled)
+{
+ unsigned int cfg =
+ FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0]));
+
+ /*
+ * Userspace can request a non-valid STE through the nesting interface.
+ * We relay that into an abort physical STE with the intention that
+ * C_BAD_STE for this SID can be generated to userspace.
+ */
+ if (!(nested_domain->ste[0] & cpu_to_le64(STRTAB_STE_0_V)))
+ cfg = STRTAB_STE_0_CFG_ABORT;
+
+ switch (cfg) {
+ case STRTAB_STE_0_CFG_S1_TRANS:
+ arm_smmu_make_nested_cd_table_ste(target, master, nested_domain,
+ ats_enabled);
+ break;
+ case STRTAB_STE_0_CFG_BYPASS:
+ arm_smmu_make_s2_domain_ste(target, master,
+ nested_domain->vsmmu->s2_parent,
+ ats_enabled);
+ break;
+ case STRTAB_STE_0_CFG_ABORT:
+ default:
+ arm_smmu_make_abort_ste(target);
+ break;
+ }
+}
+
+static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct arm_smmu_nested_domain *nested_domain =
+ to_smmu_nested_domain(domain);
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct arm_smmu_attach_state state = {
+ .master = master,
+ .old_domain = iommu_get_domain_for_dev(dev),
+ .ssid = IOMMU_NO_PASID,
+ };
+ struct arm_smmu_ste ste;
+ int ret;
+
+ if (nested_domain->vsmmu->smmu != master->smmu)
+ return -EINVAL;
+ if (arm_smmu_ssids_in_use(&master->cd_table))
+ return -EBUSY;
+
+ mutex_lock(&arm_smmu_asid_lock);
+ /*
+ * The VM has to control the actual ATS state at the PCI device because
+ * we forward the invalidations directly from the VM. If the VM doesn't
+ * think ATS is on it will not generate ATC flushes and the ATC will
+ * become incoherent. Since we can't access the actual virtual PCI ATS
+ * config bit here base this off the EATS value in the STE. If the EATS
+ * is set then the VM must generate ATC flushes.
+ */
+ state.disable_ats = !nested_domain->enable_ats;
+ ret = arm_smmu_attach_prepare(&state, domain);
+ if (ret) {
+ mutex_unlock(&arm_smmu_asid_lock);
+ return ret;
+ }
+
+ arm_smmu_make_nested_domain_ste(&ste, master, nested_domain,
+ state.ats_enabled);
+ arm_smmu_install_ste_for_dev(master, &ste);
+ arm_smmu_attach_commit(&state);
+ mutex_unlock(&arm_smmu_asid_lock);
+ return 0;
+}
+
+static void arm_smmu_domain_nested_free(struct iommu_domain *domain)
+{
+ kfree(to_smmu_nested_domain(domain));
+}
+
+static const struct iommu_domain_ops arm_smmu_nested_ops = {
+ .attach_dev = arm_smmu_attach_dev_nested,
+ .free = arm_smmu_domain_nested_free,
+};
+
+static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg,
+ bool *enable_ats)
+{
+ unsigned int eats;
+ unsigned int cfg;
+
+ if (!(arg->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) {
+ memset(arg->ste, 0, sizeof(arg->ste));
+ return 0;
+ }
+
+ /* EIO is reserved for invalid STE data. */
+ if ((arg->ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) ||
+ (arg->ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED))
+ return -EIO;
+
+ cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(arg->ste[0]));
+ if (cfg != STRTAB_STE_0_CFG_ABORT && cfg != STRTAB_STE_0_CFG_BYPASS &&
+ cfg != STRTAB_STE_0_CFG_S1_TRANS)
+ return -EIO;
+
+ /*
+ * Only Full ATS or ATS UR is supported
+ * The EATS field will be set by arm_smmu_make_nested_domain_ste()
+ */
+ eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg->ste[1]));
+ arg->ste[1] &= ~cpu_to_le64(STRTAB_STE_1_EATS);
+ if (eats != STRTAB_STE_1_EATS_ABT && eats != STRTAB_STE_1_EATS_TRANS)
+ return -EIO;
+
+ if (cfg == STRTAB_STE_0_CFG_S1_TRANS)
+ *enable_ats = (eats == STRTAB_STE_1_EATS_TRANS);
+ return 0;
+}
+
+static struct iommu_domain *
+arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
+ const struct iommu_user_data *user_data)
+{
+ struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core);
+ const u32 SUPPORTED_FLAGS = IOMMU_HWPT_FAULT_ID_VALID;
+ struct arm_smmu_nested_domain *nested_domain;
+ struct iommu_hwpt_arm_smmuv3 arg;
+ bool enable_ats = false;
+ int ret;
+
+ /*
+ * Faults delivered to the nested domain are faults that originated by
+ * the S1 in the domain. The core code will match all PASIDs when
+ * delivering the fault due to user_pasid_table
+ */
+ if (flags & ~SUPPORTED_FLAGS)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ ret = iommu_copy_struct_from_user(&arg, user_data,
+ IOMMU_HWPT_DATA_ARM_SMMUV3, ste);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = arm_smmu_validate_vste(&arg, &enable_ats);
+ if (ret)
+ return ERR_PTR(ret);
+
+ nested_domain = kzalloc(sizeof(*nested_domain), GFP_KERNEL_ACCOUNT);
+ if (!nested_domain)
+ return ERR_PTR(-ENOMEM);
+
+ nested_domain->domain.type = IOMMU_DOMAIN_NESTED;
+ nested_domain->domain.ops = &arm_smmu_nested_ops;
+ nested_domain->enable_ats = enable_ats;
+ nested_domain->vsmmu = vsmmu;
+ nested_domain->ste[0] = arg.ste[0];
+ nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS);
+
+ return &nested_domain->domain;
+}
+
+static int arm_vsmmu_vsid_to_sid(struct arm_vsmmu *vsmmu, u32 vsid, u32 *sid)
+{
+ struct arm_smmu_master *master;
+ struct device *dev;
+ int ret = 0;
+
+ xa_lock(&vsmmu->core.vdevs);
+ dev = iommufd_viommu_find_dev(&vsmmu->core, (unsigned long)vsid);
+ if (!dev) {
+ ret = -EIO;
+ goto unlock;
+ }
+ master = dev_iommu_priv_get(dev);
+
+ /* At this moment, iommufd only supports PCI device that has one SID */
+ if (sid)
+ *sid = master->streams[0].id;
+unlock:
+ xa_unlock(&vsmmu->core.vdevs);
+ return ret;
+}
+
+/* This is basically iommu_viommu_arm_smmuv3_invalidate in u64 for conversion */
+struct arm_vsmmu_invalidation_cmd {
+ union {
+ u64 cmd[2];
+ struct iommu_viommu_arm_smmuv3_invalidate ucmd;
+ };
+};
+
+/*
+ * Convert, in place, the raw invalidation command into an internal format that
+ * can be passed to arm_smmu_cmdq_issue_cmdlist(). Internally commands are
+ * stored in CPU endian.
+ *
+ * Enforce the VMID or SID on the command.
+ */
+static int arm_vsmmu_convert_user_cmd(struct arm_vsmmu *vsmmu,
+ struct arm_vsmmu_invalidation_cmd *cmd)
+{
+ /* Commands are le64 stored in u64 */
+ cmd->cmd[0] = le64_to_cpu(cmd->ucmd.cmd[0]);
+ cmd->cmd[1] = le64_to_cpu(cmd->ucmd.cmd[1]);
+
+ switch (cmd->cmd[0] & CMDQ_0_OP) {
+ case CMDQ_OP_TLBI_NSNH_ALL:
+ /* Convert to NH_ALL */
+ cmd->cmd[0] = CMDQ_OP_TLBI_NH_ALL |
+ FIELD_PREP(CMDQ_TLBI_0_VMID, vsmmu->vmid);
+ cmd->cmd[1] = 0;
+ break;
+ case CMDQ_OP_TLBI_NH_VA:
+ case CMDQ_OP_TLBI_NH_VAA:
+ case CMDQ_OP_TLBI_NH_ALL:
+ case CMDQ_OP_TLBI_NH_ASID:
+ cmd->cmd[0] &= ~CMDQ_TLBI_0_VMID;
+ cmd->cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, vsmmu->vmid);
+ break;
+ case CMDQ_OP_ATC_INV:
+ case CMDQ_OP_CFGI_CD:
+ case CMDQ_OP_CFGI_CD_ALL: {
+ u32 sid, vsid = FIELD_GET(CMDQ_CFGI_0_SID, cmd->cmd[0]);
+
+ if (arm_vsmmu_vsid_to_sid(vsmmu, vsid, &sid))
+ return -EIO;
+ cmd->cmd[0] &= ~CMDQ_CFGI_0_SID;
+ cmd->cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, sid);
+ break;
+ }
+ default:
+ return -EIO;
+ }
+ return 0;
+}
+
+static int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu,
+ struct iommu_user_data_array *array)
+{
+ struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core);
+ struct arm_smmu_device *smmu = vsmmu->smmu;
+ struct arm_vsmmu_invalidation_cmd *last;
+ struct arm_vsmmu_invalidation_cmd *cmds;
+ struct arm_vsmmu_invalidation_cmd *cur;
+ struct arm_vsmmu_invalidation_cmd *end;
+ int ret;
+
+ cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL);
+ if (!cmds)
+ return -ENOMEM;
+ cur = cmds;
+ end = cmds + array->entry_num;
+
+ static_assert(sizeof(*cmds) == 2 * sizeof(u64));
+ ret = iommu_copy_struct_from_full_user_array(
+ cmds, sizeof(*cmds), array,
+ IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3);
+ if (ret)
+ goto out;
+
+ last = cmds;
+ while (cur != end) {
+ ret = arm_vsmmu_convert_user_cmd(vsmmu, cur);
+ if (ret)
+ goto out;
+
+ /* FIXME work in blocks of CMDQ_BATCH_ENTRIES and copy each block? */
+ cur++;
+ if (cur != end && (cur - last) != CMDQ_BATCH_ENTRIES - 1)
+ continue;
+
+ /* FIXME always uses the main cmdq rather than trying to group by type */
+ ret = arm_smmu_cmdq_issue_cmdlist(smmu, &smmu->cmdq, last->cmd,
+ cur - last, true);
+ if (ret) {
+ cur--;
+ goto out;
+ }
+ last = cur;
+ }
+out:
+ array->entry_num = cur - cmds;
+ kfree(cmds);
+ return ret;
+}
+
+static const struct iommufd_viommu_ops arm_vsmmu_ops = {
+ .alloc_domain_nested = arm_vsmmu_alloc_domain_nested,
+ .cache_invalidate = arm_vsmmu_cache_invalidate,
+};
+
+struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev,
+ struct iommu_domain *parent,
+ struct iommufd_ctx *ictx,
+ unsigned int viommu_type)
+{
+ struct arm_smmu_device *smmu =
+ iommu_get_iommu_dev(dev, struct arm_smmu_device, iommu);
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct arm_smmu_domain *s2_parent = to_smmu_domain(parent);
+ struct arm_vsmmu *vsmmu;
+
+ if (viommu_type != IOMMU_VIOMMU_TYPE_ARM_SMMUV3)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (!(smmu->features & ARM_SMMU_FEAT_NESTING))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (s2_parent->smmu != master->smmu)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * FORCE_SYNC is not set with FEAT_NESTING. Some study of the exact HW
+ * defect is needed to determine if arm_vsmmu_cache_invalidate() needs
+ * any change to remove this.
+ */
+ if (WARN_ON(smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ /*
+ * Must support some way to prevent the VM from bypassing the cache
+ * because VFIO currently does not do any cache maintenance. canwbs
+ * indicates the device is fully coherent and no cache maintenance is
+ * ever required, even for PCI No-Snoop. S2FWB means the S1 can't make
+ * things non-coherent using the memattr, but No-Snoop behavior is not
+ * effected.
+ */
+ if (!arm_smmu_master_canwbs(master) &&
+ !(smmu->features & ARM_SMMU_FEAT_S2FWB))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ vsmmu = iommufd_viommu_alloc(ictx, struct arm_vsmmu, core,
+ &arm_vsmmu_ops);
+ if (IS_ERR(vsmmu))
+ return ERR_CAST(vsmmu);
+
+ vsmmu->smmu = smmu;
+ vsmmu->s2_parent = s2_parent;
+ /* FIXME Move VMID allocation from the S2 domain allocation to here */
+ vsmmu->vmid = s2_parent->s2_cfg.vmid;
+
+ return &vsmmu->core;
+}
+
+MODULE_IMPORT_NS(IOMMUFD);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 353fea58cd31..10b4dbc8d027 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -295,6 +295,7 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
case CMDQ_OP_TLBI_NH_ASID:
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
fallthrough;
+ case CMDQ_OP_TLBI_NH_ALL:
case CMDQ_OP_TLBI_S12_VMALL:
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
break;
@@ -765,9 +766,9 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
* insert their own list of commands then all of the commands from one
* CPU will appear before any of the commands from the other CPU.
*/
-static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
- struct arm_smmu_cmdq *cmdq,
- u64 *cmds, int n, bool sync)
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq, u64 *cmds, int n,
+ bool sync)
{
u64 cmd_sync[CMDQ_ENT_DWORDS];
u32 prod;
@@ -1045,7 +1046,8 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits)
/* S2 translates */
if (cfg & BIT(1)) {
used_bits[1] |=
- cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG);
+ cpu_to_le64(STRTAB_STE_1_S2FWB | STRTAB_STE_1_EATS |
+ STRTAB_STE_1_SHCFG);
used_bits[2] |=
cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR |
STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI |
@@ -1549,7 +1551,6 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
}
}
-VISIBLE_IF_KUNIT
void arm_smmu_make_abort_ste(struct arm_smmu_ste *target)
{
memset(target, 0, sizeof(*target));
@@ -1632,7 +1633,6 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
}
EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_cdtable_ste);
-VISIBLE_IF_KUNIT
void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
struct arm_smmu_master *master,
struct arm_smmu_domain *smmu_domain,
@@ -1655,6 +1655,8 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
FIELD_PREP(STRTAB_STE_1_EATS,
ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0));
+ if (pgtbl_cfg->quirks & IO_PGTABLE_QUIRK_ARM_S2FWB)
+ target->data[1] |= cpu_to_le64(STRTAB_STE_1_S2FWB);
if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR)
target->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
STRTAB_STE_1_SHCFG_INCOMING));
@@ -2105,7 +2107,16 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
if (!master->ats_enabled)
continue;
- arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, &cmd);
+ if (master_domain->nested_ats_flush) {
+ /*
+ * If a S2 used as a nesting parent is changed we have
+ * no option but to completely flush the ATC.
+ */
+ arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd);
+ } else {
+ arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size,
+ &cmd);
+ }
for (i = 0; i < master->num_streams; i++) {
cmd.atc.sid = master->streams[i].id;
@@ -2232,6 +2243,15 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
}
__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
+ if (smmu_domain->nest_parent) {
+ /*
+ * When the S2 domain changes all the nested S1 ASIDs have to be
+ * flushed too.
+ */
+ cmd.opcode = CMDQ_OP_TLBI_NH_ALL;
+ arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd);
+ }
+
/*
* Unfortunately, this can't be leaf-only since we may have
* zapped an entire table.
@@ -2293,6 +2313,8 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap)
case IOMMU_CAP_CACHE_COHERENCY:
/* Assume that a coherent TCU implies coherent TBUs */
return master->smmu->features & ARM_SMMU_FEAT_COHERENCY;
+ case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
+ return arm_smmu_master_canwbs(master);
case IOMMU_CAP_NOEXEC:
case IOMMU_CAP_DEFERRED_FLUSH:
return true;
@@ -2303,6 +2325,26 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap)
}
}
+static bool arm_smmu_enforce_cache_coherency(struct iommu_domain *domain)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct arm_smmu_master_domain *master_domain;
+ unsigned long flags;
+ bool ret = true;
+
+ spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+ list_for_each_entry(master_domain, &smmu_domain->devices,
+ devices_elm) {
+ if (!arm_smmu_master_canwbs(master_domain->master)) {
+ ret = false;
+ break;
+ }
+ }
+ smmu_domain->enforce_cache_coherency = ret;
+ spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+ return ret;
+}
+
struct arm_smmu_domain *arm_smmu_domain_alloc(void)
{
struct arm_smmu_domain *smmu_domain;
@@ -2442,6 +2484,9 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain,
pgtbl_cfg.oas = smmu->oas;
fmt = ARM_64_LPAE_S2;
finalise_stage_fn = arm_smmu_domain_finalise_s2;
+ if ((smmu->features & ARM_SMMU_FEAT_S2FWB) &&
+ (flags & IOMMU_HWPT_ALLOC_NEST_PARENT))
+ pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_S2FWB;
break;
default:
return -EINVAL;
@@ -2483,8 +2528,8 @@ arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid)
}
}
-static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master,
- const struct arm_smmu_ste *target)
+void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master,
+ const struct arm_smmu_ste *target)
{
int i, j;
struct arm_smmu_device *smmu = master->smmu;
@@ -2595,7 +2640,7 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master)
static struct arm_smmu_master_domain *
arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain,
struct arm_smmu_master *master,
- ioasid_t ssid)
+ ioasid_t ssid, bool nested_ats_flush)
{
struct arm_smmu_master_domain *master_domain;
@@ -2604,7 +2649,8 @@ arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain,
list_for_each_entry(master_domain, &smmu_domain->devices,
devices_elm) {
if (master_domain->master == master &&
- master_domain->ssid == ssid)
+ master_domain->ssid == ssid &&
+ master_domain->nested_ats_flush == nested_ats_flush)
return master_domain;
}
return NULL;
@@ -2624,6 +2670,8 @@ to_smmu_domain_devices(struct iommu_domain *domain)
if ((domain->type & __IOMMU_DOMAIN_PAGING) ||
domain->type == IOMMU_DOMAIN_SVA)
return to_smmu_domain(domain);
+ if (domain->type == IOMMU_DOMAIN_NESTED)
+ return to_smmu_nested_domain(domain)->vsmmu->s2_parent;
return NULL;
}
@@ -2633,13 +2681,18 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master,
{
struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(domain);
struct arm_smmu_master_domain *master_domain;
+ bool nested_ats_flush = false;
unsigned long flags;
if (!smmu_domain)
return;
+ if (domain->type == IOMMU_DOMAIN_NESTED)
+ nested_ats_flush = to_smmu_nested_domain(domain)->enable_ats;
+
spin_lock_irqsave(&smmu_domain->devices_lock, flags);
- master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid);
+ master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid,
+ nested_ats_flush);
if (master_domain) {
list_del(&master_domain->devices_elm);
kfree(master_domain);
@@ -2649,16 +2702,6 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master,
spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
}
-struct arm_smmu_attach_state {
- /* Inputs */
- struct iommu_domain *old_domain;
- struct arm_smmu_master *master;
- bool cd_needs_ats;
- ioasid_t ssid;
- /* Resulting state */
- bool ats_enabled;
-};
-
/*
* Start the sequence to attach a domain to a master. The sequence contains three
* steps:
@@ -2679,8 +2722,8 @@ struct arm_smmu_attach_state {
* new_domain can be a non-paging domain. In this case ATS will not be enabled,
* and invalidations won't be tracked.
*/
-static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
- struct iommu_domain *new_domain)
+int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
+ struct iommu_domain *new_domain)
{
struct arm_smmu_master *master = state->master;
struct arm_smmu_master_domain *master_domain;
@@ -2706,7 +2749,8 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
* enabled if we have arm_smmu_domain, those always have page
* tables.
*/
- state->ats_enabled = arm_smmu_ats_supported(master);
+ state->ats_enabled = !state->disable_ats &&
+ arm_smmu_ats_supported(master);
}
if (smmu_domain) {
@@ -2715,6 +2759,9 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
return -ENOMEM;
master_domain->master = master;
master_domain->ssid = state->ssid;
+ if (new_domain->type == IOMMU_DOMAIN_NESTED)
+ master_domain->nested_ats_flush =
+ to_smmu_nested_domain(new_domain)->enable_ats;
/*
* During prepare we want the current smmu_domain and new
@@ -2731,6 +2778,14 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
* one of them.
*/
spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+ if (smmu_domain->enforce_cache_coherency &&
+ !arm_smmu_master_canwbs(master)) {
+ spin_unlock_irqrestore(&smmu_domain->devices_lock,
+ flags);
+ kfree(master_domain);
+ return -EINVAL;
+ }
+
if (state->ats_enabled)
atomic_inc(&smmu_domain->nr_ats_masters);
list_add(&master_domain->devices_elm, &smmu_domain->devices);
@@ -2754,7 +2809,7 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
* completes synchronizing the PCI device's ATC and finishes manipulating the
* smmu_domain->devices list.
*/
-static void arm_smmu_attach_commit(struct arm_smmu_attach_state *state)
+void arm_smmu_attach_commit(struct arm_smmu_attach_state *state)
{
struct arm_smmu_master *master = state->master;
@@ -3084,7 +3139,8 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
const struct iommu_user_data *user_data)
{
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
- const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_ALLOC_NEST_PARENT;
struct arm_smmu_domain *smmu_domain;
int ret;
@@ -3097,6 +3153,15 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
if (IS_ERR(smmu_domain))
return ERR_CAST(smmu_domain);
+ if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) {
+ if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING)) {
+ ret = -EOPNOTSUPP;
+ goto err_free;
+ }
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
+ smmu_domain->nest_parent = true;
+ }
+
smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
smmu_domain->domain.ops = arm_smmu_ops.default_domain_ops;
ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, flags);
@@ -3378,21 +3443,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
return group;
}
-static int arm_smmu_enable_nesting(struct iommu_domain *domain)
-{
- struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
- int ret = 0;
-
- mutex_lock(&smmu_domain->init_mutex);
- if (smmu_domain->smmu)
- ret = -EPERM;
- else
- smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
- mutex_unlock(&smmu_domain->init_mutex);
-
- return ret;
-}
-
static int arm_smmu_of_xlate(struct device *dev,
const struct of_phandle_args *args)
{
@@ -3491,6 +3541,7 @@ static struct iommu_ops arm_smmu_ops = {
.identity_domain = &arm_smmu_identity_domain,
.blocked_domain = &arm_smmu_blocked_domain,
.capable = arm_smmu_capable,
+ .hw_info = arm_smmu_hw_info,
.domain_alloc_paging = arm_smmu_domain_alloc_paging,
.domain_alloc_sva = arm_smmu_sva_domain_alloc,
.domain_alloc_user = arm_smmu_domain_alloc_user,
@@ -3504,17 +3555,19 @@ static struct iommu_ops arm_smmu_ops = {
.dev_disable_feat = arm_smmu_dev_disable_feature,
.page_response = arm_smmu_page_response,
.def_domain_type = arm_smmu_def_domain_type,
+ .viommu_alloc = arm_vsmmu_alloc,
+ .user_pasid_table = 1,
.pgsize_bitmap = -1UL, /* Restricted during device attach */
.owner = THIS_MODULE,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = arm_smmu_attach_dev,
+ .enforce_cache_coherency = arm_smmu_enforce_cache_coherency,
.set_dev_pasid = arm_smmu_s1_set_dev_pasid,
.map_pages = arm_smmu_map_pages,
.unmap_pages = arm_smmu_unmap_pages,
.flush_iotlb_all = arm_smmu_flush_iotlb_all,
.iotlb_sync = arm_smmu_iotlb_sync,
.iova_to_phys = arm_smmu_iova_to_phys,
- .enable_nesting = arm_smmu_enable_nesting,
.free = arm_smmu_domain_free_paging,
}
};
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 1e9952ca989f..af25f092303f 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -10,6 +10,7 @@
#include <linux/bitfield.h>
#include <linux/iommu.h>
+#include <linux/iommufd.h>
#include <linux/kernel.h>
#include <linux/mmzone.h>
#include <linux/sizes.h>
@@ -57,6 +58,7 @@ struct arm_smmu_device;
#define IDR1_SIDSIZE GENMASK(5, 0)
#define ARM_SMMU_IDR3 0xc
+#define IDR3_FWB (1 << 8)
#define IDR3_RIL (1 << 10)
#define ARM_SMMU_IDR5 0x14
@@ -81,6 +83,8 @@ struct arm_smmu_device;
#define IIDR_REVISION GENMASK(15, 12)
#define IIDR_IMPLEMENTER GENMASK(11, 0)
+#define ARM_SMMU_AIDR 0x1C
+
#define ARM_SMMU_CR0 0x20
#define CR0_ATSCHK (1 << 4)
#define CR0_CMDQEN (1 << 3)
@@ -241,6 +245,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
#define STRTAB_STE_0_CFG_BYPASS 4
#define STRTAB_STE_0_CFG_S1_TRANS 5
#define STRTAB_STE_0_CFG_S2_TRANS 6
+#define STRTAB_STE_0_CFG_NESTED 7
#define STRTAB_STE_0_S1FMT GENMASK_ULL(5, 4)
#define STRTAB_STE_0_S1FMT_LINEAR 0
@@ -261,6 +266,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
#define STRTAB_STE_1_S1COR GENMASK_ULL(5, 4)
#define STRTAB_STE_1_S1CSH GENMASK_ULL(7, 6)
+#define STRTAB_STE_1_S2FWB (1UL << 25)
#define STRTAB_STE_1_S1STALLD (1UL << 27)
#define STRTAB_STE_1_EATS GENMASK_ULL(29, 28)
@@ -292,6 +298,15 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
#define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4)
+/* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */
+#define STRTAB_STE_0_NESTING_ALLOWED \
+ cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \
+ STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX)
+#define STRTAB_STE_1_NESTING_ALLOWED \
+ cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | \
+ STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | \
+ STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS)
+
/*
* Context descriptors.
*
@@ -511,8 +526,10 @@ struct arm_smmu_cmdq_ent {
};
} cfgi;
+ #define CMDQ_OP_TLBI_NH_ALL 0x10
#define CMDQ_OP_TLBI_NH_ASID 0x11
#define CMDQ_OP_TLBI_NH_VA 0x12
+ #define CMDQ_OP_TLBI_NH_VAA 0x13
#define CMDQ_OP_TLBI_EL2_ALL 0x20
#define CMDQ_OP_TLBI_EL2_ASID 0x21
#define CMDQ_OP_TLBI_EL2_VA 0x22
@@ -726,6 +743,7 @@ struct arm_smmu_device {
#define ARM_SMMU_FEAT_ATTR_TYPES_OVR (1 << 20)
#define ARM_SMMU_FEAT_HA (1 << 21)
#define ARM_SMMU_FEAT_HD (1 << 22)
+#define ARM_SMMU_FEAT_S2FWB (1 << 23)
u32 features;
#define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
@@ -811,10 +829,20 @@ struct arm_smmu_domain {
/* List of struct arm_smmu_master_domain */
struct list_head devices;
spinlock_t devices_lock;
+ bool enforce_cache_coherency : 1;
+ bool nest_parent : 1;
struct mmu_notifier mmu_notifier;
};
+struct arm_smmu_nested_domain {
+ struct iommu_domain domain;
+ struct arm_vsmmu *vsmmu;
+ bool enable_ats : 1;
+
+ __le64 ste[2];
+};
+
/* The following are exposed for testing purposes. */
struct arm_smmu_entry_writer_ops;
struct arm_smmu_entry_writer {
@@ -827,21 +855,22 @@ struct arm_smmu_entry_writer_ops {
void (*sync)(struct arm_smmu_entry_writer *writer);
};
+void arm_smmu_make_abort_ste(struct arm_smmu_ste *target);
+void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
+ struct arm_smmu_master *master,
+ struct arm_smmu_domain *smmu_domain,
+ bool ats_enabled);
+
#if IS_ENABLED(CONFIG_KUNIT)
void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits);
void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *cur,
const __le64 *target);
void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits);
-void arm_smmu_make_abort_ste(struct arm_smmu_ste *target);
void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu,
struct arm_smmu_ste *target);
void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
struct arm_smmu_master *master, bool ats_enabled,
unsigned int s1dss);
-void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
- struct arm_smmu_master *master,
- struct arm_smmu_domain *smmu_domain,
- bool ats_enabled);
void arm_smmu_make_sva_cd(struct arm_smmu_cd *target,
struct arm_smmu_master *master, struct mm_struct *mm,
u16 asid);
@@ -851,6 +880,7 @@ struct arm_smmu_master_domain {
struct list_head devices_elm;
struct arm_smmu_master *master;
ioasid_t ssid;
+ bool nested_ats_flush : 1;
};
static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
@@ -858,6 +888,12 @@ static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
return container_of(dom, struct arm_smmu_domain, domain);
}
+static inline struct arm_smmu_nested_domain *
+to_smmu_nested_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct arm_smmu_nested_domain, domain);
+}
+
extern struct xarray arm_smmu_asid_xa;
extern struct mutex arm_smmu_asid_lock;
@@ -893,6 +929,33 @@ int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
int arm_smmu_cmdq_init(struct arm_smmu_device *smmu,
struct arm_smmu_cmdq *cmdq);
+static inline bool arm_smmu_master_canwbs(struct arm_smmu_master *master)
+{
+ return dev_iommu_fwspec_get(master->dev)->flags &
+ IOMMU_FWSPEC_PCI_RC_CANWBS;
+}
+
+struct arm_smmu_attach_state {
+ /* Inputs */
+ struct iommu_domain *old_domain;
+ struct arm_smmu_master *master;
+ bool cd_needs_ats;
+ bool disable_ats;
+ ioasid_t ssid;
+ /* Resulting state */
+ bool ats_enabled;
+};
+
+int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
+ struct iommu_domain *new_domain);
+void arm_smmu_attach_commit(struct arm_smmu_attach_state *state);
+void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master,
+ const struct arm_smmu_ste *target);
+
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq, u64 *cmds, int n,
+ bool sync);
+
#ifdef CONFIG_ARM_SMMU_V3_SVA
bool arm_smmu_sva_supported(struct arm_smmu_device *smmu);
bool arm_smmu_master_sva_supported(struct arm_smmu_master *master);
@@ -949,4 +1012,23 @@ tegra241_cmdqv_probe(struct arm_smmu_device *smmu)
return ERR_PTR(-ENODEV);
}
#endif /* CONFIG_TEGRA241_CMDQV */
+
+struct arm_vsmmu {
+ struct iommufd_viommu core;
+ struct arm_smmu_device *smmu;
+ struct arm_smmu_domain *s2_parent;
+ u16 vmid;
+};
+
+#if IS_ENABLED(CONFIG_ARM_SMMU_V3_IOMMUFD)
+void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type);
+struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev,
+ struct iommu_domain *parent,
+ struct iommufd_ctx *ictx,
+ unsigned int viommu_type);
+#else
+#define arm_smmu_hw_info NULL
+#define arm_vsmmu_alloc NULL
+#endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */
+
#endif /* _ARM_SMMU_V3_H */
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 8321962b3714..12b173eec454 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1558,21 +1558,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
return group;
}
-static int arm_smmu_enable_nesting(struct iommu_domain *domain)
-{
- struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
- int ret = 0;
-
- mutex_lock(&smmu_domain->init_mutex);
- if (smmu_domain->smmu)
- ret = -EPERM;
- else
- smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
- mutex_unlock(&smmu_domain->init_mutex);
-
- return ret;
-}
-
static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain,
unsigned long quirks)
{
@@ -1656,7 +1641,6 @@ static struct iommu_ops arm_smmu_ops = {
.flush_iotlb_all = arm_smmu_flush_iotlb_all,
.iotlb_sync = arm_smmu_iotlb_sync,
.iova_to_phys = arm_smmu_iova_to_phys,
- .enable_nesting = arm_smmu_enable_nesting,
.set_pgtable_quirks = arm_smmu_set_pgtable_quirks,
.free = arm_smmu_domain_free,
}
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 0e67f1721a3d..74f58c6ac30c 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -106,6 +106,18 @@
#define ARM_LPAE_PTE_HAP_FAULT (((arm_lpae_iopte)0) << 6)
#define ARM_LPAE_PTE_HAP_READ (((arm_lpae_iopte)1) << 6)
#define ARM_LPAE_PTE_HAP_WRITE (((arm_lpae_iopte)2) << 6)
+/*
+ * For !FWB these code to:
+ * 1111 = Normal outer write back cachable / Inner Write Back Cachable
+ * Permit S1 to override
+ * 0101 = Normal Non-cachable / Inner Non-cachable
+ * 0001 = Device / Device-nGnRE
+ * For S2FWB these code:
+ * 0110 Force Normal Write Back
+ * 0101 Normal* is forced Normal-NC, Device unchanged
+ * 0001 Force Device-nGnRE
+ */
+#define ARM_LPAE_PTE_MEMATTR_FWB_WB (((arm_lpae_iopte)0x6) << 2)
#define ARM_LPAE_PTE_MEMATTR_OIWB (((arm_lpae_iopte)0xf) << 2)
#define ARM_LPAE_PTE_MEMATTR_NC (((arm_lpae_iopte)0x5) << 2)
#define ARM_LPAE_PTE_MEMATTR_DEV (((arm_lpae_iopte)0x1) << 2)
@@ -458,12 +470,16 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
*/
if (data->iop.fmt == ARM_64_LPAE_S2 ||
data->iop.fmt == ARM_32_LPAE_S2) {
- if (prot & IOMMU_MMIO)
+ if (prot & IOMMU_MMIO) {
pte |= ARM_LPAE_PTE_MEMATTR_DEV;
- else if (prot & IOMMU_CACHE)
- pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
- else
+ } else if (prot & IOMMU_CACHE) {
+ if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_S2FWB)
+ pte |= ARM_LPAE_PTE_MEMATTR_FWB_WB;
+ else
+ pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
+ } else {
pte |= ARM_LPAE_PTE_MEMATTR_NC;
+ }
} else {
if (prot & IOMMU_MMIO)
pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV
@@ -1035,8 +1051,7 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
struct arm_lpae_io_pgtable *data;
typeof(&cfg->arm_lpae_s2_cfg.vtcr) vtcr = &cfg->arm_lpae_s2_cfg.vtcr;
- /* The NS quirk doesn't apply at stage 2 */
- if (cfg->quirks)
+ if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_S2FWB))
return NULL;
data = arm_lpae_alloc_pgtable(cfg);
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 83c8e617a2c5..dbd70d5a4702 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2723,16 +2723,6 @@ static int __init iommu_init(void)
}
core_initcall(iommu_init);
-int iommu_enable_nesting(struct iommu_domain *domain)
-{
- if (domain->type != IOMMU_DOMAIN_UNMANAGED)
- return -EINVAL;
- if (!domain->ops->enable_nesting)
- return -EINVAL;
- return domain->ops->enable_nesting(domain);
-}
-EXPORT_SYMBOL_GPL(iommu_enable_nesting);
-
int iommu_set_pgtable_quirks(struct iommu_domain *domain,
unsigned long quirk)
{
diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig
index 76656fe0470d..0a07f9449fd9 100644
--- a/drivers/iommu/iommufd/Kconfig
+++ b/drivers/iommu/iommufd/Kconfig
@@ -1,4 +1,8 @@
# SPDX-License-Identifier: GPL-2.0-only
+config IOMMUFD_DRIVER_CORE
+ tristate
+ default (IOMMUFD_DRIVER || IOMMUFD) if IOMMUFD!=n
+
config IOMMUFD
tristate "IOMMU Userspace API"
select INTERVAL_TREE
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index cf4605962bea..cb784da6cddc 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -7,9 +7,13 @@ iommufd-y := \
ioas.o \
main.o \
pages.o \
- vfio_compat.o
+ vfio_compat.o \
+ viommu.o
iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
obj-$(CONFIG_IOMMUFD) += iommufd.o
obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o
+
+iommufd_driver-y := driver.o
+obj-$(CONFIG_IOMMUFD_DRIVER_CORE) += iommufd_driver.o
diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
new file mode 100644
index 000000000000..7b67fdf44134
--- /dev/null
+++ b/drivers/iommu/iommufd/driver.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "iommufd_private.h"
+
+struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
+ size_t size,
+ enum iommufd_object_type type)
+{
+ struct iommufd_object *obj;
+ int rc;
+
+ obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!obj)
+ return ERR_PTR(-ENOMEM);
+ obj->type = type;
+ /* Starts out bias'd by 1 until it is removed from the xarray */
+ refcount_set(&obj->shortterm_users, 1);
+ refcount_set(&obj->users, 1);
+
+ /*
+ * Reserve an ID in the xarray but do not publish the pointer yet since
+ * the caller hasn't initialized it yet. Once the pointer is published
+ * in the xarray and visible to other threads we can't reliably destroy
+ * it anymore, so the caller must complete all errorable operations
+ * before calling iommufd_object_finalize().
+ */
+ rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, xa_limit_31b,
+ GFP_KERNEL_ACCOUNT);
+ if (rc)
+ goto out_free;
+ return obj;
+out_free:
+ kfree(obj);
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_NS_GPL(_iommufd_object_alloc, IOMMUFD);
+
+/* Caller should xa_lock(&viommu->vdevs) to protect the return value */
+struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
+ unsigned long vdev_id)
+{
+ struct iommufd_vdevice *vdev;
+
+ lockdep_assert_held(&viommu->vdevs.xa_lock);
+
+ vdev = xa_load(&viommu->vdevs, vdev_id);
+ return vdev ? vdev->dev : NULL;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_dev, IOMMUFD);
+
+MODULE_DESCRIPTION("iommufd code shared with builtin modules");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c
index e590973ce5cf..053b0e30f55a 100644
--- a/drivers/iommu/iommufd/fault.c
+++ b/drivers/iommu/iommufd/fault.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
+#include <linux/pci-ats.h>
#include <linux/poll.h>
#include <uapi/linux/iommufd.h>
@@ -27,8 +28,12 @@ static int iommufd_fault_iopf_enable(struct iommufd_device *idev)
* resource between PF and VFs. There is no coordination for this
* shared capability. This waits for a vPRI reset to recover.
*/
- if (dev_is_pci(dev) && to_pci_dev(dev)->is_virtfn)
- return -EINVAL;
+ if (dev_is_pci(dev)) {
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ if (pdev->is_virtfn && pci_pri_supported(pdev))
+ return -EINVAL;
+ }
mutex_lock(&idev->iopf_lock);
/* Device iopf has already been on. */
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index d06bf6e6c19f..702057655a81 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -57,7 +57,10 @@ void iommufd_hwpt_nested_destroy(struct iommufd_object *obj)
container_of(obj, struct iommufd_hwpt_nested, common.obj);
__iommufd_hwpt_destroy(&hwpt_nested->common);
- refcount_dec(&hwpt_nested->parent->common.obj.users);
+ if (hwpt_nested->viommu)
+ refcount_dec(&hwpt_nested->viommu->obj.users);
+ else
+ refcount_dec(&hwpt_nested->parent->common.obj.users);
}
void iommufd_hwpt_nested_abort(struct iommufd_object *obj)
@@ -248,8 +251,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
}
hwpt->domain->owner = ops;
- if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED ||
- !hwpt->domain->ops->cache_invalidate_user)) {
+ if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
rc = -EINVAL;
goto out_abort;
}
@@ -260,6 +262,58 @@ out_abort:
return ERR_PTR(rc);
}
+/**
+ * iommufd_viommu_alloc_hwpt_nested() - Get a hwpt_nested for a vIOMMU
+ * @viommu: vIOMMU ojbect to associate the hwpt_nested/domain with
+ * @flags: Flags from userspace
+ * @user_data: user_data pointer. Must be valid
+ *
+ * Allocate a new IOMMU_DOMAIN_NESTED for a vIOMMU and return it as a NESTED
+ * hw_pagetable.
+ */
+static struct iommufd_hwpt_nested *
+iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags,
+ const struct iommu_user_data *user_data)
+{
+ struct iommufd_hwpt_nested *hwpt_nested;
+ struct iommufd_hw_pagetable *hwpt;
+ int rc;
+
+ if (!user_data->len)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (!viommu->ops || !viommu->ops->alloc_domain_nested)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ hwpt_nested = __iommufd_object_alloc(
+ viommu->ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj);
+ if (IS_ERR(hwpt_nested))
+ return ERR_CAST(hwpt_nested);
+ hwpt = &hwpt_nested->common;
+
+ hwpt_nested->viommu = viommu;
+ refcount_inc(&viommu->obj.users);
+ hwpt_nested->parent = viommu->hwpt;
+
+ hwpt->domain =
+ viommu->ops->alloc_domain_nested(viommu, flags, user_data);
+ if (IS_ERR(hwpt->domain)) {
+ rc = PTR_ERR(hwpt->domain);
+ hwpt->domain = NULL;
+ goto out_abort;
+ }
+ hwpt->domain->owner = viommu->iommu_dev->ops;
+
+ if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
+ rc = -EINVAL;
+ goto out_abort;
+ }
+ return hwpt_nested;
+
+out_abort:
+ iommufd_object_abort_and_destroy(viommu->ictx, &hwpt->obj);
+ return ERR_PTR(rc);
+}
+
int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
{
struct iommu_hwpt_alloc *cmd = ucmd->cmd;
@@ -316,6 +370,22 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
goto out_unlock;
}
hwpt = &hwpt_nested->common;
+ } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) {
+ struct iommufd_hwpt_nested *hwpt_nested;
+ struct iommufd_viommu *viommu;
+
+ viommu = container_of(pt_obj, struct iommufd_viommu, obj);
+ if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+ hwpt_nested = iommufd_viommu_alloc_hwpt_nested(
+ viommu, cmd->flags, &user_data);
+ if (IS_ERR(hwpt_nested)) {
+ rc = PTR_ERR(hwpt_nested);
+ goto out_unlock;
+ }
+ hwpt = &hwpt_nested->common;
} else {
rc = -EINVAL;
goto out_put_pt;
@@ -412,7 +482,7 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
.entry_len = cmd->entry_len,
.entry_num = cmd->entry_num,
};
- struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_object *pt_obj;
u32 done_num = 0;
int rc;
@@ -426,17 +496,40 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
goto out;
}
- hwpt = iommufd_get_hwpt_nested(ucmd, cmd->hwpt_id);
- if (IS_ERR(hwpt)) {
- rc = PTR_ERR(hwpt);
+ pt_obj = iommufd_get_object(ucmd->ictx, cmd->hwpt_id, IOMMUFD_OBJ_ANY);
+ if (IS_ERR(pt_obj)) {
+ rc = PTR_ERR(pt_obj);
goto out;
}
+ if (pt_obj->type == IOMMUFD_OBJ_HWPT_NESTED) {
+ struct iommufd_hw_pagetable *hwpt =
+ container_of(pt_obj, struct iommufd_hw_pagetable, obj);
+
+ if (!hwpt->domain->ops ||
+ !hwpt->domain->ops->cache_invalidate_user) {
+ rc = -EOPNOTSUPP;
+ goto out_put_pt;
+ }
+ rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain,
+ &data_array);
+ } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) {
+ struct iommufd_viommu *viommu =
+ container_of(pt_obj, struct iommufd_viommu, obj);
+
+ if (!viommu->ops || !viommu->ops->cache_invalidate) {
+ rc = -EOPNOTSUPP;
+ goto out_put_pt;
+ }
+ rc = viommu->ops->cache_invalidate(viommu, &data_array);
+ } else {
+ rc = -EINVAL;
+ goto out_put_pt;
+ }
- rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain,
- &data_array);
done_num = data_array.entry_num;
- iommufd_put_object(ucmd->ictx, &hwpt->obj);
+out_put_pt:
+ iommufd_put_object(ucmd->ictx, pt_obj);
out:
cmd->entry_num = done_num;
if (iommufd_ucmd_respond(ucmd, sizeof(*cmd)))
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 4bf7ccd39d46..8a790e597e12 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -107,9 +107,9 @@ static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
* Does not return a 0 IOVA even if it is valid.
*/
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
- unsigned long uptr, unsigned long length)
+ unsigned long addr, unsigned long length)
{
- unsigned long page_offset = uptr % PAGE_SIZE;
+ unsigned long page_offset = addr % PAGE_SIZE;
struct interval_tree_double_span_iter used_span;
struct interval_tree_span_iter allowed_span;
unsigned long max_alignment = PAGE_SIZE;
@@ -122,15 +122,15 @@ static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
return -EOVERFLOW;
/*
- * Keep alignment present in the uptr when building the IOVA, this
+ * Keep alignment present in addr when building the IOVA, which
* increases the chance we can map a THP.
*/
- if (!uptr)
+ if (!addr)
iova_alignment = roundup_pow_of_two(length);
else
iova_alignment = min_t(unsigned long,
roundup_pow_of_two(length),
- 1UL << __ffs64(uptr));
+ 1UL << __ffs64(addr));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
max_alignment = HPAGE_SIZE;
@@ -248,6 +248,7 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
int iommu_prot, unsigned int flags)
{
struct iopt_pages_list *elm;
+ unsigned long start;
unsigned long iova;
int rc = 0;
@@ -267,9 +268,15 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
/* Use the first entry to guess the ideal IOVA alignment */
elm = list_first_entry(pages_list, struct iopt_pages_list,
next);
- rc = iopt_alloc_iova(
- iopt, dst_iova,
- (uintptr_t)elm->pages->uptr + elm->start_byte, length);
+ switch (elm->pages->type) {
+ case IOPT_ADDRESS_USER:
+ start = elm->start_byte + (uintptr_t)elm->pages->uptr;
+ break;
+ case IOPT_ADDRESS_FILE:
+ start = elm->start_byte + elm->pages->start;
+ break;
+ }
+ rc = iopt_alloc_iova(iopt, dst_iova, start, length);
if (rc)
goto out_unlock;
if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
@@ -384,6 +391,34 @@ out_unlock_domains:
return rc;
}
+static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+ struct iopt_pages *pages, unsigned long *iova,
+ unsigned long length, unsigned long start_byte,
+ int iommu_prot, unsigned int flags)
+{
+ struct iopt_pages_list elm = {};
+ LIST_HEAD(pages_list);
+ int rc;
+
+ elm.pages = pages;
+ elm.start_byte = start_byte;
+ if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
+ elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
+ elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
+ elm.length = length;
+ list_add(&elm.next, &pages_list);
+
+ rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
+ if (rc) {
+ if (elm.area)
+ iopt_abort_area(elm.area);
+ if (elm.pages)
+ iopt_put_pages(elm.pages);
+ return rc;
+ }
+ return 0;
+}
+
/**
* iopt_map_user_pages() - Map a user VA to an iova in the io page table
* @ictx: iommufd_ctx the iopt is part of
@@ -408,29 +443,41 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
unsigned long length, int iommu_prot,
unsigned int flags)
{
- struct iopt_pages_list elm = {};
- LIST_HEAD(pages_list);
- int rc;
+ struct iopt_pages *pages;
- elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
- if (IS_ERR(elm.pages))
- return PTR_ERR(elm.pages);
- if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
- elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
- elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
- elm.start_byte = uptr - elm.pages->uptr;
- elm.length = length;
- list_add(&elm.next, &pages_list);
+ pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
- rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
- if (rc) {
- if (elm.area)
- iopt_abort_area(elm.area);
- if (elm.pages)
- iopt_put_pages(elm.pages);
- return rc;
- }
- return 0;
+ return iopt_map_common(ictx, iopt, pages, iova, length,
+ uptr - pages->uptr, iommu_prot, flags);
+}
+
+/**
+ * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
+ * @ictx: iommufd_ctx the iopt is part of
+ * @iopt: io_pagetable to act on
+ * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
+ * the chosen iova on output. Otherwise is the iova to map to on input
+ * @file: file to map
+ * @start: map file starting at this byte offset
+ * @length: Number of bytes to map
+ * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
+ * @flags: IOPT_ALLOC_IOVA or zero
+ */
+int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+ unsigned long *iova, struct file *file,
+ unsigned long start, unsigned long length,
+ int iommu_prot, unsigned int flags)
+{
+ struct iopt_pages *pages;
+
+ pages = iopt_alloc_file_pages(file, start, length,
+ iommu_prot & IOMMU_WRITE);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+ return iopt_map_common(ictx, iopt, pages, iova, length,
+ start - pages->start, iommu_prot, flags);
}
struct iova_bitmap_fn_arg {
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index c61d74471684..10c928a9a463 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -173,6 +173,12 @@ enum {
IOPT_PAGES_ACCOUNT_NONE = 0,
IOPT_PAGES_ACCOUNT_USER = 1,
IOPT_PAGES_ACCOUNT_MM = 2,
+ IOPT_PAGES_ACCOUNT_MODE_NUM = 3,
+};
+
+enum iopt_address_type {
+ IOPT_ADDRESS_USER = 0,
+ IOPT_ADDRESS_FILE = 1,
};
/*
@@ -195,7 +201,14 @@ struct iopt_pages {
struct task_struct *source_task;
struct mm_struct *source_mm;
struct user_struct *source_user;
- void __user *uptr;
+ enum iopt_address_type type;
+ union {
+ void __user *uptr; /* IOPT_ADDRESS_USER */
+ struct { /* IOPT_ADDRESS_FILE */
+ struct file *file;
+ unsigned long start;
+ };
+ };
bool writable:1;
u8 account_mode;
@@ -206,8 +219,10 @@ struct iopt_pages {
struct rb_root_cached domains_itree;
};
-struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
- bool writable);
+struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
+ unsigned long length, bool writable);
+struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start,
+ unsigned long length, bool writable);
void iopt_release_pages(struct kref *kref);
static inline void iopt_put_pages(struct iopt_pages *pages)
{
@@ -238,4 +253,9 @@ struct iopt_pages_access {
unsigned int users;
};
+struct pfn_reader_user;
+
+int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages,
+ bool inc, struct pfn_reader_user *user);
+
#endif
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 2c4b2bb11e78..1542c5fd10a8 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -2,6 +2,7 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
*/
+#include <linux/file.h>
#include <linux/interval_tree.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
@@ -51,7 +52,10 @@ int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd)
rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
if (rc)
goto out_table;
+
+ down_read(&ucmd->ictx->ioas_creation_lock);
iommufd_object_finalize(ucmd->ictx, &ioas->obj);
+ up_read(&ucmd->ictx->ioas_creation_lock);
return 0;
out_table:
@@ -197,6 +201,52 @@ static int conv_iommu_prot(u32 map_flags)
return iommu_prot;
}
+int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_map_file *cmd = ucmd->cmd;
+ unsigned long iova = cmd->iova;
+ struct iommufd_ioas *ioas;
+ unsigned int flags = 0;
+ struct file *file;
+ int rc;
+
+ if (cmd->flags &
+ ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE))
+ return -EOPNOTSUPP;
+
+ if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
+ return -EOVERFLOW;
+
+ if (!(cmd->flags &
+ (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)))
+ return -EINVAL;
+
+ ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
+ flags = IOPT_ALLOC_IOVA;
+
+ file = fget(cmd->fd);
+ if (!file)
+ return -EBADF;
+
+ rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file,
+ cmd->start, cmd->length,
+ conv_iommu_prot(cmd->flags), flags);
+ if (rc)
+ goto out_put;
+
+ cmd->iova = iova;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_put:
+ iommufd_put_object(ucmd->ictx, &ioas->obj);
+ fput(file);
+ return rc;
+}
+
int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
{
struct iommu_ioas_map *cmd = ucmd->cmd;
@@ -327,6 +377,215 @@ out_put:
return rc;
}
+static void iommufd_release_all_iova_rwsem(struct iommufd_ctx *ictx,
+ struct xarray *ioas_list)
+{
+ struct iommufd_ioas *ioas;
+ unsigned long index;
+
+ xa_for_each(ioas_list, index, ioas) {
+ up_write(&ioas->iopt.iova_rwsem);
+ refcount_dec(&ioas->obj.users);
+ }
+ up_write(&ictx->ioas_creation_lock);
+ xa_destroy(ioas_list);
+}
+
+static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
+ struct xarray *ioas_list)
+{
+ struct iommufd_object *obj;
+ unsigned long index;
+ int rc;
+
+ /*
+ * This is very ugly, it is done instead of adding a lock around
+ * pages->source_mm, which is a performance path for mdev, we just
+ * obtain the write side of all the iova_rwsems which also protects the
+ * pages->source_*. Due to copies we can't know which IOAS could read
+ * from the pages, so we just lock everything. This is the only place
+ * locks are nested and they are uniformly taken in ID order.
+ *
+ * ioas_creation_lock prevents new IOAS from being installed in the
+ * xarray while we do this, and also prevents more than one thread from
+ * holding nested locks.
+ */
+ down_write(&ictx->ioas_creation_lock);
+ xa_lock(&ictx->objects);
+ xa_for_each(&ictx->objects, index, obj) {
+ struct iommufd_ioas *ioas;
+
+ if (!obj || obj->type != IOMMUFD_OBJ_IOAS)
+ continue;
+
+ if (!refcount_inc_not_zero(&obj->users))
+ continue;
+
+ xa_unlock(&ictx->objects);
+
+ ioas = container_of(obj, struct iommufd_ioas, obj);
+ down_write_nest_lock(&ioas->iopt.iova_rwsem,
+ &ictx->ioas_creation_lock);
+
+ rc = xa_err(xa_store(ioas_list, index, ioas, GFP_KERNEL));
+ if (rc) {
+ iommufd_release_all_iova_rwsem(ictx, ioas_list);
+ return rc;
+ }
+
+ xa_lock(&ictx->objects);
+ }
+ xa_unlock(&ictx->objects);
+ return 0;
+}
+
+static bool need_charge_update(struct iopt_pages *pages)
+{
+ switch (pages->account_mode) {
+ case IOPT_PAGES_ACCOUNT_NONE:
+ return false;
+ case IOPT_PAGES_ACCOUNT_MM:
+ return pages->source_mm != current->mm;
+ case IOPT_PAGES_ACCOUNT_USER:
+ /*
+ * Update when mm changes because it also accounts
+ * in mm->pinned_vm.
+ */
+ return (pages->source_user != current_user()) ||
+ (pages->source_mm != current->mm);
+ }
+ return true;
+}
+
+static int charge_current(unsigned long *npinned)
+{
+ struct iopt_pages tmp = {
+ .source_mm = current->mm,
+ .source_task = current->group_leader,
+ .source_user = current_user(),
+ };
+ unsigned int account_mode;
+ int rc;
+
+ for (account_mode = 0; account_mode != IOPT_PAGES_ACCOUNT_MODE_NUM;
+ account_mode++) {
+ if (!npinned[account_mode])
+ continue;
+
+ tmp.account_mode = account_mode;
+ rc = iopt_pages_update_pinned(&tmp, npinned[account_mode], true,
+ NULL);
+ if (rc)
+ goto err_undo;
+ }
+ return 0;
+
+err_undo:
+ while (account_mode != 0) {
+ account_mode--;
+ if (!npinned[account_mode])
+ continue;
+ tmp.account_mode = account_mode;
+ iopt_pages_update_pinned(&tmp, npinned[account_mode], false,
+ NULL);
+ }
+ return rc;
+}
+
+static void change_mm(struct iopt_pages *pages)
+{
+ struct task_struct *old_task = pages->source_task;
+ struct user_struct *old_user = pages->source_user;
+ struct mm_struct *old_mm = pages->source_mm;
+
+ pages->source_mm = current->mm;
+ mmgrab(pages->source_mm);
+ mmdrop(old_mm);
+
+ pages->source_task = current->group_leader;
+ get_task_struct(pages->source_task);
+ put_task_struct(old_task);
+
+ pages->source_user = get_uid(current_user());
+ free_uid(old_user);
+}
+
+#define for_each_ioas_area(_xa, _index, _ioas, _area) \
+ xa_for_each((_xa), (_index), (_ioas)) \
+ for (_area = iopt_area_iter_first(&_ioas->iopt, 0, ULONG_MAX); \
+ _area; \
+ _area = iopt_area_iter_next(_area, 0, ULONG_MAX))
+
+int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_change_process *cmd = ucmd->cmd;
+ struct iommufd_ctx *ictx = ucmd->ictx;
+ unsigned long all_npinned[IOPT_PAGES_ACCOUNT_MODE_NUM] = {};
+ struct iommufd_ioas *ioas;
+ struct iopt_area *area;
+ struct iopt_pages *pages;
+ struct xarray ioas_list;
+ unsigned long index;
+ int rc;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ xa_init(&ioas_list);
+ rc = iommufd_take_all_iova_rwsem(ictx, &ioas_list);
+ if (rc)
+ return rc;
+
+ for_each_ioas_area(&ioas_list, index, ioas, area) {
+ if (area->pages->type != IOPT_ADDRESS_FILE) {
+ rc = -EINVAL;
+ goto out;
+ }
+ }
+
+ /*
+ * Count last_pinned pages, then clear it to avoid double counting
+ * if the same iopt_pages is visited multiple times in this loop.
+ * Since we are under all the locks, npinned == last_npinned, so we
+ * can easily restore last_npinned before we return.
+ */
+ for_each_ioas_area(&ioas_list, index, ioas, area) {
+ pages = area->pages;
+
+ if (need_charge_update(pages)) {
+ all_npinned[pages->account_mode] += pages->last_npinned;
+ pages->last_npinned = 0;
+ }
+ }
+
+ rc = charge_current(all_npinned);
+
+ if (rc) {
+ /* Charge failed. Fix last_npinned and bail. */
+ for_each_ioas_area(&ioas_list, index, ioas, area)
+ area->pages->last_npinned = area->pages->npinned;
+ goto out;
+ }
+
+ for_each_ioas_area(&ioas_list, index, ioas, area) {
+ pages = area->pages;
+
+ /* Uncharge the old one (which also restores last_npinned) */
+ if (need_charge_update(pages)) {
+ int r = iopt_pages_update_pinned(pages, pages->npinned,
+ false, NULL);
+
+ if (WARN_ON(r))
+ rc = r;
+ }
+ change_mm(pages);
+ }
+
+out:
+ iommufd_release_all_iova_rwsem(ictx, &ioas_list);
+ return rc;
+}
+
int iommufd_option_rlimit_mode(struct iommu_option *cmd,
struct iommufd_ctx *ictx)
{
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index f1d865e6fab6..b6d706cf2c66 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -5,8 +5,8 @@
#define __IOMMUFD_PRIVATE_H
#include <linux/iommu.h>
+#include <linux/iommufd.h>
#include <linux/iova_bitmap.h>
-#include <linux/refcount.h>
#include <linux/rwsem.h>
#include <linux/uaccess.h>
#include <linux/xarray.h>
@@ -24,6 +24,7 @@ struct iommufd_ctx {
struct xarray objects;
struct xarray groups;
wait_queue_head_t destroy_wait;
+ struct rw_semaphore ioas_creation_lock;
u8 account_mode;
/* Compatibility with VFIO no iommu */
@@ -69,6 +70,10 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
unsigned long *iova, void __user *uptr,
unsigned long length, int iommu_prot,
unsigned int flags);
+int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+ unsigned long *iova, struct file *file,
+ unsigned long start, unsigned long length,
+ int iommu_prot, unsigned int flags);
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
unsigned long length, unsigned long *dst_iova,
int iommu_prot, unsigned int flags);
@@ -122,29 +127,6 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
return 0;
}
-enum iommufd_object_type {
- IOMMUFD_OBJ_NONE,
- IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
- IOMMUFD_OBJ_DEVICE,
- IOMMUFD_OBJ_HWPT_PAGING,
- IOMMUFD_OBJ_HWPT_NESTED,
- IOMMUFD_OBJ_IOAS,
- IOMMUFD_OBJ_ACCESS,
- IOMMUFD_OBJ_FAULT,
-#ifdef CONFIG_IOMMUFD_TEST
- IOMMUFD_OBJ_SELFTEST,
-#endif
- IOMMUFD_OBJ_MAX,
-};
-
-/* Base struct for all objects with a userspace ID handle. */
-struct iommufd_object {
- refcount_t shortterm_users;
- refcount_t users;
- enum iommufd_object_type type;
- unsigned int id;
-};
-
static inline bool iommufd_lock_obj(struct iommufd_object *obj)
{
if (!refcount_inc_not_zero(&obj->users))
@@ -225,10 +207,6 @@ iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx,
iommufd_object_remove(ictx, obj, obj->id, 0);
}
-struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
- size_t size,
- enum iommufd_object_type type);
-
#define __iommufd_object_alloc(ictx, ptr, type, obj) \
container_of(_iommufd_object_alloc( \
ictx, \
@@ -276,6 +254,8 @@ void iommufd_ioas_destroy(struct iommufd_object *obj);
int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd);
int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd);
int iommufd_ioas_map(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd);
int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
@@ -312,6 +292,7 @@ struct iommufd_hwpt_paging {
struct iommufd_hwpt_nested {
struct iommufd_hw_pagetable common;
struct iommufd_hwpt_paging *parent;
+ struct iommufd_viommu *viommu;
};
static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt)
@@ -528,6 +509,27 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev,
return iommu_group_replace_domain(idev->igroup->group, hwpt->domain);
}
+static inline struct iommufd_viommu *
+iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id)
+{
+ return container_of(iommufd_get_object(ucmd->ictx, id,
+ IOMMUFD_OBJ_VIOMMU),
+ struct iommufd_viommu, obj);
+}
+
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_viommu_destroy(struct iommufd_object *obj);
+int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_vdevice_destroy(struct iommufd_object *obj);
+
+struct iommufd_vdevice {
+ struct iommufd_object obj;
+ struct iommufd_ctx *ictx;
+ struct iommufd_viommu *viommu;
+ struct device *dev;
+ u64 id; /* per-vIOMMU virtual ID */
+};
+
#ifdef CONFIG_IOMMUFD_TEST
int iommufd_test(struct iommufd_ucmd *ucmd);
void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index f4bc23a92f9a..a6b7a163f636 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -23,6 +23,7 @@ enum {
IOMMU_TEST_OP_DIRTY,
IOMMU_TEST_OP_MD_CHECK_IOTLB,
IOMMU_TEST_OP_TRIGGER_IOPF,
+ IOMMU_TEST_OP_DEV_CHECK_CACHE,
};
enum {
@@ -54,6 +55,11 @@ enum {
MOCK_NESTED_DOMAIN_IOTLB_NUM = 4,
};
+enum {
+ MOCK_DEV_CACHE_ID_MAX = 3,
+ MOCK_DEV_CACHE_NUM = 4,
+};
+
struct iommu_test_cmd {
__u32 size;
__u32 op;
@@ -135,6 +141,10 @@ struct iommu_test_cmd {
__u32 perm;
__u64 addr;
} trigger_iopf;
+ struct {
+ __u32 id;
+ __u32 cache;
+ } check_dev_cache;
};
__u32 last;
};
@@ -152,6 +162,7 @@ struct iommu_test_hw_info {
/* Should not be equal to any defined value in enum iommu_hwpt_data_type */
#define IOMMU_HWPT_DATA_SELFTEST 0xdead
#define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef
+#define IOMMU_TEST_DEV_CACHE_DEFAULT 0xbaddad
/**
* struct iommu_hwpt_selftest
@@ -180,4 +191,25 @@ struct iommu_hwpt_invalidate_selftest {
__u32 iotlb_id;
};
+#define IOMMU_VIOMMU_TYPE_SELFTEST 0xdeadbeef
+
+/* Should not be equal to any defined value in enum iommu_viommu_invalidate_data_type */
+#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST 0xdeadbeef
+#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID 0xdadbeef
+
+/**
+ * struct iommu_viommu_invalidate_selftest - Invalidation data for Mock VIOMMU
+ * (IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST)
+ * @flags: Invalidate flags
+ * @cache_id: Invalidate cache entry index
+ *
+ * If IOMMU_TEST_INVALIDATE_ALL is set in @flags, @cache_id will be ignored
+ */
+struct iommu_viommu_invalidate_selftest {
+#define IOMMU_TEST_INVALIDATE_FLAG_ALL (1 << 0)
+ __u32 flags;
+ __u32 vdev_id;
+ __u32 cache_id;
+};
+
#endif
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index b5f5d27ee963..0a96cc8f27da 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -29,38 +29,6 @@ struct iommufd_object_ops {
static const struct iommufd_object_ops iommufd_object_ops[];
static struct miscdevice vfio_misc_dev;
-struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
- size_t size,
- enum iommufd_object_type type)
-{
- struct iommufd_object *obj;
- int rc;
-
- obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
- if (!obj)
- return ERR_PTR(-ENOMEM);
- obj->type = type;
- /* Starts out bias'd by 1 until it is removed from the xarray */
- refcount_set(&obj->shortterm_users, 1);
- refcount_set(&obj->users, 1);
-
- /*
- * Reserve an ID in the xarray but do not publish the pointer yet since
- * the caller hasn't initialized it yet. Once the pointer is published
- * in the xarray and visible to other threads we can't reliably destroy
- * it anymore, so the caller must complete all errorable operations
- * before calling iommufd_object_finalize().
- */
- rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY,
- xa_limit_31b, GFP_KERNEL_ACCOUNT);
- if (rc)
- goto out_free;
- return obj;
-out_free:
- kfree(obj);
- return ERR_PTR(rc);
-}
-
/*
* Allow concurrent access to the object.
*
@@ -73,20 +41,26 @@ out_free:
void iommufd_object_finalize(struct iommufd_ctx *ictx,
struct iommufd_object *obj)
{
+ XA_STATE(xas, &ictx->objects, obj->id);
void *old;
- old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL);
- /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */
- WARN_ON(old);
+ xa_lock(&ictx->objects);
+ old = xas_store(&xas, obj);
+ xa_unlock(&ictx->objects);
+ /* obj->id was returned from xa_alloc() so the xas_store() cannot fail */
+ WARN_ON(old != XA_ZERO_ENTRY);
}
/* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */
void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj)
{
+ XA_STATE(xas, &ictx->objects, obj->id);
void *old;
- old = xa_erase(&ictx->objects, obj->id);
- WARN_ON(old);
+ xa_lock(&ictx->objects);
+ old = xas_store(&xas, NULL);
+ xa_unlock(&ictx->objects);
+ WARN_ON(old != XA_ZERO_ENTRY);
kfree(obj);
}
@@ -248,6 +222,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n");
}
+ init_rwsem(&ictx->ioas_creation_lock);
xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
xa_init(&ictx->groups);
ictx->file = filp;
@@ -333,6 +308,8 @@ union ucmd_buffer {
struct iommu_ioas_unmap unmap;
struct iommu_option option;
struct iommu_vfio_ioas vfio_ioas;
+ struct iommu_viommu_alloc viommu;
+ struct iommu_vdevice_alloc vdev;
#ifdef CONFIG_IOMMUFD_TEST
struct iommu_test_cmd test;
#endif
@@ -372,18 +349,26 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
struct iommu_ioas_alloc, out_ioas_id),
IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
struct iommu_ioas_allow_iovas, allowed_iovas),
+ IOCTL_OP(IOMMU_IOAS_CHANGE_PROCESS, iommufd_ioas_change_process,
+ struct iommu_ioas_change_process, __reserved),
IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
src_iova),
IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
struct iommu_ioas_iova_ranges, out_iova_alignment),
IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
iova),
+ IOCTL_OP(IOMMU_IOAS_MAP_FILE, iommufd_ioas_map_file,
+ struct iommu_ioas_map_file, iova),
IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
length),
IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
val64),
IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
__reserved),
+ IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
+ struct iommu_viommu_alloc, out_viommu_id),
+ IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl,
+ struct iommu_vdevice_alloc, virt_id),
#ifdef CONFIG_IOMMUFD_TEST
IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
#endif
@@ -519,6 +504,12 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
[IOMMUFD_OBJ_FAULT] = {
.destroy = iommufd_fault_destroy,
},
+ [IOMMUFD_OBJ_VIOMMU] = {
+ .destroy = iommufd_viommu_destroy,
+ },
+ [IOMMUFD_OBJ_VDEVICE] = {
+ .destroy = iommufd_vdevice_destroy,
+ },
#ifdef CONFIG_IOMMUFD_TEST
[IOMMUFD_OBJ_SELFTEST] = {
.destroy = iommufd_selftest_destroy,
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 93d806c9c073..3427749bc5ce 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -45,6 +45,7 @@
* last_iova + 1 can overflow. An iopt_pages index will always be much less than
* ULONG_MAX so last_index + 1 cannot overflow.
*/
+#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
@@ -346,27 +347,41 @@ static void batch_destroy(struct pfn_batch *batch, void *backup)
kfree(batch->pfns);
}
-/* true if the pfn was added, false otherwise */
-static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn)
+static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn,
+ u32 nr)
{
const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns));
-
- if (batch->end &&
- pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] &&
- batch->npfns[batch->end - 1] != MAX_NPFNS) {
- batch->npfns[batch->end - 1]++;
- batch->total_pfns++;
- return true;
- }
- if (batch->end == batch->array_size)
+ unsigned int end = batch->end;
+
+ if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] &&
+ nr <= MAX_NPFNS - batch->npfns[end - 1]) {
+ batch->npfns[end - 1] += nr;
+ } else if (end < batch->array_size) {
+ batch->pfns[end] = pfn;
+ batch->npfns[end] = nr;
+ batch->end++;
+ } else {
return false;
- batch->total_pfns++;
- batch->pfns[batch->end] = pfn;
- batch->npfns[batch->end] = 1;
- batch->end++;
+ }
+
+ batch->total_pfns += nr;
return true;
}
+static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr)
+{
+ batch->npfns[batch->end - 1] -= nr;
+ if (batch->npfns[batch->end - 1] == 0)
+ batch->end--;
+ batch->total_pfns -= nr;
+}
+
+/* true if the pfn was added, false otherwise */
+static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn)
+{
+ return batch_add_pfn_num(batch, pfn, 1);
+}
+
/*
* Fill the batch with pfns from the domain. When the batch is full, or it
* reaches last_index, the function will return. The caller should use
@@ -622,6 +637,41 @@ static void batch_from_pages(struct pfn_batch *batch, struct page **pages,
break;
}
+static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p,
+ unsigned long *offset_p, unsigned long npages)
+{
+ int rc = 0;
+ struct folio **folios = *folios_p;
+ unsigned long offset = *offset_p;
+
+ while (npages) {
+ struct folio *folio = *folios;
+ unsigned long nr = folio_nr_pages(folio) - offset;
+ unsigned long pfn = page_to_pfn(folio_page(folio, offset));
+
+ nr = min(nr, npages);
+ npages -= nr;
+
+ if (!batch_add_pfn_num(batch, pfn, nr))
+ break;
+ if (nr > 1) {
+ rc = folio_add_pins(folio, nr - 1);
+ if (rc) {
+ batch_remove_pfn_num(batch, nr);
+ goto out;
+ }
+ }
+
+ folios++;
+ offset = 0;
+ }
+
+out:
+ *folios_p = folios;
+ *offset_p = offset;
+ return rc;
+}
+
static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
unsigned int first_page_off, size_t npages)
{
@@ -703,19 +753,32 @@ struct pfn_reader_user {
* neither
*/
int locked;
+
+ /* The following are only valid if file != NULL. */
+ struct file *file;
+ struct folio **ufolios;
+ size_t ufolios_len;
+ unsigned long ufolios_offset;
+ struct folio **ufolios_next;
};
static void pfn_reader_user_init(struct pfn_reader_user *user,
struct iopt_pages *pages)
{
user->upages = NULL;
+ user->upages_len = 0;
user->upages_start = 0;
user->upages_end = 0;
user->locked = -1;
-
user->gup_flags = FOLL_LONGTERM;
if (pages->writable)
user->gup_flags |= FOLL_WRITE;
+
+ user->file = (pages->type == IOPT_ADDRESS_FILE) ? pages->file : NULL;
+ user->ufolios = NULL;
+ user->ufolios_len = 0;
+ user->ufolios_next = NULL;
+ user->ufolios_offset = 0;
}
static void pfn_reader_user_destroy(struct pfn_reader_user *user,
@@ -724,13 +787,67 @@ static void pfn_reader_user_destroy(struct pfn_reader_user *user,
if (user->locked != -1) {
if (user->locked)
mmap_read_unlock(pages->source_mm);
- if (pages->source_mm != current->mm)
+ if (!user->file && pages->source_mm != current->mm)
mmput(pages->source_mm);
user->locked = -1;
}
kfree(user->upages);
user->upages = NULL;
+ kfree(user->ufolios);
+ user->ufolios = NULL;
+}
+
+static long pin_memfd_pages(struct pfn_reader_user *user, unsigned long start,
+ unsigned long npages)
+{
+ unsigned long i;
+ unsigned long offset;
+ unsigned long npages_out = 0;
+ struct page **upages = user->upages;
+ unsigned long end = start + (npages << PAGE_SHIFT) - 1;
+ long nfolios = user->ufolios_len / sizeof(*user->ufolios);
+
+ /*
+ * todo: memfd_pin_folios should return the last pinned offset so
+ * we can compute npages pinned, and avoid looping over folios here
+ * if upages == NULL.
+ */
+ nfolios = memfd_pin_folios(user->file, start, end, user->ufolios,
+ nfolios, &offset);
+ if (nfolios <= 0)
+ return nfolios;
+
+ offset >>= PAGE_SHIFT;
+ user->ufolios_next = user->ufolios;
+ user->ufolios_offset = offset;
+
+ for (i = 0; i < nfolios; i++) {
+ struct folio *folio = user->ufolios[i];
+ unsigned long nr = folio_nr_pages(folio);
+ unsigned long npin = min(nr - offset, npages);
+
+ npages -= npin;
+ npages_out += npin;
+
+ if (upages) {
+ if (npin == 1) {
+ *upages++ = folio_page(folio, offset);
+ } else {
+ int rc = folio_add_pins(folio, npin - 1);
+
+ if (rc)
+ return rc;
+
+ while (npin--)
+ *upages++ = folio_page(folio, offset++);
+ }
+ }
+
+ offset = 0;
+ }
+
+ return npages_out;
}
static int pfn_reader_user_pin(struct pfn_reader_user *user,
@@ -739,7 +856,9 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user,
unsigned long last_index)
{
bool remote_mm = pages->source_mm != current->mm;
- unsigned long npages;
+ unsigned long npages = last_index - start_index + 1;
+ unsigned long start;
+ unsigned long unum;
uintptr_t uptr;
long rc;
@@ -747,40 +866,50 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user,
WARN_ON(last_index < start_index))
return -EINVAL;
- if (!user->upages) {
+ if (!user->file && !user->upages) {
/* All undone in pfn_reader_destroy() */
- user->upages_len =
- (last_index - start_index + 1) * sizeof(*user->upages);
+ user->upages_len = npages * sizeof(*user->upages);
user->upages = temp_kmalloc(&user->upages_len, NULL, 0);
if (!user->upages)
return -ENOMEM;
}
+ if (user->file && !user->ufolios) {
+ user->ufolios_len = npages * sizeof(*user->ufolios);
+ user->ufolios = temp_kmalloc(&user->ufolios_len, NULL, 0);
+ if (!user->ufolios)
+ return -ENOMEM;
+ }
+
if (user->locked == -1) {
/*
* The majority of usages will run the map task within the mm
* providing the pages, so we can optimize into
* get_user_pages_fast()
*/
- if (remote_mm) {
+ if (!user->file && remote_mm) {
if (!mmget_not_zero(pages->source_mm))
return -EFAULT;
}
user->locked = 0;
}
- npages = min_t(unsigned long, last_index - start_index + 1,
- user->upages_len / sizeof(*user->upages));
-
+ unum = user->file ? user->ufolios_len / sizeof(*user->ufolios) :
+ user->upages_len / sizeof(*user->upages);
+ npages = min_t(unsigned long, npages, unum);
if (iommufd_should_fail())
return -EFAULT;
- uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
- if (!remote_mm)
+ if (user->file) {
+ start = pages->start + (start_index * PAGE_SIZE);
+ rc = pin_memfd_pages(user, start, npages);
+ } else if (!remote_mm) {
+ uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
rc = pin_user_pages_fast(uptr, npages, user->gup_flags,
user->upages);
- else {
+ } else {
+ uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
if (!user->locked) {
mmap_read_lock(pages->source_mm);
user->locked = 1;
@@ -838,7 +967,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages,
mmap_read_unlock(pages->source_mm);
user->locked = 0;
/* If we had the lock then we also have a get */
- } else if ((!user || !user->upages) &&
+
+ } else if ((!user || (!user->upages && !user->ufolios)) &&
pages->source_mm != current->mm) {
if (!mmget_not_zero(pages->source_mm))
return -EINVAL;
@@ -855,8 +985,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages,
return rc;
}
-static int do_update_pinned(struct iopt_pages *pages, unsigned long npages,
- bool inc, struct pfn_reader_user *user)
+int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages,
+ bool inc, struct pfn_reader_user *user)
{
int rc = 0;
@@ -890,8 +1020,8 @@ static void update_unpinned(struct iopt_pages *pages)
return;
if (pages->npinned == pages->last_npinned)
return;
- do_update_pinned(pages, pages->last_npinned - pages->npinned, false,
- NULL);
+ iopt_pages_update_pinned(pages, pages->last_npinned - pages->npinned,
+ false, NULL);
}
/*
@@ -921,7 +1051,7 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user,
npages = pages->npinned - pages->last_npinned;
inc = true;
}
- return do_update_pinned(pages, npages, inc, user);
+ return iopt_pages_update_pinned(pages, npages, inc, user);
}
/*
@@ -978,6 +1108,8 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns)
{
struct interval_tree_double_span_iter *span = &pfns->span;
unsigned long start_index = pfns->batch_end_index;
+ struct pfn_reader_user *user = &pfns->user;
+ unsigned long npages;
struct iopt_area *area;
int rc;
@@ -1015,11 +1147,17 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns)
return rc;
}
- batch_from_pages(&pfns->batch,
- pfns->user.upages +
- (start_index - pfns->user.upages_start),
- pfns->user.upages_end - start_index);
- return 0;
+ npages = user->upages_end - start_index;
+ start_index -= user->upages_start;
+ rc = 0;
+
+ if (!user->file)
+ batch_from_pages(&pfns->batch, user->upages + start_index,
+ npages);
+ else
+ rc = batch_from_folios(&pfns->batch, &user->ufolios_next,
+ &user->ufolios_offset, npages);
+ return rc;
}
static bool pfn_reader_done(struct pfn_reader *pfns)
@@ -1092,16 +1230,25 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
static void pfn_reader_release_pins(struct pfn_reader *pfns)
{
struct iopt_pages *pages = pfns->pages;
+ struct pfn_reader_user *user = &pfns->user;
- if (pfns->user.upages_end > pfns->batch_end_index) {
- size_t npages = pfns->user.upages_end - pfns->batch_end_index;
-
+ if (user->upages_end > pfns->batch_end_index) {
/* Any pages not transferred to the batch are just unpinned */
- unpin_user_pages(pfns->user.upages + (pfns->batch_end_index -
- pfns->user.upages_start),
- npages);
+
+ unsigned long npages = user->upages_end - pfns->batch_end_index;
+ unsigned long start_index = pfns->batch_end_index -
+ user->upages_start;
+
+ if (!user->file) {
+ unpin_user_pages(user->upages + start_index, npages);
+ } else {
+ long n = user->ufolios_len / sizeof(*user->ufolios);
+
+ unpin_folios(user->ufolios_next,
+ user->ufolios + n - user->ufolios_next);
+ }
iopt_pages_sub_npinned(pages, npages);
- pfns->user.upages_end = pfns->batch_end_index;
+ user->upages_end = pfns->batch_end_index;
}
if (pfns->batch_start_index != pfns->batch_end_index) {
pfn_reader_unpin(pfns);
@@ -1139,11 +1286,11 @@ static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages,
return 0;
}
-struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
- bool writable)
+static struct iopt_pages *iopt_alloc_pages(unsigned long start_byte,
+ unsigned long length,
+ bool writable)
{
struct iopt_pages *pages;
- unsigned long end;
/*
* The iommu API uses size_t as the length, and protect the DIV_ROUND_UP
@@ -1152,9 +1299,6 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
if (length > SIZE_MAX - PAGE_SIZE || length == 0)
return ERR_PTR(-EINVAL);
- if (check_add_overflow((unsigned long)uptr, length, &end))
- return ERR_PTR(-EOVERFLOW);
-
pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT);
if (!pages)
return ERR_PTR(-ENOMEM);
@@ -1164,8 +1308,7 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
mutex_init(&pages->mutex);
pages->source_mm = current->mm;
mmgrab(pages->source_mm);
- pages->uptr = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE);
- pages->npages = DIV_ROUND_UP(length + (uptr - pages->uptr), PAGE_SIZE);
+ pages->npages = DIV_ROUND_UP(length + start_byte, PAGE_SIZE);
pages->access_itree = RB_ROOT_CACHED;
pages->domains_itree = RB_ROOT_CACHED;
pages->writable = writable;
@@ -1179,6 +1322,45 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
return pages;
}
+struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
+ unsigned long length, bool writable)
+{
+ struct iopt_pages *pages;
+ unsigned long end;
+ void __user *uptr_down =
+ (void __user *) ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE);
+
+ if (check_add_overflow((unsigned long)uptr, length, &end))
+ return ERR_PTR(-EOVERFLOW);
+
+ pages = iopt_alloc_pages(uptr - uptr_down, length, writable);
+ if (IS_ERR(pages))
+ return pages;
+ pages->uptr = uptr_down;
+ pages->type = IOPT_ADDRESS_USER;
+ return pages;
+}
+
+struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start,
+ unsigned long length, bool writable)
+
+{
+ struct iopt_pages *pages;
+ unsigned long start_down = ALIGN_DOWN(start, PAGE_SIZE);
+ unsigned long end;
+
+ if (length && check_add_overflow(start, length - 1, &end))
+ return ERR_PTR(-EOVERFLOW);
+
+ pages = iopt_alloc_pages(start - start_down, length, writable);
+ if (IS_ERR(pages))
+ return pages;
+ pages->file = get_file(file);
+ pages->start = start_down;
+ pages->type = IOPT_ADDRESS_FILE;
+ return pages;
+}
+
void iopt_release_pages(struct kref *kref)
{
struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref);
@@ -1191,6 +1373,8 @@ void iopt_release_pages(struct kref *kref)
mutex_destroy(&pages->mutex);
put_task_struct(pages->source_task);
free_uid(pages->source_user);
+ if (pages->type == IOPT_ADDRESS_FILE)
+ fput(pages->file);
kfree(pages);
}
@@ -1630,11 +1814,11 @@ static int iopt_pages_fill_from_domain(struct iopt_pages *pages,
return 0;
}
-static int iopt_pages_fill_from_mm(struct iopt_pages *pages,
- struct pfn_reader_user *user,
- unsigned long start_index,
- unsigned long last_index,
- struct page **out_pages)
+static int iopt_pages_fill(struct iopt_pages *pages,
+ struct pfn_reader_user *user,
+ unsigned long start_index,
+ unsigned long last_index,
+ struct page **out_pages)
{
unsigned long cur_index = start_index;
int rc;
@@ -1708,8 +1892,8 @@ int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index,
/* hole */
cur_pages = out_pages + (span.start_hole - start_index);
- rc = iopt_pages_fill_from_mm(pages, &user, span.start_hole,
- span.last_hole, cur_pages);
+ rc = iopt_pages_fill(pages, &user, span.start_hole,
+ span.last_hole, cur_pages);
if (rc)
goto out_clean_xa;
rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole,
@@ -1789,6 +1973,10 @@ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index,
struct page *page = NULL;
int rc;
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(pages->type != IOPT_ADDRESS_USER))
+ return -EINVAL;
+
if (!mmget_not_zero(pages->source_mm))
return iopt_pages_rw_slow(pages, index, index, offset, data,
length, flags);
@@ -1844,6 +2032,15 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable)
return -EPERM;
+ if (pages->type == IOPT_ADDRESS_FILE)
+ return iopt_pages_rw_slow(pages, start_index, last_index,
+ start_byte % PAGE_SIZE, data, length,
+ flags);
+
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(pages->type != IOPT_ADDRESS_USER))
+ return -EINVAL;
+
if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) {
if (start_index == last_index)
return iopt_pages_rw_page(pages, start_index,
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 540437be168a..2f9de177dffc 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -126,12 +126,35 @@ struct mock_iommu_domain {
struct xarray pfns;
};
+static inline struct mock_iommu_domain *
+to_mock_domain(struct iommu_domain *domain)
+{
+ return container_of(domain, struct mock_iommu_domain, domain);
+}
+
struct mock_iommu_domain_nested {
struct iommu_domain domain;
+ struct mock_viommu *mock_viommu;
struct mock_iommu_domain *parent;
u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM];
};
+static inline struct mock_iommu_domain_nested *
+to_mock_nested(struct iommu_domain *domain)
+{
+ return container_of(domain, struct mock_iommu_domain_nested, domain);
+}
+
+struct mock_viommu {
+ struct iommufd_viommu core;
+ struct mock_iommu_domain *s2_parent;
+};
+
+static inline struct mock_viommu *to_mock_viommu(struct iommufd_viommu *viommu)
+{
+ return container_of(viommu, struct mock_viommu, core);
+}
+
enum selftest_obj_type {
TYPE_IDEV,
};
@@ -140,8 +163,14 @@ struct mock_dev {
struct device dev;
unsigned long flags;
int id;
+ u32 cache[MOCK_DEV_CACHE_NUM];
};
+static inline struct mock_dev *to_mock_dev(struct device *dev)
+{
+ return container_of(dev, struct mock_dev, dev);
+}
+
struct selftest_obj {
struct iommufd_object obj;
enum selftest_obj_type type;
@@ -155,10 +184,15 @@ struct selftest_obj {
};
};
+static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj)
+{
+ return container_of(obj, struct selftest_obj, obj);
+}
+
static int mock_domain_nop_attach(struct iommu_domain *domain,
struct device *dev)
{
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+ struct mock_dev *mdev = to_mock_dev(dev);
if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY))
return -EINVAL;
@@ -193,8 +227,7 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type)
static int mock_domain_set_dirty_tracking(struct iommu_domain *domain,
bool enable)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
+ struct mock_iommu_domain *mock = to_mock_domain(domain);
unsigned long flags = mock->flags;
if (enable && !domain->dirty_ops)
@@ -243,8 +276,7 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain,
unsigned long flags,
struct iommu_dirty_bitmap *dirty)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
+ struct mock_iommu_domain *mock = to_mock_domain(domain);
unsigned long end = iova + size;
void *ent;
@@ -281,7 +313,7 @@ static const struct iommu_dirty_ops dirty_ops = {
static struct iommu_domain *mock_domain_alloc_paging(struct device *dev)
{
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+ struct mock_dev *mdev = to_mock_dev(dev);
struct mock_iommu_domain *mock;
mock = kzalloc(sizeof(*mock), GFP_KERNEL);
@@ -298,76 +330,85 @@ static struct iommu_domain *mock_domain_alloc_paging(struct device *dev)
return &mock->domain;
}
-static struct iommu_domain *
-__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent,
- const struct iommu_hwpt_selftest *user_cfg)
+static struct mock_iommu_domain_nested *
+__mock_domain_alloc_nested(const struct iommu_user_data *user_data)
{
struct mock_iommu_domain_nested *mock_nested;
- int i;
+ struct iommu_hwpt_selftest user_cfg;
+ int rc, i;
+
+ if (user_data->type != IOMMU_HWPT_DATA_SELFTEST)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ rc = iommu_copy_struct_from_user(&user_cfg, user_data,
+ IOMMU_HWPT_DATA_SELFTEST, iotlb);
+ if (rc)
+ return ERR_PTR(rc);
mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL);
if (!mock_nested)
return ERR_PTR(-ENOMEM);
- mock_nested->parent = mock_parent;
mock_nested->domain.ops = &domain_nested_ops;
mock_nested->domain.type = IOMMU_DOMAIN_NESTED;
for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++)
- mock_nested->iotlb[i] = user_cfg->iotlb;
- return &mock_nested->domain;
+ mock_nested->iotlb[i] = user_cfg.iotlb;
+ return mock_nested;
}
static struct iommu_domain *
-mock_domain_alloc_user(struct device *dev, u32 flags,
- struct iommu_domain *parent,
- const struct iommu_user_data *user_data)
+mock_domain_alloc_nested(struct iommu_domain *parent, u32 flags,
+ const struct iommu_user_data *user_data)
{
+ struct mock_iommu_domain_nested *mock_nested;
struct mock_iommu_domain *mock_parent;
- struct iommu_hwpt_selftest user_cfg;
- int rc;
-
- /* must be mock_domain */
- if (!parent) {
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
- bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
- bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
- struct iommu_domain *domain;
-
- if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT |
- IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
- return ERR_PTR(-EOPNOTSUPP);
- if (user_data || (has_dirty_flag && no_dirty_ops))
- return ERR_PTR(-EOPNOTSUPP);
- domain = mock_domain_alloc_paging(dev);
- if (!domain)
- return ERR_PTR(-ENOMEM);
- if (has_dirty_flag)
- container_of(domain, struct mock_iommu_domain, domain)
- ->domain.dirty_ops = &dirty_ops;
- return domain;
- }
- /* must be mock_domain_nested */
- if (user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags)
+ if (flags)
return ERR_PTR(-EOPNOTSUPP);
if (!parent || parent->ops != mock_ops.default_domain_ops)
return ERR_PTR(-EINVAL);
- mock_parent = container_of(parent, struct mock_iommu_domain, domain);
+ mock_parent = to_mock_domain(parent);
if (!mock_parent)
return ERR_PTR(-EINVAL);
- rc = iommu_copy_struct_from_user(&user_cfg, user_data,
- IOMMU_HWPT_DATA_SELFTEST, iotlb);
- if (rc)
- return ERR_PTR(rc);
+ mock_nested = __mock_domain_alloc_nested(user_data);
+ if (IS_ERR(mock_nested))
+ return ERR_CAST(mock_nested);
+ mock_nested->parent = mock_parent;
+ return &mock_nested->domain;
+}
+
+static struct iommu_domain *
+mock_domain_alloc_user(struct device *dev, u32 flags,
+ struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+{
+ bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_ALLOC_NEST_PARENT;
+ bool no_dirty_ops = to_mock_dev(dev)->flags &
+ MOCK_FLAGS_DEVICE_NO_DIRTY;
+ struct iommu_domain *domain;
- return __mock_domain_alloc_nested(mock_parent, &user_cfg);
+ if (parent)
+ return mock_domain_alloc_nested(parent, flags, user_data);
+
+ if (user_data)
+ return ERR_PTR(-EOPNOTSUPP);
+ if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ domain = mock_domain_alloc_paging(dev);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+ if (has_dirty_flag)
+ domain->dirty_ops = &dirty_ops;
+ return domain;
}
static void mock_domain_free(struct iommu_domain *domain)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
+ struct mock_iommu_domain *mock = to_mock_domain(domain);
WARN_ON(!xa_empty(&mock->pfns));
kfree(mock);
@@ -378,8 +419,7 @@ static int mock_domain_map_pages(struct iommu_domain *domain,
size_t pgsize, size_t pgcount, int prot,
gfp_t gfp, size_t *mapped)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
+ struct mock_iommu_domain *mock = to_mock_domain(domain);
unsigned long flags = MOCK_PFN_START_IOVA;
unsigned long start_iova = iova;
@@ -430,8 +470,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
size_t pgcount,
struct iommu_iotlb_gather *iotlb_gather)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
+ struct mock_iommu_domain *mock = to_mock_domain(domain);
bool first = true;
size_t ret = 0;
void *ent;
@@ -479,8 +518,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
dma_addr_t iova)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
+ struct mock_iommu_domain *mock = to_mock_domain(domain);
void *ent;
WARN_ON(iova % MOCK_IO_PAGE_SIZE);
@@ -491,7 +529,7 @@ static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
{
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+ struct mock_dev *mdev = to_mock_dev(dev);
switch (cap) {
case IOMMU_CAP_CACHE_COHERENCY:
@@ -507,14 +545,17 @@ static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
static struct iopf_queue *mock_iommu_iopf_queue;
-static struct iommu_device mock_iommu_device = {
-};
+static struct mock_iommu_device {
+ struct iommu_device iommu_dev;
+ struct completion complete;
+ refcount_t users;
+} mock_iommu;
static struct iommu_device *mock_probe_device(struct device *dev)
{
if (dev->bus != &iommufd_mock_bus_type.bus)
return ERR_PTR(-ENODEV);
- return &mock_iommu_device;
+ return &mock_iommu.iommu_dev;
}
static void mock_domain_page_response(struct device *dev, struct iopf_fault *evt,
@@ -540,6 +581,132 @@ static int mock_dev_disable_feat(struct device *dev, enum iommu_dev_features fea
return 0;
}
+static void mock_viommu_destroy(struct iommufd_viommu *viommu)
+{
+ struct mock_iommu_device *mock_iommu = container_of(
+ viommu->iommu_dev, struct mock_iommu_device, iommu_dev);
+
+ if (refcount_dec_and_test(&mock_iommu->users))
+ complete(&mock_iommu->complete);
+
+ /* iommufd core frees mock_viommu and viommu */
+}
+
+static struct iommu_domain *
+mock_viommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
+ const struct iommu_user_data *user_data)
+{
+ struct mock_viommu *mock_viommu = to_mock_viommu(viommu);
+ struct mock_iommu_domain_nested *mock_nested;
+
+ if (flags & ~IOMMU_HWPT_FAULT_ID_VALID)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ mock_nested = __mock_domain_alloc_nested(user_data);
+ if (IS_ERR(mock_nested))
+ return ERR_CAST(mock_nested);
+ mock_nested->mock_viommu = mock_viommu;
+ mock_nested->parent = mock_viommu->s2_parent;
+ return &mock_nested->domain;
+}
+
+static int mock_viommu_cache_invalidate(struct iommufd_viommu *viommu,
+ struct iommu_user_data_array *array)
+{
+ struct iommu_viommu_invalidate_selftest *cmds;
+ struct iommu_viommu_invalidate_selftest *cur;
+ struct iommu_viommu_invalidate_selftest *end;
+ int rc;
+
+ /* A zero-length array is allowed to validate the array type */
+ if (array->entry_num == 0 &&
+ array->type == IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST) {
+ array->entry_num = 0;
+ return 0;
+ }
+
+ cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL);
+ if (!cmds)
+ return -ENOMEM;
+ cur = cmds;
+ end = cmds + array->entry_num;
+
+ static_assert(sizeof(*cmds) == 3 * sizeof(u32));
+ rc = iommu_copy_struct_from_full_user_array(
+ cmds, sizeof(*cmds), array,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST);
+ if (rc)
+ goto out;
+
+ while (cur != end) {
+ struct mock_dev *mdev;
+ struct device *dev;
+ int i;
+
+ if (cur->flags & ~IOMMU_TEST_INVALIDATE_FLAG_ALL) {
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (cur->cache_id > MOCK_DEV_CACHE_ID_MAX) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ xa_lock(&viommu->vdevs);
+ dev = iommufd_viommu_find_dev(viommu,
+ (unsigned long)cur->vdev_id);
+ if (!dev) {
+ xa_unlock(&viommu->vdevs);
+ rc = -EINVAL;
+ goto out;
+ }
+ mdev = container_of(dev, struct mock_dev, dev);
+
+ if (cur->flags & IOMMU_TEST_INVALIDATE_FLAG_ALL) {
+ /* Invalidate all cache entries and ignore cache_id */
+ for (i = 0; i < MOCK_DEV_CACHE_NUM; i++)
+ mdev->cache[i] = 0;
+ } else {
+ mdev->cache[cur->cache_id] = 0;
+ }
+ xa_unlock(&viommu->vdevs);
+
+ cur++;
+ }
+out:
+ array->entry_num = cur - cmds;
+ kfree(cmds);
+ return rc;
+}
+
+static struct iommufd_viommu_ops mock_viommu_ops = {
+ .destroy = mock_viommu_destroy,
+ .alloc_domain_nested = mock_viommu_alloc_domain_nested,
+ .cache_invalidate = mock_viommu_cache_invalidate,
+};
+
+static struct iommufd_viommu *mock_viommu_alloc(struct device *dev,
+ struct iommu_domain *domain,
+ struct iommufd_ctx *ictx,
+ unsigned int viommu_type)
+{
+ struct mock_iommu_device *mock_iommu =
+ iommu_get_iommu_dev(dev, struct mock_iommu_device, iommu_dev);
+ struct mock_viommu *mock_viommu;
+
+ if (viommu_type != IOMMU_VIOMMU_TYPE_SELFTEST)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ mock_viommu = iommufd_viommu_alloc(ictx, struct mock_viommu, core,
+ &mock_viommu_ops);
+ if (IS_ERR(mock_viommu))
+ return ERR_CAST(mock_viommu);
+
+ refcount_inc(&mock_iommu->users);
+ return &mock_viommu->core;
+}
+
static const struct iommu_ops mock_ops = {
/*
* IOMMU_DOMAIN_BLOCKED cannot be returned from def_domain_type()
@@ -559,6 +726,7 @@ static const struct iommu_ops mock_ops = {
.dev_enable_feat = mock_dev_enable_feat,
.dev_disable_feat = mock_dev_disable_feat,
.user_pasid_table = true,
+ .viommu_alloc = mock_viommu_alloc,
.default_domain_ops =
&(struct iommu_domain_ops){
.free = mock_domain_free,
@@ -571,18 +739,14 @@ static const struct iommu_ops mock_ops = {
static void mock_domain_free_nested(struct iommu_domain *domain)
{
- struct mock_iommu_domain_nested *mock_nested =
- container_of(domain, struct mock_iommu_domain_nested, domain);
-
- kfree(mock_nested);
+ kfree(to_mock_nested(domain));
}
static int
mock_domain_cache_invalidate_user(struct iommu_domain *domain,
struct iommu_user_data_array *array)
{
- struct mock_iommu_domain_nested *mock_nested =
- container_of(domain, struct mock_iommu_domain_nested, domain);
+ struct mock_iommu_domain_nested *mock_nested = to_mock_nested(domain);
struct iommu_hwpt_invalidate_selftest inv;
u32 processed = 0;
int i = 0, j;
@@ -657,7 +821,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
iommufd_put_object(ucmd->ictx, &hwpt->obj);
return ERR_PTR(-EINVAL);
}
- *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain);
+ *mock = to_mock_domain(hwpt->domain);
return hwpt;
}
@@ -675,14 +839,13 @@ get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id,
iommufd_put_object(ucmd->ictx, &hwpt->obj);
return ERR_PTR(-EINVAL);
}
- *mock_nested = container_of(hwpt->domain,
- struct mock_iommu_domain_nested, domain);
+ *mock_nested = to_mock_nested(hwpt->domain);
return hwpt;
}
static void mock_dev_release(struct device *dev)
{
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+ struct mock_dev *mdev = to_mock_dev(dev);
ida_free(&mock_dev_ida, mdev->id);
kfree(mdev);
@@ -691,7 +854,7 @@ static void mock_dev_release(struct device *dev)
static struct mock_dev *mock_dev_create(unsigned long dev_flags)
{
struct mock_dev *mdev;
- int rc;
+ int rc, i;
if (dev_flags &
~(MOCK_FLAGS_DEVICE_NO_DIRTY | MOCK_FLAGS_DEVICE_HUGE_IOVA))
@@ -705,6 +868,8 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags)
mdev->flags = dev_flags;
mdev->dev.release = mock_dev_release;
mdev->dev.bus = &iommufd_mock_bus_type.bus;
+ for (i = 0; i < MOCK_DEV_CACHE_NUM; i++)
+ mdev->cache[i] = IOMMU_TEST_DEV_CACHE_DEFAULT;
rc = ida_alloc(&mock_dev_ida, GFP_KERNEL);
if (rc < 0)
@@ -813,7 +978,7 @@ static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd,
if (IS_ERR(dev_obj))
return PTR_ERR(dev_obj);
- sobj = container_of(dev_obj, struct selftest_obj, obj);
+ sobj = to_selftest_obj(dev_obj);
if (sobj->type != TYPE_IDEV) {
rc = -EINVAL;
goto out_dev_obj;
@@ -951,8 +1116,7 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd,
if (IS_ERR(hwpt))
return PTR_ERR(hwpt);
- mock_nested = container_of(hwpt->domain,
- struct mock_iommu_domain_nested, domain);
+ mock_nested = to_mock_nested(hwpt->domain);
if (iotlb_id > MOCK_NESTED_DOMAIN_IOTLB_ID_MAX ||
mock_nested->iotlb[iotlb_id] != iotlb)
@@ -961,6 +1125,24 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd,
return rc;
}
+static int iommufd_test_dev_check_cache(struct iommufd_ucmd *ucmd, u32 idev_id,
+ unsigned int cache_id, u32 cache)
+{
+ struct iommufd_device *idev;
+ struct mock_dev *mdev;
+ int rc = 0;
+
+ idev = iommufd_get_device(ucmd, idev_id);
+ if (IS_ERR(idev))
+ return PTR_ERR(idev);
+ mdev = container_of(idev->dev, struct mock_dev, dev);
+
+ if (cache_id > MOCK_DEV_CACHE_ID_MAX || mdev->cache[cache_id] != cache)
+ rc = -EINVAL;
+ iommufd_put_object(ucmd->ictx, &idev->obj);
+ return rc;
+}
+
struct selftest_access {
struct iommufd_access *access;
struct file *file;
@@ -1431,7 +1613,7 @@ static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd,
void iommufd_selftest_destroy(struct iommufd_object *obj)
{
- struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj);
+ struct selftest_obj *sobj = to_selftest_obj(obj);
switch (sobj->type) {
case TYPE_IDEV:
@@ -1470,6 +1652,10 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
return iommufd_test_md_check_iotlb(ucmd, cmd->id,
cmd->check_iotlb.id,
cmd->check_iotlb.iotlb);
+ case IOMMU_TEST_OP_DEV_CHECK_CACHE:
+ return iommufd_test_dev_check_cache(ucmd, cmd->id,
+ cmd->check_dev_cache.id,
+ cmd->check_dev_cache.cache);
case IOMMU_TEST_OP_CREATE_ACCESS:
return iommufd_test_create_access(ucmd, cmd->id,
cmd->create_access.flags);
@@ -1536,24 +1722,27 @@ int __init iommufd_test_init(void)
if (rc)
goto err_platform;
- rc = iommu_device_sysfs_add(&mock_iommu_device,
+ rc = iommu_device_sysfs_add(&mock_iommu.iommu_dev,
&selftest_iommu_dev->dev, NULL, "%s",
dev_name(&selftest_iommu_dev->dev));
if (rc)
goto err_bus;
- rc = iommu_device_register_bus(&mock_iommu_device, &mock_ops,
+ rc = iommu_device_register_bus(&mock_iommu.iommu_dev, &mock_ops,
&iommufd_mock_bus_type.bus,
&iommufd_mock_bus_type.nb);
if (rc)
goto err_sysfs;
+ refcount_set(&mock_iommu.users, 1);
+ init_completion(&mock_iommu.complete);
+
mock_iommu_iopf_queue = iopf_queue_alloc("mock-iopfq");
return 0;
err_sysfs:
- iommu_device_sysfs_remove(&mock_iommu_device);
+ iommu_device_sysfs_remove(&mock_iommu.iommu_dev);
err_bus:
bus_unregister(&iommufd_mock_bus_type.bus);
err_platform:
@@ -1563,6 +1752,22 @@ err_dbgfs:
return rc;
}
+static void iommufd_test_wait_for_users(void)
+{
+ if (refcount_dec_and_test(&mock_iommu.users))
+ return;
+ /*
+ * Time out waiting for iommu device user count to become 0.
+ *
+ * Note that this is just making an example here, since the selftest is
+ * built into the iommufd module, i.e. it only unplugs the iommu device
+ * when unloading the module. So, it is expected that this WARN_ON will
+ * not trigger, as long as any iommufd FDs are open.
+ */
+ WARN_ON(!wait_for_completion_timeout(&mock_iommu.complete,
+ msecs_to_jiffies(10000)));
+}
+
void iommufd_test_exit(void)
{
if (mock_iommu_iopf_queue) {
@@ -1570,8 +1775,9 @@ void iommufd_test_exit(void)
mock_iommu_iopf_queue = NULL;
}
- iommu_device_sysfs_remove(&mock_iommu_device);
- iommu_device_unregister_bus(&mock_iommu_device,
+ iommufd_test_wait_for_users();
+ iommu_device_sysfs_remove(&mock_iommu.iommu_dev);
+ iommu_device_unregister_bus(&mock_iommu.iommu_dev,
&iommufd_mock_bus_type.bus,
&iommufd_mock_bus_type.nb);
bus_unregister(&iommufd_mock_bus_type.bus);
diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c
index a3ad5f0b6c59..514aacd64009 100644
--- a/drivers/iommu/iommufd/vfio_compat.c
+++ b/drivers/iommu/iommufd/vfio_compat.c
@@ -291,12 +291,7 @@ static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
case VFIO_DMA_CC_IOMMU:
return iommufd_vfio_cc_iommu(ictx);
- /*
- * This is obsolete, and to be removed from VFIO. It was an incomplete
- * idea that got merged.
- * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
- */
- case VFIO_TYPE1_NESTING_IOMMU:
+ case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
return 0;
/*
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
new file mode 100644
index 000000000000..69b88e8c7c26
--- /dev/null
+++ b/drivers/iommu/iommufd/viommu.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "iommufd_private.h"
+
+void iommufd_viommu_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_viommu *viommu =
+ container_of(obj, struct iommufd_viommu, obj);
+
+ if (viommu->ops && viommu->ops->destroy)
+ viommu->ops->destroy(viommu);
+ refcount_dec(&viommu->hwpt->common.obj.users);
+ xa_destroy(&viommu->vdevs);
+}
+
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_viommu_alloc *cmd = ucmd->cmd;
+ struct iommufd_hwpt_paging *hwpt_paging;
+ struct iommufd_viommu *viommu;
+ struct iommufd_device *idev;
+ const struct iommu_ops *ops;
+ int rc;
+
+ if (cmd->flags || cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT)
+ return -EOPNOTSUPP;
+
+ idev = iommufd_get_device(ucmd, cmd->dev_id);
+ if (IS_ERR(idev))
+ return PTR_ERR(idev);
+
+ ops = dev_iommu_ops(idev->dev);
+ if (!ops->viommu_alloc) {
+ rc = -EOPNOTSUPP;
+ goto out_put_idev;
+ }
+
+ hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+ if (IS_ERR(hwpt_paging)) {
+ rc = PTR_ERR(hwpt_paging);
+ goto out_put_idev;
+ }
+
+ if (!hwpt_paging->nest_parent) {
+ rc = -EINVAL;
+ goto out_put_hwpt;
+ }
+
+ viommu = ops->viommu_alloc(idev->dev, hwpt_paging->common.domain,
+ ucmd->ictx, cmd->type);
+ if (IS_ERR(viommu)) {
+ rc = PTR_ERR(viommu);
+ goto out_put_hwpt;
+ }
+
+ xa_init(&viommu->vdevs);
+ viommu->type = cmd->type;
+ viommu->ictx = ucmd->ictx;
+ viommu->hwpt = hwpt_paging;
+ refcount_inc(&viommu->hwpt->common.obj.users);
+ /*
+ * It is the most likely case that a physical IOMMU is unpluggable. A
+ * pluggable IOMMU instance (if exists) is responsible for refcounting
+ * on its own.
+ */
+ viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev);
+
+ cmd->out_viommu_id = viommu->obj.id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_abort;
+ iommufd_object_finalize(ucmd->ictx, &viommu->obj);
+ goto out_put_hwpt;
+
+out_abort:
+ iommufd_object_abort_and_destroy(ucmd->ictx, &viommu->obj);
+out_put_hwpt:
+ iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj);
+out_put_idev:
+ iommufd_put_object(ucmd->ictx, &idev->obj);
+ return rc;
+}
+
+void iommufd_vdevice_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_vdevice *vdev =
+ container_of(obj, struct iommufd_vdevice, obj);
+ struct iommufd_viommu *viommu = vdev->viommu;
+
+ /* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
+ xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL);
+ refcount_dec(&viommu->obj.users);
+ put_device(vdev->dev);
+}
+
+int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_vdevice_alloc *cmd = ucmd->cmd;
+ struct iommufd_vdevice *vdev, *curr;
+ struct iommufd_viommu *viommu;
+ struct iommufd_device *idev;
+ u64 virt_id = cmd->virt_id;
+ int rc = 0;
+
+ /* virt_id indexes an xarray */
+ if (virt_id > ULONG_MAX)
+ return -EINVAL;
+
+ viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
+ if (IS_ERR(viommu))
+ return PTR_ERR(viommu);
+
+ idev = iommufd_get_device(ucmd, cmd->dev_id);
+ if (IS_ERR(idev)) {
+ rc = PTR_ERR(idev);
+ goto out_put_viommu;
+ }
+
+ if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) {
+ rc = -EINVAL;
+ goto out_put_idev;
+ }
+
+ vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE);
+ if (IS_ERR(vdev)) {
+ rc = PTR_ERR(vdev);
+ goto out_put_idev;
+ }
+
+ vdev->id = virt_id;
+ vdev->dev = idev->dev;
+ get_device(idev->dev);
+ vdev->viommu = viommu;
+ refcount_inc(&viommu->obj.users);
+
+ curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL);
+ if (curr) {
+ rc = xa_err(curr) ?: -EEXIST;
+ goto out_abort;
+ }
+
+ cmd->out_vdevice_id = vdev->obj.id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_abort;
+ iommufd_object_finalize(ucmd->ictx, &vdev->obj);
+ goto out_put_idev;
+
+out_abort:
+ iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
+out_put_idev:
+ iommufd_put_object(ucmd->ictx, &idev->obj);
+out_put_viommu:
+ iommufd_put_object(ucmd->ictx, &viommu->obj);
+ return rc;
+}
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index bf391b40e576..50ebc9593c9d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,7 +72,6 @@ struct vfio_iommu {
uint64_t pgsize_bitmap;
uint64_t num_non_pinned_groups;
bool v2;
- bool nesting;
bool dirty_page_tracking;
struct list_head emulated_iommu_groups;
};
@@ -2195,12 +2194,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
goto out_free_domain;
}
- if (iommu->nesting) {
- ret = iommu_enable_nesting(domain->domain);
- if (ret)
- goto out_domain;
- }
-
ret = iommu_attach_group(domain->domain, group->iommu_group);
if (ret)
goto out_domain;
@@ -2541,9 +2534,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
switch (arg) {
case VFIO_TYPE1_IOMMU:
break;
- case VFIO_TYPE1_NESTING_IOMMU:
- iommu->nesting = true;
- fallthrough;
+ case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
case VFIO_TYPE1v2_IOMMU:
iommu->v2 = true;
break;
@@ -2638,7 +2629,6 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
switch (arg) {
case VFIO_TYPE1_IOMMU:
case VFIO_TYPE1v2_IOMMU:
- case VFIO_TYPE1_NESTING_IOMMU:
case VFIO_UNMAP_ALL:
return 1;
case VFIO_UPDATE_VADDR:
diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index d3858eebc255..2e917a8f8bca 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -453,7 +453,7 @@ struct acpi_table_ccel {
* IORT - IO Remapping Table
*
* Conforms to "IO Remapping Table System Software on ARM Platforms",
- * Document number: ARM DEN 0049E.e, Sep 2022
+ * Document number: ARM DEN 0049E.f, Apr 2024
*
******************************************************************************/
@@ -524,6 +524,7 @@ struct acpi_iort_memory_access {
#define ACPI_IORT_MF_COHERENCY (1)
#define ACPI_IORT_MF_ATTRIBUTES (1<<1)
+#define ACPI_IORT_MF_CANWBS (1<<2)
/*
* IORT node specific subtables
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index b1ecfc3cd5bc..ce86b09ae80f 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -87,6 +87,7 @@ struct io_pgtable_cfg {
* attributes set in the TCR for a non-coherent page-table walker.
*
* IO_PGTABLE_QUIRK_ARM_HD: Enables dirty tracking in stage 1 pagetable.
+ * IO_PGTABLE_QUIRK_ARM_S2FWB: Use the FWB format for the MemAttrs bits
*/
#define IO_PGTABLE_QUIRK_ARM_NS BIT(0)
#define IO_PGTABLE_QUIRK_NO_PERMS BIT(1)
@@ -95,6 +96,7 @@ struct io_pgtable_cfg {
#define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5)
#define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6)
#define IO_PGTABLE_QUIRK_ARM_HD BIT(7)
+ #define IO_PGTABLE_QUIRK_ARM_S2FWB BIT(8)
unsigned long quirks;
unsigned long pgsize_bitmap;
unsigned int ias;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index bd722f473635..8b02adbd14f7 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -42,6 +42,8 @@ struct notifier_block;
struct iommu_sva;
struct iommu_dma_cookie;
struct iommu_fault_param;
+struct iommufd_ctx;
+struct iommufd_viommu;
#define IOMMU_FAULT_PERM_READ (1 << 0) /* read */
#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */
@@ -491,7 +493,9 @@ static inline int __iommu_copy_struct_from_user_array(
* @index: Index to the location in the array to copy user data from
* @min_last: The last member of the data structure @kdst points in the
* initial version.
- * Return 0 for success, otherwise -error.
+ *
+ * Copy a single entry from a user array. Return 0 for success, otherwise
+ * -error.
*/
#define iommu_copy_struct_from_user_array(kdst, user_array, data_type, index, \
min_last) \
@@ -500,6 +504,50 @@ static inline int __iommu_copy_struct_from_user_array(
offsetofend(typeof(*(kdst)), min_last))
/**
+ * iommu_copy_struct_from_full_user_array - Copy iommu driver specific user
+ * space data from an iommu_user_data_array
+ * @kdst: Pointer to an iommu driver specific user data that is defined in
+ * include/uapi/linux/iommufd.h
+ * @kdst_entry_size: sizeof(*kdst)
+ * @user_array: Pointer to a struct iommu_user_data_array for a user space
+ * array
+ * @data_type: The data type of the @kdst. Must match with @user_array->type
+ *
+ * Copy the entire user array. kdst must have room for kdst_entry_size *
+ * user_array->entry_num bytes. Return 0 for success, otherwise -error.
+ */
+static inline int
+iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size,
+ struct iommu_user_data_array *user_array,
+ unsigned int data_type)
+{
+ unsigned int i;
+ int ret;
+
+ if (user_array->type != data_type)
+ return -EINVAL;
+ if (!user_array->entry_num)
+ return -EINVAL;
+ if (likely(user_array->entry_len == kdst_entry_size)) {
+ if (copy_from_user(kdst, user_array->uptr,
+ user_array->entry_num *
+ user_array->entry_len))
+ return -EFAULT;
+ }
+
+ /* Copy item by item */
+ for (i = 0; i != user_array->entry_num; i++) {
+ ret = copy_struct_from_user(
+ kdst + kdst_entry_size * i, kdst_entry_size,
+ user_array->uptr + user_array->entry_len * i,
+ user_array->entry_len);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
* struct iommu_ops - iommu ops and capabilities
* @capable: check capability
* @hw_info: report iommu hardware information. The data buffer returned by this
@@ -542,6 +590,14 @@ static inline int __iommu_copy_struct_from_user_array(
* @remove_dev_pasid: Remove any translation configurations of a specific
* pasid, so that any DMA transactions with this pasid
* will be blocked by the hardware.
+ * @viommu_alloc: Allocate an iommufd_viommu on a physical IOMMU instance behind
+ * the @dev, as the set of virtualization resources shared/passed
+ * to user space IOMMU instance. And associate it with a nesting
+ * @parent_domain. The @viommu_type must be defined in the header
+ * include/uapi/linux/iommufd.h
+ * It is required to call iommufd_viommu_alloc() helper for
+ * a bundled allocation of the core and the driver structures,
+ * using the given @ictx pointer.
* @pgsize_bitmap: bitmap of all possible supported page sizes
* @owner: Driver module providing these ops
* @identity_domain: An always available, always attachable identity
@@ -591,6 +647,10 @@ struct iommu_ops {
void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid,
struct iommu_domain *domain);
+ struct iommufd_viommu *(*viommu_alloc)(
+ struct device *dev, struct iommu_domain *parent_domain,
+ struct iommufd_ctx *ictx, unsigned int viommu_type);
+
const struct iommu_domain_ops *default_domain_ops;
unsigned long pgsize_bitmap;
struct module *owner;
@@ -635,7 +695,6 @@ struct iommu_ops {
* @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE,
* including no-snoop TLPs on PCIe or other platform
* specific mechanisms.
- * @enable_nesting: Enable nesting
* @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*)
* @free: Release the domain after use.
*/
@@ -663,7 +722,6 @@ struct iommu_domain_ops {
dma_addr_t iova);
bool (*enforce_cache_coherency)(struct iommu_domain *domain);
- int (*enable_nesting)(struct iommu_domain *domain);
int (*set_pgtable_quirks)(struct iommu_domain *domain,
unsigned long quirks);
@@ -844,7 +902,6 @@ extern void iommu_group_put(struct iommu_group *group);
extern int iommu_group_id(struct iommu_group *group);
extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
-int iommu_enable_nesting(struct iommu_domain *domain);
int iommu_set_pgtable_quirks(struct iommu_domain *domain,
unsigned long quirks);
@@ -994,6 +1051,8 @@ struct iommu_fwspec {
/* ATS is supported */
#define IOMMU_FWSPEC_PCI_RC_ATS (1 << 0)
+/* CANWBS is supported */
+#define IOMMU_FWSPEC_PCI_RC_CANWBS (1 << 1)
/*
* An iommu attach handle represents a relationship between an iommu domain
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 30f832a60ccb..11110c749200 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -8,16 +8,46 @@
#include <linux/err.h>
#include <linux/errno.h>
+#include <linux/refcount.h>
#include <linux/types.h>
+#include <linux/xarray.h>
struct device;
struct file;
struct iommu_group;
+struct iommu_user_data;
+struct iommu_user_data_array;
struct iommufd_access;
struct iommufd_ctx;
struct iommufd_device;
+struct iommufd_viommu_ops;
struct page;
+enum iommufd_object_type {
+ IOMMUFD_OBJ_NONE,
+ IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
+ IOMMUFD_OBJ_DEVICE,
+ IOMMUFD_OBJ_HWPT_PAGING,
+ IOMMUFD_OBJ_HWPT_NESTED,
+ IOMMUFD_OBJ_IOAS,
+ IOMMUFD_OBJ_ACCESS,
+ IOMMUFD_OBJ_FAULT,
+ IOMMUFD_OBJ_VIOMMU,
+ IOMMUFD_OBJ_VDEVICE,
+#ifdef CONFIG_IOMMUFD_TEST
+ IOMMUFD_OBJ_SELFTEST,
+#endif
+ IOMMUFD_OBJ_MAX,
+};
+
+/* Base struct for all objects with a userspace ID handle. */
+struct iommufd_object {
+ refcount_t shortterm_users;
+ refcount_t users;
+ enum iommufd_object_type type;
+ unsigned int id;
+};
+
struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
struct device *dev, u32 *id);
void iommufd_device_unbind(struct iommufd_device *idev);
@@ -54,6 +84,45 @@ void iommufd_access_detach(struct iommufd_access *access);
void iommufd_ctx_get(struct iommufd_ctx *ictx);
+struct iommufd_viommu {
+ struct iommufd_object obj;
+ struct iommufd_ctx *ictx;
+ struct iommu_device *iommu_dev;
+ struct iommufd_hwpt_paging *hwpt;
+
+ const struct iommufd_viommu_ops *ops;
+
+ struct xarray vdevs;
+
+ unsigned int type;
+};
+
+/**
+ * struct iommufd_viommu_ops - vIOMMU specific operations
+ * @destroy: Clean up all driver-specific parts of an iommufd_viommu. The memory
+ * of the vIOMMU will be free-ed by iommufd core after calling this op
+ * @alloc_domain_nested: Allocate a IOMMU_DOMAIN_NESTED on a vIOMMU that holds a
+ * nesting parent domain (IOMMU_DOMAIN_PAGING). @user_data
+ * must be defined in include/uapi/linux/iommufd.h.
+ * It must fully initialize the new iommu_domain before
+ * returning. Upon failure, ERR_PTR must be returned.
+ * @cache_invalidate: Flush hardware cache used by a vIOMMU. It can be used for
+ * any IOMMU hardware specific cache: TLB and device cache.
+ * The @array passes in the cache invalidation requests, in
+ * form of a driver data structure. A driver must update the
+ * array->entry_num to report the number of handled requests.
+ * The data structure of the array entry must be defined in
+ * include/uapi/linux/iommufd.h
+ */
+struct iommufd_viommu_ops {
+ void (*destroy)(struct iommufd_viommu *viommu);
+ struct iommu_domain *(*alloc_domain_nested)(
+ struct iommufd_viommu *viommu, u32 flags,
+ const struct iommu_user_data *user_data);
+ int (*cache_invalidate)(struct iommufd_viommu *viommu,
+ struct iommu_user_data_array *array);
+};
+
#if IS_ENABLED(CONFIG_IOMMUFD)
struct iommufd_ctx *iommufd_ctx_from_file(struct file *file);
struct iommufd_ctx *iommufd_ctx_from_fd(int fd);
@@ -111,4 +180,43 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
return -EOPNOTSUPP;
}
#endif /* CONFIG_IOMMUFD */
+
+#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER_CORE)
+struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
+ size_t size,
+ enum iommufd_object_type type);
+struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
+ unsigned long vdev_id);
+#else /* !CONFIG_IOMMUFD_DRIVER_CORE */
+static inline struct iommufd_object *
+_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size,
+ enum iommufd_object_type type)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline struct device *
+iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id)
+{
+ return NULL;
+}
+#endif /* CONFIG_IOMMUFD_DRIVER_CORE */
+
+/*
+ * Helpers for IOMMU driver to allocate driver structures that will be freed by
+ * the iommufd core. The free op will be called prior to freeing the memory.
+ */
+#define iommufd_viommu_alloc(ictx, drv_struct, member, viommu_ops) \
+ ({ \
+ drv_struct *ret; \
+ \
+ static_assert(__same_type(struct iommufd_viommu, \
+ ((drv_struct *)NULL)->member)); \
+ static_assert(offsetof(drv_struct, member.obj) == 0); \
+ ret = (drv_struct *)_iommufd_object_alloc( \
+ ictx, sizeof(drv_struct), IOMMUFD_OBJ_VIOMMU); \
+ if (!IS_ERR(ret)) \
+ ret->member.ops = viommu_ops; \
+ ret; \
+ })
#endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index feb5c8021bef..673771f34674 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2536,6 +2536,7 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
struct folio **folios, unsigned int max_folios,
pgoff_t *offset);
+int folio_add_pins(struct folio *folio, unsigned int pins);
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 72010f71c5e4..4ae8b1ee0444 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -51,6 +51,10 @@ enum {
IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c,
IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d,
IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e,
+ IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f,
+ IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
+ IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
+ IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
};
/**
@@ -214,6 +218,30 @@ struct iommu_ioas_map {
#define IOMMU_IOAS_MAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP)
/**
+ * struct iommu_ioas_map_file - ioctl(IOMMU_IOAS_MAP_FILE)
+ * @size: sizeof(struct iommu_ioas_map_file)
+ * @flags: same as for iommu_ioas_map
+ * @ioas_id: same as for iommu_ioas_map
+ * @fd: the memfd to map
+ * @start: byte offset from start of file to map from
+ * @length: same as for iommu_ioas_map
+ * @iova: same as for iommu_ioas_map
+ *
+ * Set an IOVA mapping from a memfd file. All other arguments and semantics
+ * match those of IOMMU_IOAS_MAP.
+ */
+struct iommu_ioas_map_file {
+ __u32 size;
+ __u32 flags;
+ __u32 ioas_id;
+ __s32 fd;
+ __aligned_u64 start;
+ __aligned_u64 length;
+ __aligned_u64 iova;
+};
+#define IOMMU_IOAS_MAP_FILE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP_FILE)
+
+/**
* struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY)
* @size: sizeof(struct iommu_ioas_copy)
* @flags: Combination of enum iommufd_ioas_map_flags
@@ -395,13 +423,35 @@ struct iommu_hwpt_vtd_s1 {
};
/**
+ * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 nested STE
+ * (IOMMU_HWPT_DATA_ARM_SMMUV3)
+ *
+ * @ste: The first two double words of the user space Stream Table Entry for
+ * the translation. Must be little-endian.
+ * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec)
+ * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax
+ * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD
+ *
+ * -EIO will be returned if @ste is not legal or contains any non-allowed field.
+ * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass
+ * nested domain will translate the same as the nesting parent. The S1 will
+ * install a Context Descriptor Table pointing at userspace memory translated
+ * by the nesting parent.
+ */
+struct iommu_hwpt_arm_smmuv3 {
+ __aligned_le64 ste[2];
+};
+
+/**
* enum iommu_hwpt_data_type - IOMMU HWPT Data Type
* @IOMMU_HWPT_DATA_NONE: no data
* @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table
+ * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table
*/
enum iommu_hwpt_data_type {
IOMMU_HWPT_DATA_NONE = 0,
IOMMU_HWPT_DATA_VTD_S1 = 1,
+ IOMMU_HWPT_DATA_ARM_SMMUV3 = 2,
};
/**
@@ -409,7 +459,7 @@ enum iommu_hwpt_data_type {
* @size: sizeof(struct iommu_hwpt_alloc)
* @flags: Combination of enum iommufd_hwpt_alloc_flags
* @dev_id: The device to allocate this HWPT for
- * @pt_id: The IOAS or HWPT to connect this HWPT to
+ * @pt_id: The IOAS or HWPT or vIOMMU to connect this HWPT to
* @out_hwpt_id: The ID of the new HWPT
* @__reserved: Must be 0
* @data_type: One of enum iommu_hwpt_data_type
@@ -428,11 +478,13 @@ enum iommu_hwpt_data_type {
* IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a
* nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags.
*
- * A user-managed nested HWPT will be created from a given parent HWPT via
- * @pt_id, in which the parent HWPT must be allocated previously via the
- * same ioctl from a given IOAS (@pt_id). In this case, the @data_type
- * must be set to a pre-defined type corresponding to an I/O page table
- * type supported by the underlying IOMMU hardware.
+ * A user-managed nested HWPT will be created from a given vIOMMU (wrapping a
+ * parent HWPT) or a parent HWPT via @pt_id, in which the parent HWPT must be
+ * allocated previously via the same ioctl from a given IOAS (@pt_id). In this
+ * case, the @data_type must be set to a pre-defined type corresponding to an
+ * I/O page table type supported by the underlying IOMMU hardware. The device
+ * via @dev_id and the vIOMMU via @pt_id must be associated to the same IOMMU
+ * instance.
*
* If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and
* @data_uptr should be zero. Otherwise, both @data_len and @data_uptr
@@ -485,14 +537,49 @@ struct iommu_hw_info_vtd {
};
/**
+ * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information
+ * (IOMMU_HW_INFO_TYPE_ARM_SMMUV3)
+ *
+ * @flags: Must be set to 0
+ * @__reserved: Must be 0
+ * @idr: Implemented features for ARM SMMU Non-secure programming interface
+ * @iidr: Information about the implementation and implementer of ARM SMMU,
+ * and architecture version supported
+ * @aidr: ARM SMMU architecture version
+ *
+ * For the details of @idr, @iidr and @aidr, please refer to the chapters
+ * from 6.3.1 to 6.3.6 in the SMMUv3 Spec.
+ *
+ * User space should read the underlying ARM SMMUv3 hardware information for
+ * the list of supported features.
+ *
+ * Note that these values reflect the raw HW capability, without any insight if
+ * any required kernel driver support is present. Bits may be set indicating the
+ * HW has functionality that is lacking kernel software support, such as BTM. If
+ * a VMM is using this information to construct emulated copies of these
+ * registers it should only forward bits that it knows it can support.
+ *
+ * In future, presence of required kernel support will be indicated in flags.
+ */
+struct iommu_hw_info_arm_smmuv3 {
+ __u32 flags;
+ __u32 __reserved;
+ __u32 idr[6];
+ __u32 iidr;
+ __u32 aidr;
+};
+
+/**
* enum iommu_hw_info_type - IOMMU Hardware Info Types
* @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware
* info
* @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type
+ * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type
*/
enum iommu_hw_info_type {
IOMMU_HW_INFO_TYPE_NONE = 0,
IOMMU_HW_INFO_TYPE_INTEL_VTD = 1,
+ IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2,
};
/**
@@ -627,9 +714,11 @@ struct iommu_hwpt_get_dirty_bitmap {
* enum iommu_hwpt_invalidate_data_type - IOMMU HWPT Cache Invalidation
* Data Type
* @IOMMU_HWPT_INVALIDATE_DATA_VTD_S1: Invalidation data for VTD_S1
+ * @IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3: Invalidation data for ARM SMMUv3
*/
enum iommu_hwpt_invalidate_data_type {
IOMMU_HWPT_INVALIDATE_DATA_VTD_S1 = 0,
+ IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3 = 1,
};
/**
@@ -669,9 +758,31 @@ struct iommu_hwpt_vtd_s1_invalidate {
};
/**
+ * struct iommu_viommu_arm_smmuv3_invalidate - ARM SMMUv3 cahce invalidation
+ * (IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3)
+ * @cmd: 128-bit cache invalidation command that runs in SMMU CMDQ.
+ * Must be little-endian.
+ *
+ * Supported command list only when passing in a vIOMMU via @hwpt_id:
+ * CMDQ_OP_TLBI_NSNH_ALL
+ * CMDQ_OP_TLBI_NH_VA
+ * CMDQ_OP_TLBI_NH_VAA
+ * CMDQ_OP_TLBI_NH_ALL
+ * CMDQ_OP_TLBI_NH_ASID
+ * CMDQ_OP_ATC_INV
+ * CMDQ_OP_CFGI_CD
+ * CMDQ_OP_CFGI_CD_ALL
+ *
+ * -EIO will be returned if the command is not supported.
+ */
+struct iommu_viommu_arm_smmuv3_invalidate {
+ __aligned_le64 cmd[2];
+};
+
+/**
* struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
* @size: sizeof(struct iommu_hwpt_invalidate)
- * @hwpt_id: ID of a nested HWPT for cache invalidation
+ * @hwpt_id: ID of a nested HWPT or a vIOMMU, for cache invalidation
* @data_uptr: User pointer to an array of driver-specific cache invalidation
* data.
* @data_type: One of enum iommu_hwpt_invalidate_data_type, defining the data
@@ -682,8 +793,11 @@ struct iommu_hwpt_vtd_s1_invalidate {
* Output the number of requests successfully handled by kernel.
* @__reserved: Must be 0.
*
- * Invalidate the iommu cache for user-managed page table. Modifications on a
- * user-managed page table should be followed by this operation to sync cache.
+ * Invalidate iommu cache for user-managed page table or vIOMMU. Modifications
+ * on a user-managed page table should be followed by this operation, if a HWPT
+ * is passed in via @hwpt_id. Other caches, such as device cache or descriptor
+ * cache can be flushed if a vIOMMU is passed in via the @hwpt_id field.
+ *
* Each ioctl can support one or more cache invalidation requests in the array
* that has a total size of @entry_len * @entry_num.
*
@@ -797,4 +911,88 @@ struct iommu_fault_alloc {
__u32 out_fault_fd;
};
#define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC)
+
+/**
+ * enum iommu_viommu_type - Virtual IOMMU Type
+ * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use
+ * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type
+ */
+enum iommu_viommu_type {
+ IOMMU_VIOMMU_TYPE_DEFAULT = 0,
+ IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1,
+};
+
+/**
+ * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC)
+ * @size: sizeof(struct iommu_viommu_alloc)
+ * @flags: Must be 0
+ * @type: Type of the virtual IOMMU. Must be defined in enum iommu_viommu_type
+ * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU
+ * @hwpt_id: ID of a nesting parent HWPT to associate to
+ * @out_viommu_id: Output virtual IOMMU ID for the allocated object
+ *
+ * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's
+ * virtualization support that is a security-isolated slice of the real IOMMU HW
+ * that is unique to a specific VM. Operations global to the IOMMU are connected
+ * to the vIOMMU, such as:
+ * - Security namespace for guest owned ID, e.g. guest-controlled cache tags
+ * - Non-device-affiliated event reporting, e.g. invalidation queue errors
+ * - Access to a sharable nesting parent pagetable across physical IOMMUs
+ * - Virtualization of various platforms IDs, e.g. RIDs and others
+ * - Delivery of paravirtualized invalidation
+ * - Direct assigned invalidation queues
+ * - Direct assigned interrupts
+ */
+struct iommu_viommu_alloc {
+ __u32 size;
+ __u32 flags;
+ __u32 type;
+ __u32 dev_id;
+ __u32 hwpt_id;
+ __u32 out_viommu_id;
+};
+#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC)
+
+/**
+ * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC)
+ * @size: sizeof(struct iommu_vdevice_alloc)
+ * @viommu_id: vIOMMU ID to associate with the virtual device
+ * @dev_id: The physical device to allocate a virtual instance on the vIOMMU
+ * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTORY
+ * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID
+ * of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table
+ *
+ * Allocate a virtual device instance (for a physical device) against a vIOMMU.
+ * This instance holds the device's information (related to its vIOMMU) in a VM.
+ */
+struct iommu_vdevice_alloc {
+ __u32 size;
+ __u32 viommu_id;
+ __u32 dev_id;
+ __u32 out_vdevice_id;
+ __aligned_u64 virt_id;
+};
+#define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC)
+
+/**
+ * struct iommu_ioas_change_process - ioctl(VFIO_IOAS_CHANGE_PROCESS)
+ * @size: sizeof(struct iommu_ioas_change_process)
+ * @__reserved: Must be 0
+ *
+ * This transfers pinned memory counts for every memory map in every IOAS
+ * in the context to the current process. This only supports maps created
+ * with IOMMU_IOAS_MAP_FILE, and returns EINVAL if other maps are present.
+ * If the ioctl returns a failure status, then nothing is changed.
+ *
+ * This API is useful for transferring operation of a device from one process
+ * to another, such as during userland live update.
+ */
+struct iommu_ioas_change_process {
+ __u32 size;
+ __u32 __reserved;
+};
+
+#define IOMMU_IOAS_CHANGE_PROCESS \
+ _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)
+
#endif
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 2b68e6cdf190..c8dbf8219c4f 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -35,7 +35,7 @@
#define VFIO_EEH 5
/* Two-stage IOMMU */
-#define VFIO_TYPE1_NESTING_IOMMU 6 /* Implies v2 */
+#define __VFIO_RESERVED_TYPE1_NESTING_IOMMU 6 /* Implies v2 */
#define VFIO_SPAPR_TCE_v2_IOMMU 7
diff --git a/mm/gup.c b/mm/gup.c
index ad0c8922dac3..2b84eead9baa 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -3760,3 +3760,27 @@ err:
return ret;
}
EXPORT_SYMBOL_GPL(memfd_pin_folios);
+
+/**
+ * folio_add_pins() - add pins to an already-pinned folio
+ * @folio: the folio to add more pins to
+ * @pins: number of pins to add
+ *
+ * Try to add more pins to an already-pinned folio. The semantics
+ * of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot
+ * be changed.
+ *
+ * This function is helpful when having obtained a pin on a large folio
+ * using memfd_pin_folios(), but wanting to logically unpin parts
+ * (e.g., individual pages) of the folio later, for example, using
+ * unpin_user_page_range_dirty_lock().
+ *
+ * This is not the right interface to initially pin a folio.
+ */
+int folio_add_pins(struct folio *folio, unsigned int pins)
+{
+ VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio));
+
+ return try_grab_folio(folio, pins, FOLL_PIN);
+}
+EXPORT_SYMBOL_GPL(folio_add_pins);
diff --git a/tools/testing/selftests/iommu/Makefile b/tools/testing/selftests/iommu/Makefile
index fd6477911f24..84abeb2f0949 100644
--- a/tools/testing/selftests/iommu/Makefile
+++ b/tools/testing/selftests/iommu/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -Wall -O2 -Wno-unused-function
CFLAGS += $(KHDR_INCLUDES)
+LDLIBS += -lcap
TEST_GEN_PROGS :=
TEST_GEN_PROGS += iommufd
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 4927b9add5ad..a1b2b657999d 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */
+#include <asm/unistd.h>
#include <stdlib.h>
+#include <sys/capability.h>
#include <sys/mman.h>
#include <sys/eventfd.h>
@@ -49,6 +51,9 @@ static __attribute__((constructor)) void setup_sizes(void)
vrc = mmap(buffer, BUFFER_SIZE, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
assert(vrc == buffer);
+
+ mfd_buffer = memfd_mmap(BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
+ &mfd);
}
FIXTURE(iommufd)
@@ -128,6 +133,11 @@ TEST_F(iommufd, cmd_length)
TEST_LENGTH(iommu_ioas_unmap, IOMMU_IOAS_UNMAP, length);
TEST_LENGTH(iommu_option, IOMMU_OPTION, val64);
TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS, __reserved);
+ TEST_LENGTH(iommu_ioas_map_file, IOMMU_IOAS_MAP_FILE, iova);
+ TEST_LENGTH(iommu_viommu_alloc, IOMMU_VIOMMU_ALLOC, out_viommu_id);
+ TEST_LENGTH(iommu_vdevice_alloc, IOMMU_VDEVICE_ALLOC, virt_id);
+ TEST_LENGTH(iommu_ioas_change_process, IOMMU_IOAS_CHANGE_PROCESS,
+ __reserved);
#undef TEST_LENGTH
}
@@ -186,6 +196,144 @@ TEST_F(iommufd, global_options)
EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_OPTION, &cmd));
}
+static void drop_cap_ipc_lock(struct __test_metadata *_metadata)
+{
+ cap_t caps;
+ cap_value_t cap_list[1] = { CAP_IPC_LOCK };
+
+ caps = cap_get_proc();
+ ASSERT_NE(caps, NULL);
+ ASSERT_NE(-1,
+ cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR));
+ ASSERT_NE(-1, cap_set_proc(caps));
+ cap_free(caps);
+}
+
+static long get_proc_status_value(pid_t pid, const char *var)
+{
+ FILE *fp;
+ char buf[80], tag[80];
+ long val = -1;
+
+ snprintf(buf, sizeof(buf), "/proc/%d/status", pid);
+ fp = fopen(buf, "r");
+ if (!fp)
+ return val;
+
+ while (fgets(buf, sizeof(buf), fp))
+ if (fscanf(fp, "%s %ld\n", tag, &val) == 2 && !strcmp(tag, var))
+ break;
+
+ fclose(fp);
+ return val;
+}
+
+static long get_vm_pinned(pid_t pid)
+{
+ return get_proc_status_value(pid, "VmPin:");
+}
+
+static long get_vm_locked(pid_t pid)
+{
+ return get_proc_status_value(pid, "VmLck:");
+}
+
+FIXTURE(change_process)
+{
+ int fd;
+ uint32_t ioas_id;
+};
+
+FIXTURE_VARIANT(change_process)
+{
+ int accounting;
+};
+
+FIXTURE_SETUP(change_process)
+{
+ self->fd = open("/dev/iommu", O_RDWR);
+ ASSERT_NE(-1, self->fd);
+
+ drop_cap_ipc_lock(_metadata);
+ if (variant->accounting != IOPT_PAGES_ACCOUNT_NONE) {
+ struct iommu_option set_limit_cmd = {
+ .size = sizeof(set_limit_cmd),
+ .option_id = IOMMU_OPTION_RLIMIT_MODE,
+ .op = IOMMU_OPTION_OP_SET,
+ .val64 = (variant->accounting == IOPT_PAGES_ACCOUNT_MM),
+ };
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &set_limit_cmd));
+ }
+
+ test_ioctl_ioas_alloc(&self->ioas_id);
+ test_cmd_mock_domain(self->ioas_id, NULL, NULL, NULL);
+}
+
+FIXTURE_TEARDOWN(change_process)
+{
+ teardown_iommufd(self->fd, _metadata);
+}
+
+FIXTURE_VARIANT_ADD(change_process, account_none)
+{
+ .accounting = IOPT_PAGES_ACCOUNT_NONE,
+};
+
+FIXTURE_VARIANT_ADD(change_process, account_user)
+{
+ .accounting = IOPT_PAGES_ACCOUNT_USER,
+};
+
+FIXTURE_VARIANT_ADD(change_process, account_mm)
+{
+ .accounting = IOPT_PAGES_ACCOUNT_MM,
+};
+
+TEST_F(change_process, basic)
+{
+ pid_t parent = getpid();
+ pid_t child;
+ __u64 iova;
+ struct iommu_ioas_change_process cmd = {
+ .size = sizeof(cmd),
+ };
+
+ /* Expect failure if non-file maps exist */
+ test_ioctl_ioas_map(buffer, PAGE_SIZE, &iova);
+ EXPECT_ERRNO(EINVAL, ioctl(self->fd, IOMMU_IOAS_CHANGE_PROCESS, &cmd));
+ test_ioctl_ioas_unmap(iova, PAGE_SIZE);
+
+ /* Change process works in current process. */
+ test_ioctl_ioas_map_file(mfd, 0, PAGE_SIZE, &iova);
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_CHANGE_PROCESS, &cmd));
+
+ /* Change process works in another process */
+ child = fork();
+ if (!child) {
+ int nlock = PAGE_SIZE / 1024;
+
+ /* Parent accounts for locked memory before */
+ ASSERT_EQ(nlock, get_vm_pinned(parent));
+ if (variant->accounting == IOPT_PAGES_ACCOUNT_MM)
+ ASSERT_EQ(nlock, get_vm_locked(parent));
+ ASSERT_EQ(0, get_vm_pinned(getpid()));
+ ASSERT_EQ(0, get_vm_locked(getpid()));
+
+ ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_CHANGE_PROCESS, &cmd));
+
+ /* Child accounts for locked memory after */
+ ASSERT_EQ(0, get_vm_pinned(parent));
+ ASSERT_EQ(0, get_vm_locked(parent));
+ ASSERT_EQ(nlock, get_vm_pinned(getpid()));
+ if (variant->accounting == IOPT_PAGES_ACCOUNT_MM)
+ ASSERT_EQ(nlock, get_vm_locked(getpid()));
+
+ exit(0);
+ }
+ ASSERT_NE(-1, child);
+ ASSERT_EQ(child, waitpid(child, NULL, 0));
+}
+
FIXTURE(iommufd_ioas)
{
int fd;
@@ -220,6 +368,8 @@ FIXTURE_SETUP(iommufd_ioas)
for (i = 0; i != variant->mock_domains; i++) {
test_cmd_mock_domain(self->ioas_id, &self->stdev_id,
&self->hwpt_id, &self->device_id);
+ test_cmd_dev_check_cache_all(self->device_id,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
self->base_iova = MOCK_APERTURE_START;
}
}
@@ -360,9 +510,9 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
EXPECT_ERRNO(EBUSY,
_test_ioctl_destroy(self->fd, parent_hwpt_id));
- /* hwpt_invalidate only supports a user-managed hwpt (nested) */
+ /* hwpt_invalidate does not support a parent hwpt */
num_inv = 1;
- test_err_hwpt_invalidate(ENOENT, parent_hwpt_id, inv_reqs,
+ test_err_hwpt_invalidate(EINVAL, parent_hwpt_id, inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
assert(!num_inv);
@@ -1372,6 +1522,7 @@ FIXTURE_VARIANT(iommufd_mock_domain)
{
unsigned int mock_domains;
bool hugepages;
+ bool file;
};
FIXTURE_SETUP(iommufd_mock_domain)
@@ -1384,9 +1535,12 @@ FIXTURE_SETUP(iommufd_mock_domain)
ASSERT_GE(ARRAY_SIZE(self->hwpt_ids), variant->mock_domains);
- for (i = 0; i != variant->mock_domains; i++)
+ for (i = 0; i != variant->mock_domains; i++) {
test_cmd_mock_domain(self->ioas_id, &self->stdev_ids[i],
&self->hwpt_ids[i], &self->idev_ids[i]);
+ test_cmd_dev_check_cache_all(self->idev_ids[0],
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+ }
self->hwpt_id = self->hwpt_ids[0];
self->mmap_flags = MAP_SHARED | MAP_ANONYMOUS;
@@ -1410,26 +1564,45 @@ FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain)
{
.mock_domains = 1,
.hugepages = false,
+ .file = false,
};
FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains)
{
.mock_domains = 2,
.hugepages = false,
+ .file = false,
};
FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain_hugepage)
{
.mock_domains = 1,
.hugepages = true,
+ .file = false,
};
FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains_hugepage)
{
.mock_domains = 2,
.hugepages = true,
+ .file = false,
};
+FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain_file)
+{
+ .mock_domains = 1,
+ .hugepages = false,
+ .file = true,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain_file_hugepage)
+{
+ .mock_domains = 1,
+ .hugepages = true,
+ .file = true,
+};
+
+
/* Have the kernel check that the user pages made it to the iommu_domain */
#define check_mock_iova(_ptr, _iova, _length) \
({ \
@@ -1455,7 +1628,10 @@ FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains_hugepage)
} \
})
-TEST_F(iommufd_mock_domain, basic)
+static void
+test_basic_mmap(struct __test_metadata *_metadata,
+ struct _test_data_iommufd_mock_domain *self,
+ const struct _fixture_variant_iommufd_mock_domain *variant)
{
size_t buf_size = self->mmap_buf_size;
uint8_t *buf;
@@ -1478,6 +1654,40 @@ TEST_F(iommufd_mock_domain, basic)
test_err_ioctl_ioas_map(EFAULT, buf, buf_size, &iova);
}
+static void
+test_basic_file(struct __test_metadata *_metadata,
+ struct _test_data_iommufd_mock_domain *self,
+ const struct _fixture_variant_iommufd_mock_domain *variant)
+{
+ size_t buf_size = self->mmap_buf_size;
+ uint8_t *buf;
+ __u64 iova;
+ int mfd_tmp;
+ int prot = PROT_READ | PROT_WRITE;
+
+ /* Simple one page map */
+ test_ioctl_ioas_map_file(mfd, 0, PAGE_SIZE, &iova);
+ check_mock_iova(mfd_buffer, iova, PAGE_SIZE);
+
+ buf = memfd_mmap(buf_size, prot, MAP_SHARED, &mfd_tmp);
+ ASSERT_NE(MAP_FAILED, buf);
+
+ test_err_ioctl_ioas_map_file(EINVAL, mfd_tmp, 0, buf_size + 1, &iova);
+
+ ASSERT_EQ(0, ftruncate(mfd_tmp, 0));
+ test_err_ioctl_ioas_map_file(EINVAL, mfd_tmp, 0, buf_size, &iova);
+
+ close(mfd_tmp);
+}
+
+TEST_F(iommufd_mock_domain, basic)
+{
+ if (variant->file)
+ test_basic_file(_metadata, self, variant);
+ else
+ test_basic_mmap(_metadata, self, variant);
+}
+
TEST_F(iommufd_mock_domain, ro_unshare)
{
uint8_t *buf;
@@ -1513,9 +1723,13 @@ TEST_F(iommufd_mock_domain, all_aligns)
unsigned int start;
unsigned int end;
uint8_t *buf;
+ int prot = PROT_READ | PROT_WRITE;
+ int mfd;
- buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, self->mmap_flags, -1,
- 0);
+ if (variant->file)
+ buf = memfd_mmap(buf_size, prot, MAP_SHARED, &mfd);
+ else
+ buf = mmap(0, buf_size, prot, self->mmap_flags, -1, 0);
ASSERT_NE(MAP_FAILED, buf);
check_refs(buf, buf_size, 0);
@@ -1532,7 +1746,12 @@ TEST_F(iommufd_mock_domain, all_aligns)
size_t length = end - start;
__u64 iova;
- test_ioctl_ioas_map(buf + start, length, &iova);
+ if (variant->file) {
+ test_ioctl_ioas_map_file(mfd, start, length,
+ &iova);
+ } else {
+ test_ioctl_ioas_map(buf + start, length, &iova);
+ }
check_mock_iova(buf + start, iova, length);
check_refs(buf + start / PAGE_SIZE * PAGE_SIZE,
end / PAGE_SIZE * PAGE_SIZE -
@@ -1544,6 +1763,8 @@ TEST_F(iommufd_mock_domain, all_aligns)
}
check_refs(buf, buf_size, 0);
ASSERT_EQ(0, munmap(buf, buf_size));
+ if (variant->file)
+ close(mfd);
}
TEST_F(iommufd_mock_domain, all_aligns_copy)
@@ -1554,9 +1775,13 @@ TEST_F(iommufd_mock_domain, all_aligns_copy)
unsigned int start;
unsigned int end;
uint8_t *buf;
+ int prot = PROT_READ | PROT_WRITE;
+ int mfd;
- buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, self->mmap_flags, -1,
- 0);
+ if (variant->file)
+ buf = memfd_mmap(buf_size, prot, MAP_SHARED, &mfd);
+ else
+ buf = mmap(0, buf_size, prot, self->mmap_flags, -1, 0);
ASSERT_NE(MAP_FAILED, buf);
check_refs(buf, buf_size, 0);
@@ -1575,7 +1800,12 @@ TEST_F(iommufd_mock_domain, all_aligns_copy)
uint32_t mock_stdev_id;
__u64 iova;
- test_ioctl_ioas_map(buf + start, length, &iova);
+ if (variant->file) {
+ test_ioctl_ioas_map_file(mfd, start, length,
+ &iova);
+ } else {
+ test_ioctl_ioas_map(buf + start, length, &iova);
+ }
/* Add and destroy a domain while the area exists */
old_id = self->hwpt_ids[1];
@@ -1596,15 +1826,18 @@ TEST_F(iommufd_mock_domain, all_aligns_copy)
}
check_refs(buf, buf_size, 0);
ASSERT_EQ(0, munmap(buf, buf_size));
+ if (variant->file)
+ close(mfd);
}
TEST_F(iommufd_mock_domain, user_copy)
{
+ void *buf = variant->file ? mfd_buffer : buffer;
struct iommu_test_cmd access_cmd = {
.size = sizeof(access_cmd),
.op = IOMMU_TEST_OP_ACCESS_PAGES,
.access_pages = { .length = BUFFER_SIZE,
- .uptr = (uintptr_t)buffer },
+ .uptr = (uintptr_t)buf },
};
struct iommu_ioas_copy copy_cmd = {
.size = sizeof(copy_cmd),
@@ -1623,9 +1856,13 @@ TEST_F(iommufd_mock_domain, user_copy)
/* Pin the pages in an IOAS with no domains then copy to an IOAS with domains */
test_ioctl_ioas_alloc(&ioas_id);
- test_ioctl_ioas_map_id(ioas_id, buffer, BUFFER_SIZE,
- &copy_cmd.src_iova);
-
+ if (variant->file) {
+ test_ioctl_ioas_map_id_file(ioas_id, mfd, 0, BUFFER_SIZE,
+ &copy_cmd.src_iova);
+ } else {
+ test_ioctl_ioas_map_id(ioas_id, buf, BUFFER_SIZE,
+ &copy_cmd.src_iova);
+ }
test_cmd_create_access(ioas_id, &access_cmd.id,
MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES);
@@ -1635,12 +1872,17 @@ TEST_F(iommufd_mock_domain, user_copy)
&access_cmd));
copy_cmd.src_ioas_id = ioas_id;
ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, &copy_cmd));
- check_mock_iova(buffer, MOCK_APERTURE_START, BUFFER_SIZE);
+ check_mock_iova(buf, MOCK_APERTURE_START, BUFFER_SIZE);
/* Now replace the ioas with a new one */
test_ioctl_ioas_alloc(&new_ioas_id);
- test_ioctl_ioas_map_id(new_ioas_id, buffer, BUFFER_SIZE,
- &copy_cmd.src_iova);
+ if (variant->file) {
+ test_ioctl_ioas_map_id_file(new_ioas_id, mfd, 0, BUFFER_SIZE,
+ &copy_cmd.src_iova);
+ } else {
+ test_ioctl_ioas_map_id(new_ioas_id, buf, BUFFER_SIZE,
+ &copy_cmd.src_iova);
+ }
test_cmd_access_replace_ioas(access_cmd.id, new_ioas_id);
/* Destroy the old ioas and cleanup copied mapping */
@@ -1654,7 +1896,7 @@ TEST_F(iommufd_mock_domain, user_copy)
&access_cmd));
copy_cmd.src_ioas_id = new_ioas_id;
ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, &copy_cmd));
- check_mock_iova(buffer, MOCK_APERTURE_START, BUFFER_SIZE);
+ check_mock_iova(buf, MOCK_APERTURE_START, BUFFER_SIZE);
test_cmd_destroy_access_pages(
access_cmd.id, access_cmd.access_pages.out_access_pages_id);
@@ -2386,4 +2628,332 @@ TEST_F(vfio_compat_mock_domain, huge_map)
}
}
+FIXTURE(iommufd_viommu)
+{
+ int fd;
+ uint32_t ioas_id;
+ uint32_t stdev_id;
+ uint32_t hwpt_id;
+ uint32_t nested_hwpt_id;
+ uint32_t device_id;
+ uint32_t viommu_id;
+};
+
+FIXTURE_VARIANT(iommufd_viommu)
+{
+ unsigned int viommu;
+};
+
+FIXTURE_SETUP(iommufd_viommu)
+{
+ self->fd = open("/dev/iommu", O_RDWR);
+ ASSERT_NE(-1, self->fd);
+ test_ioctl_ioas_alloc(&self->ioas_id);
+ test_ioctl_set_default_memory_limit();
+
+ if (variant->viommu) {
+ struct iommu_hwpt_selftest data = {
+ .iotlb = IOMMU_TEST_IOTLB_DEFAULT,
+ };
+
+ test_cmd_mock_domain(self->ioas_id, &self->stdev_id, NULL,
+ &self->device_id);
+
+ /* Allocate a nesting parent hwpt */
+ test_cmd_hwpt_alloc(self->device_id, self->ioas_id,
+ IOMMU_HWPT_ALLOC_NEST_PARENT,
+ &self->hwpt_id);
+
+ /* Allocate a vIOMMU taking refcount of the parent hwpt */
+ test_cmd_viommu_alloc(self->device_id, self->hwpt_id,
+ IOMMU_VIOMMU_TYPE_SELFTEST,
+ &self->viommu_id);
+
+ /* Allocate a regular nested hwpt */
+ test_cmd_hwpt_alloc_nested(self->device_id, self->viommu_id, 0,
+ &self->nested_hwpt_id,
+ IOMMU_HWPT_DATA_SELFTEST, &data,
+ sizeof(data));
+ }
+}
+
+FIXTURE_TEARDOWN(iommufd_viommu)
+{
+ teardown_iommufd(self->fd, _metadata);
+}
+
+FIXTURE_VARIANT_ADD(iommufd_viommu, no_viommu)
+{
+ .viommu = 0,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_viommu, mock_viommu)
+{
+ .viommu = 1,
+};
+
+TEST_F(iommufd_viommu, viommu_auto_destroy)
+{
+}
+
+TEST_F(iommufd_viommu, viommu_negative_tests)
+{
+ uint32_t device_id = self->device_id;
+ uint32_t ioas_id = self->ioas_id;
+ uint32_t hwpt_id;
+
+ if (self->device_id) {
+ /* Negative test -- invalid hwpt (hwpt_id=0) */
+ test_err_viommu_alloc(ENOENT, device_id, 0,
+ IOMMU_VIOMMU_TYPE_SELFTEST, NULL);
+
+ /* Negative test -- not a nesting parent hwpt */
+ test_cmd_hwpt_alloc(device_id, ioas_id, 0, &hwpt_id);
+ test_err_viommu_alloc(EINVAL, device_id, hwpt_id,
+ IOMMU_VIOMMU_TYPE_SELFTEST, NULL);
+ test_ioctl_destroy(hwpt_id);
+
+ /* Negative test -- unsupported viommu type */
+ test_err_viommu_alloc(EOPNOTSUPP, device_id, self->hwpt_id,
+ 0xdead, NULL);
+ EXPECT_ERRNO(EBUSY,
+ _test_ioctl_destroy(self->fd, self->hwpt_id));
+ EXPECT_ERRNO(EBUSY,
+ _test_ioctl_destroy(self->fd, self->viommu_id));
+ } else {
+ test_err_viommu_alloc(ENOENT, self->device_id, self->hwpt_id,
+ IOMMU_VIOMMU_TYPE_SELFTEST, NULL);
+ }
+}
+
+TEST_F(iommufd_viommu, viommu_alloc_nested_iopf)
+{
+ struct iommu_hwpt_selftest data = {
+ .iotlb = IOMMU_TEST_IOTLB_DEFAULT,
+ };
+ uint32_t viommu_id = self->viommu_id;
+ uint32_t dev_id = self->device_id;
+ uint32_t iopf_hwpt_id;
+ uint32_t fault_id;
+ uint32_t fault_fd;
+
+ if (self->device_id) {
+ test_ioctl_fault_alloc(&fault_id, &fault_fd);
+ test_err_hwpt_alloc_iopf(
+ ENOENT, dev_id, viommu_id, UINT32_MAX,
+ IOMMU_HWPT_FAULT_ID_VALID, &iopf_hwpt_id,
+ IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data));
+ test_err_hwpt_alloc_iopf(
+ EOPNOTSUPP, dev_id, viommu_id, fault_id,
+ IOMMU_HWPT_FAULT_ID_VALID | (1 << 31), &iopf_hwpt_id,
+ IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data));
+ test_cmd_hwpt_alloc_iopf(
+ dev_id, viommu_id, fault_id, IOMMU_HWPT_FAULT_ID_VALID,
+ &iopf_hwpt_id, IOMMU_HWPT_DATA_SELFTEST, &data,
+ sizeof(data));
+
+ test_cmd_mock_domain_replace(self->stdev_id, iopf_hwpt_id);
+ EXPECT_ERRNO(EBUSY,
+ _test_ioctl_destroy(self->fd, iopf_hwpt_id));
+ test_cmd_trigger_iopf(dev_id, fault_fd);
+
+ test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id);
+ test_ioctl_destroy(iopf_hwpt_id);
+ close(fault_fd);
+ test_ioctl_destroy(fault_id);
+ }
+}
+
+TEST_F(iommufd_viommu, vdevice_alloc)
+{
+ uint32_t viommu_id = self->viommu_id;
+ uint32_t dev_id = self->device_id;
+ uint32_t vdev_id = 0;
+
+ if (dev_id) {
+ /* Set vdev_id to 0x99, unset it, and set to 0x88 */
+ test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id);
+ test_err_vdevice_alloc(EEXIST, viommu_id, dev_id, 0x99,
+ &vdev_id);
+ test_ioctl_destroy(vdev_id);
+ test_cmd_vdevice_alloc(viommu_id, dev_id, 0x88, &vdev_id);
+ test_ioctl_destroy(vdev_id);
+ } else {
+ test_err_vdevice_alloc(ENOENT, viommu_id, dev_id, 0x99, NULL);
+ }
+}
+
+TEST_F(iommufd_viommu, vdevice_cache)
+{
+ struct iommu_viommu_invalidate_selftest inv_reqs[2] = {};
+ uint32_t viommu_id = self->viommu_id;
+ uint32_t dev_id = self->device_id;
+ uint32_t vdev_id = 0;
+ uint32_t num_inv;
+
+ if (dev_id) {
+ test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id);
+
+ test_cmd_dev_check_cache_all(dev_id,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+
+ /* Check data_type by passing zero-length array */
+ num_inv = 0;
+ test_cmd_viommu_invalidate(viommu_id, inv_reqs,
+ sizeof(*inv_reqs), &num_inv);
+ assert(!num_inv);
+
+ /* Negative test: Invalid data_type */
+ num_inv = 1;
+ test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID,
+ sizeof(*inv_reqs), &num_inv);
+ assert(!num_inv);
+
+ /* Negative test: structure size sanity */
+ num_inv = 1;
+ test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ sizeof(*inv_reqs) + 1, &num_inv);
+ assert(!num_inv);
+
+ num_inv = 1;
+ test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ 1, &num_inv);
+ assert(!num_inv);
+
+ /* Negative test: invalid flag is passed */
+ num_inv = 1;
+ inv_reqs[0].flags = 0xffffffff;
+ inv_reqs[0].vdev_id = 0x99;
+ test_err_viommu_invalidate(EOPNOTSUPP, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ sizeof(*inv_reqs), &num_inv);
+ assert(!num_inv);
+
+ /* Negative test: invalid data_uptr when array is not empty */
+ num_inv = 1;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x99;
+ test_err_viommu_invalidate(EINVAL, viommu_id, NULL,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ sizeof(*inv_reqs), &num_inv);
+ assert(!num_inv);
+
+ /* Negative test: invalid entry_len when array is not empty */
+ num_inv = 1;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x99;
+ test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ 0, &num_inv);
+ assert(!num_inv);
+
+ /* Negative test: invalid cache_id */
+ num_inv = 1;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x99;
+ inv_reqs[0].cache_id = MOCK_DEV_CACHE_ID_MAX + 1;
+ test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ sizeof(*inv_reqs), &num_inv);
+ assert(!num_inv);
+
+ /* Negative test: invalid vdev_id */
+ num_inv = 1;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x9;
+ inv_reqs[0].cache_id = 0;
+ test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ sizeof(*inv_reqs), &num_inv);
+ assert(!num_inv);
+
+ /*
+ * Invalidate the 1st cache entry but fail the 2nd request
+ * due to invalid flags configuration in the 2nd request.
+ */
+ num_inv = 2;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x99;
+ inv_reqs[0].cache_id = 0;
+ inv_reqs[1].flags = 0xffffffff;
+ inv_reqs[1].vdev_id = 0x99;
+ inv_reqs[1].cache_id = 1;
+ test_err_viommu_invalidate(EOPNOTSUPP, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ sizeof(*inv_reqs), &num_inv);
+ assert(num_inv == 1);
+ test_cmd_dev_check_cache(dev_id, 0, 0);
+ test_cmd_dev_check_cache(dev_id, 1,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+ test_cmd_dev_check_cache(dev_id, 2,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+ test_cmd_dev_check_cache(dev_id, 3,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+
+ /*
+ * Invalidate the 1st cache entry but fail the 2nd request
+ * due to invalid cache_id configuration in the 2nd request.
+ */
+ num_inv = 2;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x99;
+ inv_reqs[0].cache_id = 0;
+ inv_reqs[1].flags = 0;
+ inv_reqs[1].vdev_id = 0x99;
+ inv_reqs[1].cache_id = MOCK_DEV_CACHE_ID_MAX + 1;
+ test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST,
+ sizeof(*inv_reqs), &num_inv);
+ assert(num_inv == 1);
+ test_cmd_dev_check_cache(dev_id, 0, 0);
+ test_cmd_dev_check_cache(dev_id, 1,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+ test_cmd_dev_check_cache(dev_id, 2,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+ test_cmd_dev_check_cache(dev_id, 3,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+
+ /* Invalidate the 2nd cache entry and verify */
+ num_inv = 1;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x99;
+ inv_reqs[0].cache_id = 1;
+ test_cmd_viommu_invalidate(viommu_id, inv_reqs,
+ sizeof(*inv_reqs), &num_inv);
+ assert(num_inv == 1);
+ test_cmd_dev_check_cache(dev_id, 0, 0);
+ test_cmd_dev_check_cache(dev_id, 1, 0);
+ test_cmd_dev_check_cache(dev_id, 2,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+ test_cmd_dev_check_cache(dev_id, 3,
+ IOMMU_TEST_DEV_CACHE_DEFAULT);
+
+ /* Invalidate the 3rd and 4th cache entries and verify */
+ num_inv = 2;
+ inv_reqs[0].flags = 0;
+ inv_reqs[0].vdev_id = 0x99;
+ inv_reqs[0].cache_id = 2;
+ inv_reqs[1].flags = 0;
+ inv_reqs[1].vdev_id = 0x99;
+ inv_reqs[1].cache_id = 3;
+ test_cmd_viommu_invalidate(viommu_id, inv_reqs,
+ sizeof(*inv_reqs), &num_inv);
+ assert(num_inv == 2);
+ test_cmd_dev_check_cache_all(dev_id, 0);
+
+ /* Invalidate all cache entries for nested_dev_id[1] and verify */
+ num_inv = 1;
+ inv_reqs[0].vdev_id = 0x99;
+ inv_reqs[0].flags = IOMMU_TEST_INVALIDATE_FLAG_ALL;
+ test_cmd_viommu_invalidate(viommu_id, inv_reqs,
+ sizeof(*inv_reqs), &num_inv);
+ assert(num_inv == 1);
+ test_cmd_dev_check_cache_all(dev_id, 0);
+ test_ioctl_destroy(vdev_id);
+ }
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c
index c5d5e69452b0..22f6fd5f0f74 100644
--- a/tools/testing/selftests/iommu/iommufd_fail_nth.c
+++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c
@@ -47,6 +47,9 @@ static __attribute__((constructor)) void setup_buffer(void)
buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+
+ mfd_buffer = memfd_mmap(BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
+ &mfd);
}
/*
@@ -331,6 +334,42 @@ TEST_FAIL_NTH(basic_fail_nth, map_domain)
return 0;
}
+/* iopt_area_fill_domains() and iopt_area_fill_domain() */
+TEST_FAIL_NTH(basic_fail_nth, map_file_domain)
+{
+ uint32_t ioas_id;
+ __u32 stdev_id;
+ __u32 hwpt_id;
+ __u64 iova;
+
+ self->fd = open("/dev/iommu", O_RDWR);
+ if (self->fd == -1)
+ return -1;
+
+ if (_test_ioctl_ioas_alloc(self->fd, &ioas_id))
+ return -1;
+
+ if (_test_ioctl_set_temp_memory_limit(self->fd, 32))
+ return -1;
+
+ fail_nth_enable();
+
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+ return -1;
+
+ if (_test_ioctl_ioas_map_file(self->fd, ioas_id, mfd, 0, 262144, &iova,
+ IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE))
+ return -1;
+
+ if (_test_ioctl_destroy(self->fd, stdev_id))
+ return -1;
+
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+ return -1;
+ return 0;
+}
+
TEST_FAIL_NTH(basic_fail_nth, map_two_domains)
{
uint32_t ioas_id;
@@ -582,6 +621,8 @@ TEST_FAIL_NTH(basic_fail_nth, device)
uint32_t stdev_id;
uint32_t idev_id;
uint32_t hwpt_id;
+ uint32_t viommu_id;
+ uint32_t vdev_id;
__u64 iova;
self->fd = open("/dev/iommu", O_RDWR);
@@ -624,6 +665,19 @@ TEST_FAIL_NTH(basic_fail_nth, device)
if (_test_cmd_mock_domain_replace(self->fd, stdev_id, hwpt_id, NULL))
return -1;
+
+ if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0,
+ IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id,
+ IOMMU_HWPT_DATA_NONE, 0, 0))
+ return -1;
+
+ if (_test_cmd_viommu_alloc(self->fd, idev_id, hwpt_id,
+ IOMMU_VIOMMU_TYPE_SELFTEST, 0, &viommu_id))
+ return -1;
+
+ if (_test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, 0, &vdev_id))
+ return -1;
+
return 0;
}
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index 40f6f14ce136..d979f5b0efe8 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -22,6 +22,12 @@
#define BIT_MASK(nr) (1UL << ((nr) % __BITS_PER_LONG))
#define BIT_WORD(nr) ((nr) / __BITS_PER_LONG)
+enum {
+ IOPT_PAGES_ACCOUNT_NONE = 0,
+ IOPT_PAGES_ACCOUNT_USER = 1,
+ IOPT_PAGES_ACCOUNT_MM = 2,
+};
+
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
static inline void set_bit(unsigned int nr, unsigned long *addr)
@@ -40,12 +46,28 @@ static inline bool test_bit(unsigned int nr, unsigned long *addr)
static void *buffer;
static unsigned long BUFFER_SIZE;
+static void *mfd_buffer;
+static int mfd;
+
static unsigned long PAGE_SIZE;
#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
#define offsetofend(TYPE, MEMBER) \
(offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))
+static inline void *memfd_mmap(size_t length, int prot, int flags, int *mfd_p)
+{
+ int mfd_flags = (flags & MAP_HUGETLB) ? MFD_HUGETLB : 0;
+ int mfd = memfd_create("buffer", mfd_flags);
+
+ if (mfd <= 0)
+ return MAP_FAILED;
+ if (ftruncate(mfd, length))
+ return MAP_FAILED;
+ *mfd_p = mfd;
+ return mmap(0, length, prot, flags, mfd, 0);
+}
+
/*
* Have the kernel check the refcount on pages. I don't know why a freshly
* mmap'd anon non-compound page starts out with a ref of 3
@@ -234,6 +256,30 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, __u32 ft_i
test_cmd_hwpt_check_iotlb(hwpt_id, i, expected); \
})
+#define test_cmd_dev_check_cache(device_id, cache_id, expected) \
+ ({ \
+ struct iommu_test_cmd test_cmd = { \
+ .size = sizeof(test_cmd), \
+ .op = IOMMU_TEST_OP_DEV_CHECK_CACHE, \
+ .id = device_id, \
+ .check_dev_cache = { \
+ .id = cache_id, \
+ .cache = expected, \
+ }, \
+ }; \
+ ASSERT_EQ(0, ioctl(self->fd, \
+ _IOMMU_TEST_CMD( \
+ IOMMU_TEST_OP_DEV_CHECK_CACHE), \
+ &test_cmd)); \
+ })
+
+#define test_cmd_dev_check_cache_all(device_id, expected) \
+ ({ \
+ int c; \
+ for (c = 0; c < MOCK_DEV_CACHE_NUM; c++) \
+ test_cmd_dev_check_cache(device_id, c, expected); \
+ })
+
static int _test_cmd_hwpt_invalidate(int fd, __u32 hwpt_id, void *reqs,
uint32_t data_type, uint32_t lreq,
uint32_t *nreqs)
@@ -265,6 +311,38 @@ static int _test_cmd_hwpt_invalidate(int fd, __u32 hwpt_id, void *reqs,
data_type, lreq, nreqs)); \
})
+static int _test_cmd_viommu_invalidate(int fd, __u32 viommu_id, void *reqs,
+ uint32_t data_type, uint32_t lreq,
+ uint32_t *nreqs)
+{
+ struct iommu_hwpt_invalidate cmd = {
+ .size = sizeof(cmd),
+ .hwpt_id = viommu_id,
+ .data_type = data_type,
+ .data_uptr = (uint64_t)reqs,
+ .entry_len = lreq,
+ .entry_num = *nreqs,
+ };
+ int rc = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cmd);
+ *nreqs = cmd.entry_num;
+ return rc;
+}
+
+#define test_cmd_viommu_invalidate(viommu, reqs, lreq, nreqs) \
+ ({ \
+ ASSERT_EQ(0, \
+ _test_cmd_viommu_invalidate(self->fd, viommu, reqs, \
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, \
+ lreq, nreqs)); \
+ })
+#define test_err_viommu_invalidate(_errno, viommu_id, reqs, data_type, lreq, \
+ nreqs) \
+ ({ \
+ EXPECT_ERRNO(_errno, _test_cmd_viommu_invalidate( \
+ self->fd, viommu_id, reqs, \
+ data_type, lreq, nreqs)); \
+ })
+
static int _test_cmd_access_replace_ioas(int fd, __u32 access_id,
unsigned int ioas_id)
{
@@ -589,6 +667,47 @@ static int _test_ioctl_ioas_unmap(int fd, unsigned int ioas_id, uint64_t iova,
EXPECT_ERRNO(_errno, _test_ioctl_ioas_unmap(self->fd, self->ioas_id, \
iova, length, NULL))
+static int _test_ioctl_ioas_map_file(int fd, unsigned int ioas_id, int mfd,
+ size_t start, size_t length, __u64 *iova,
+ unsigned int flags)
+{
+ struct iommu_ioas_map_file cmd = {
+ .size = sizeof(cmd),
+ .flags = flags,
+ .ioas_id = ioas_id,
+ .fd = mfd,
+ .start = start,
+ .length = length,
+ };
+ int ret;
+
+ if (flags & IOMMU_IOAS_MAP_FIXED_IOVA)
+ cmd.iova = *iova;
+
+ ret = ioctl(fd, IOMMU_IOAS_MAP_FILE, &cmd);
+ *iova = cmd.iova;
+ return ret;
+}
+
+#define test_ioctl_ioas_map_file(mfd, start, length, iova_p) \
+ ASSERT_EQ(0, \
+ _test_ioctl_ioas_map_file( \
+ self->fd, self->ioas_id, mfd, start, length, iova_p, \
+ IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))
+
+#define test_err_ioctl_ioas_map_file(_errno, mfd, start, length, iova_p) \
+ EXPECT_ERRNO( \
+ _errno, \
+ _test_ioctl_ioas_map_file( \
+ self->fd, self->ioas_id, mfd, start, length, iova_p, \
+ IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))
+
+#define test_ioctl_ioas_map_id_file(ioas_id, mfd, start, length, iova_p) \
+ ASSERT_EQ(0, \
+ _test_ioctl_ioas_map_file( \
+ self->fd, ioas_id, mfd, start, length, iova_p, \
+ IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))
+
static int _test_ioctl_set_temp_memory_limit(int fd, unsigned int limit)
{
struct iommu_test_cmd memlimit_cmd = {
@@ -762,3 +881,58 @@ static int _test_cmd_trigger_iopf(int fd, __u32 device_id, __u32 fault_fd)
#define test_cmd_trigger_iopf(device_id, fault_fd) \
ASSERT_EQ(0, _test_cmd_trigger_iopf(self->fd, device_id, fault_fd))
+
+static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id,
+ __u32 type, __u32 flags, __u32 *viommu_id)
+{
+ struct iommu_viommu_alloc cmd = {
+ .size = sizeof(cmd),
+ .flags = flags,
+ .type = type,
+ .dev_id = device_id,
+ .hwpt_id = hwpt_id,
+ };
+ int ret;
+
+ ret = ioctl(fd, IOMMU_VIOMMU_ALLOC, &cmd);
+ if (ret)
+ return ret;
+ if (viommu_id)
+ *viommu_id = cmd.out_viommu_id;
+ return 0;
+}
+
+#define test_cmd_viommu_alloc(device_id, hwpt_id, type, viommu_id) \
+ ASSERT_EQ(0, _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, \
+ type, 0, viommu_id))
+#define test_err_viommu_alloc(_errno, device_id, hwpt_id, type, viommu_id) \
+ EXPECT_ERRNO(_errno, \
+ _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, \
+ type, 0, viommu_id))
+
+static int _test_cmd_vdevice_alloc(int fd, __u32 viommu_id, __u32 idev_id,
+ __u64 virt_id, __u32 *vdev_id)
+{
+ struct iommu_vdevice_alloc cmd = {
+ .size = sizeof(cmd),
+ .dev_id = idev_id,
+ .viommu_id = viommu_id,
+ .virt_id = virt_id,
+ };
+ int ret;
+
+ ret = ioctl(fd, IOMMU_VDEVICE_ALLOC, &cmd);
+ if (ret)
+ return ret;
+ if (vdev_id)
+ *vdev_id = cmd.out_vdevice_id;
+ return 0;
+}
+
+#define test_cmd_vdevice_alloc(viommu_id, idev_id, virt_id, vdev_id) \
+ ASSERT_EQ(0, _test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, \
+ virt_id, vdev_id))
+#define test_err_vdevice_alloc(_errno, viommu_id, idev_id, virt_id, vdev_id) \
+ EXPECT_ERRNO(_errno, \
+ _test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, \
+ virt_id, vdev_id))