Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost

Pull virtio, vhost updates from Michael Tsirkin: "Fixes, features, performance: - new iommu device - vhost guest memory access using vmap (just meta-data for now) - minor fixes" * tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: virtio-mmio: add error check for platform_get_irq scsi: virtio_scsi: Use struct_size() helper iommu/virtio: Add event queue iommu/virtio: Add probe request iommu: Add virtio-iommu driver PCI: OF: Initialize dev->fwnode appropriately of: Allow the iommu-map property to omit untranslated devices dt-bindings: virtio: Add virtio-pci-iommu node dt-bindings: virtio-mmio: Add IOMMU description vhost: fix clang build warning vhost: access vq metadata through kernel virtual address vhost: factor out setting vring addr and num vhost: introduce helpers to get the size of metadata area vhost: rename vq_iotlb_prefetch() to vq_meta_prefetch() vhost: fine grain userspace memory accessors vhost: generalize adding used elem
author: Linus Torvalds <torvalds@linux-foundation.org> 2019-07-17 11:26:09 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2019-07-17 11:26:09 -0700
commit: 3a1d5384b7decbff6519daa9c65a35665e227323 (patch)
tree: 7442f1b74d452d82d6702f8cd25173cc81c0c634 /drivers
parent: 37d4607ebbbf5d8b74cbcb9434a5ce6897a51864 (diff)
parent: 5e663f0410fa2f355042209154029842ba1abd43 (diff)
10 files changed, 1963 insertions, 131 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 83664db5221d..e15cdcd8cb3c 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -473,4 +473,15 @@ config HYPERV_IOMMU
 	  Stub IOMMU driver to handle IRQs as to allow Hyper-V Linux
 	  guests to run with x2APIC mode enabled.
 
+config VIRTIO_IOMMU
+	bool "Virtio IOMMU driver"
+	depends on VIRTIO=y
+	depends on ARM64
+	select IOMMU_API
+	select INTERVAL_TREE
+	help
+	  Para-virtualised IOMMU driver with virtio.
+
+	  Say Y here if you intend to run this kernel as a guest.
+
 endif # IOMMU_SUPPORT
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 8c71a15e986b..f13f36ae1af6 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -33,3 +33,4 @@ obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
 obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
 obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
 obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
+obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
new file mode 100644
index 000000000000..433f4d2ee956
--- /dev/null
+++ b/drivers/iommu/virtio-iommu.c
@@ -0,0 +1,1158 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Virtio driver for the paravirtualized IOMMU
+ *
+ * Copyright (C) 2018 Arm Limited
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/amba/bus.h>
+#include <linux/delay.h>
+#include <linux/dma-iommu.h>
+#include <linux/freezer.h>
+#include <linux/interval_tree.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/of_iommu.h>
+#include <linux/of_platform.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ids.h>
+#include <linux/wait.h>
+
+#include <uapi/linux/virtio_iommu.h>
+
+#define MSI_IOVA_BASE			0x8000000
+#define MSI_IOVA_LENGTH			0x100000
+
+#define VIOMMU_REQUEST_VQ		0
+#define VIOMMU_EVENT_VQ			1
+#define VIOMMU_NR_VQS			2
+
+/*
+ * Per-device state of one virtio-iommu instance. Allocated and filled in by
+ * viommu_probe().
+ */
+struct viommu_dev {
+	struct iommu_device		iommu;
+	struct device			*dev;
+	struct virtio_device		*vdev;
+
+	/* Allocator for the domain IDs assigned in viommu_domain_finalise() */
+	struct ida			domain_ids;
+
+	struct virtqueue		*vqs[VIOMMU_NR_VQS];
+	/* Must be held while adding to or draining the request queue */
+	spinlock_t			request_lock;
+	/* Requests added to the request virtqueue and not yet completed */
+	struct list_head		requests;
+	/* Array of event buffers posted to the event virtqueue */
+	void				*evts;
+
+	/* Device configuration */
+	struct iommu_domain_geometry	geometry;
+	u64				pgsize_bitmap;
+	u8				domain_bits;
+	u32				probe_size;
+};
+
+/*
+ * One mapping recorded in a domain's interval tree: the IOVA interval
+ * [iova.start, iova.last] (inclusive) maps to physical memory starting at
+ * @paddr, with the VIRTIO_IOMMU_MAP_F_* @flags computed in viommu_map().
+ */
+struct viommu_mapping {
+	phys_addr_t			paddr;
+	struct interval_tree_node	iova;
+	u32				flags;
+};
+
+/*
+ * Driver-side view of an IOMMU domain. @viommu and @id are only set once the
+ * domain has been finalised by the first viommu_attach_dev().
+ */
+struct viommu_domain {
+	struct iommu_domain		domain;
+	struct viommu_dev		*viommu;
+	struct mutex			mutex; /* protects viommu pointer */
+	unsigned int			id;
+
+	/* Interval tree of struct viommu_mapping, guarded by mappings_lock */
+	spinlock_t			mappings_lock;
+	struct rb_root_cached		mappings;
+
+	/* Number of endpoints currently attached to this domain */
+	unsigned long			nr_endpoints;
+};
+
+/*
+ * Per-endpoint data, stored in the endpoint's fwspec->iommu_priv by
+ * viommu_add_device().
+ */
+struct viommu_endpoint {
+	struct device			*dev;
+	struct viommu_dev		*viommu;
+	/* Domain the endpoint is attached to, set by viommu_attach_dev() */
+	struct viommu_domain		*vdomain;
+	/* Reserved regions collected from the PROBE request */
+	struct list_head		resv_regions;
+};
+
+/*
+ * A request queued on the request virtqueue.
+ *
+ * @list: entry in viommu_dev.requests
+ * @writeback: if non-NULL, where the device-written part of @buf is copied
+ *	when the request completes
+ * @write_offset: offset in @buf of the device-writable part; only set
+ *	together with @writeback
+ * @len: total size of @buf
+ * @buf: private copy of the request, handed to the device
+ */
+struct viommu_request {
+	struct list_head		list;
+	void				*writeback;
+	unsigned int			write_offset;
+	unsigned int			len;
+	char				buf[];
+};
+
+#define VIOMMU_FAULT_RESV_MASK		0xffffff00
+
+/*
+ * Buffer posted on the event virtqueue. @head overlays the first 32 bits of
+ * the fault structure; viommu_event_handler() tests it against
+ * VIOMMU_FAULT_RESV_MASK before treating the buffer as a fault.
+ */
+struct viommu_event {
+	union {
+		u32			head;
+		struct virtio_iommu_fault fault;
+	};
+};
+
+/* Get the viommu_domain that embeds the given struct iommu_domain */
+#define to_viommu_domain(domain)	\
+	container_of(domain, struct viommu_domain, domain)
+
+/*
+ * Convert the status stored in the tail of a request buffer into an errno.
+ *
+ * @buf: request buffer
+ * @len: size of the request buffer; the tail is its last sizeof(*tail) bytes
+ *
+ * Return 0 for VIRTIO_IOMMU_S_OK, otherwise a negative errno. Statuses that
+ * are not recognised are reported as -EIO.
+ */
+static int viommu_get_req_errno(void *buf, size_t len)
+{
+	struct virtio_iommu_req_tail *tail;
+	int err;
+
+	tail = buf + len - sizeof(*tail);
+
+	switch (tail->status) {
+	case VIRTIO_IOMMU_S_OK:
+		err = 0;
+		break;
+	case VIRTIO_IOMMU_S_UNSUPP:
+		err = -ENOSYS;
+		break;
+	case VIRTIO_IOMMU_S_INVAL:
+		err = -EINVAL;
+		break;
+	case VIRTIO_IOMMU_S_RANGE:
+		err = -ERANGE;
+		break;
+	case VIRTIO_IOMMU_S_NOENT:
+		err = -ENOENT;
+		break;
+	case VIRTIO_IOMMU_S_FAULT:
+		err = -EFAULT;
+		break;
+	case VIRTIO_IOMMU_S_IOERR:
+	case VIRTIO_IOMMU_S_DEVERR:
+	default:
+		err = -EIO;
+		break;
+	}
+
+	return err;
+}
+
+/* Overwrite the status field in the tail (last bytes) of a request buffer */
+static void viommu_set_req_status(void *buf, size_t len, int status)
+{
+	struct virtio_iommu_req_tail *tail;
+
+	tail = buf + len - sizeof(*tail);
+	tail->status = status;
+}
+
+/*
+ * Compute where the device-writable part of a request starts.
+ *
+ * The device-writable part is the request tail, preceded by probe_size bytes
+ * of properties in the case of a PROBE request.
+ */
+static off_t viommu_get_write_desc_offset(struct viommu_dev *viommu,
+					  struct virtio_iommu_req_head *req,
+					  size_t len)
+{
+	size_t device_writable = sizeof(struct virtio_iommu_req_tail);
+
+	if (req->type == VIRTIO_IOMMU_T_PROBE)
+		device_writable += viommu->probe_size;
+
+	return len - device_writable;
+}
+
+/*
+ * __viommu_sync_req - Complete all in-flight requests
+ *
+ * Wait for all added requests to complete. When this function returns, all
+ * requests that were in-flight at the time of the call have completed.
+ *
+ * The caller must hold viommu->request_lock. As written, this function polls
+ * the virtqueue until viommu->requests is empty and always returns 0.
+ */
+static int __viommu_sync_req(struct viommu_dev *viommu)
+{
+	int ret = 0;
+	unsigned int len;
+	size_t write_len;
+	struct viommu_request *req;
+	struct virtqueue *vq = viommu->vqs[VIOMMU_REQUEST_VQ];
+
+	assert_spin_locked(&viommu->request_lock);
+
+	virtqueue_kick(vq);
+
+	while (!list_empty(&viommu->requests)) {
+		len = 0;
+		req = virtqueue_get_buf(vq, &len);
+		/* Nothing was returned this time: poll again */
+		if (!req)
+			continue;
+
+		/* Zero bytes reported as written: flag the request as failed */
+		if (!len)
+			viommu_set_req_status(req->buf, req->len,
+					      VIRTIO_IOMMU_S_IOERR);
+
+		/*
+		 * Only copy the result back to the caller's buffer when the
+		 * reported length matches the whole device-writable part.
+		 */
+		write_len = req->len - req->write_offset;
+		if (req->writeback && len == write_len)
+			memcpy(req->writeback, req->buf + req->write_offset,
+			       write_len);
+
+		list_del(&req->list);
+		kfree(req);
+	}
+
+	return ret;
+}
+
+/* Locked wrapper around __viommu_sync_req(): drain the request queue */
+static int viommu_sync_req(struct viommu_dev *viommu)
+{
+	unsigned long irqflags;
+	int err;
+
+	spin_lock_irqsave(&viommu->request_lock, irqflags);
+
+	err = __viommu_sync_req(viommu);
+	if (err != 0)
+		dev_dbg(viommu->dev, "could not sync requests (%d)\n", err);
+
+	spin_unlock_irqrestore(&viommu->request_lock, irqflags);
+
+	return err;
+}
+
+/*
+ * __viommu_add_req - Add one request to the queue
+ * @viommu: the device that receives the request
+ * @buf: pointer to the request buffer
+ * @len: length of the request buffer
+ * @writeback: copy data back to the buffer when the request completes.
+ *
+ * Add a request to the queue. Only synchronize the queue if it's already full.
+ * Otherwise don't kick the queue nor wait for requests to complete.
+ *
+ * When @writeback is true, data written by the device, including the request
+ * status, is copied into @buf after the request completes. This is unsafe if
+ * the caller allocates @buf on stack and drops the lock between add_req() and
+ * sync_req().
+ *
+ * The caller must hold viommu->request_lock.
+ *
+ * Return 0 if the request was successfully added to the queue.
+ */
+static int __viommu_add_req(struct viommu_dev *viommu, void *buf, size_t len,
+			    bool writeback)
+{
+	int ret;
+	off_t write_offset;
+	struct viommu_request *req;
+	struct scatterlist top_sg, bottom_sg;
+	struct scatterlist *sg[2] = { &top_sg, &bottom_sg };
+	struct virtqueue *vq = viommu->vqs[VIOMMU_REQUEST_VQ];
+
+	assert_spin_locked(&viommu->request_lock);
+
+	/* Reject buffers too small to hold anything before the writable part */
+	write_offset = viommu_get_write_desc_offset(viommu, buf, len);
+	if (write_offset <= 0)
+		return -EINVAL;
+
+	req = kzalloc(sizeof(*req) + len, GFP_ATOMIC);
+	if (!req)
+		return -ENOMEM;
+
+	req->len = len;
+	if (writeback) {
+		req->writeback = buf + write_offset;
+		req->write_offset = write_offset;
+	}
+	/* Only the part before write_offset is copied from the caller */
+	memcpy(&req->buf, buf, write_offset);
+
+	/* One descriptor for the part we send, one for the device's reply */
+	sg_init_one(&top_sg, req->buf, write_offset);
+	sg_init_one(&bottom_sg, req->buf + write_offset, len - write_offset);
+
+	ret = virtqueue_add_sgs(vq, sg, 1, 1, req, GFP_ATOMIC);
+	if (ret == -ENOSPC) {
+		/* If the queue is full, sync and retry */
+		if (!__viommu_sync_req(viommu))
+			ret = virtqueue_add_sgs(vq, sg, 1, 1, req, GFP_ATOMIC);
+	}
+	if (ret)
+		goto err_free;
+
+	/* Track the request until __viommu_sync_req() retires it */
+	list_add_tail(&req->list, &viommu->requests);
+	return 0;
+
+err_free:
+	kfree(req);
+	return ret;
+}
+
+/*
+ * Queue a request without waiting for it and without copying the device's
+ * reply back into @buf. Takes the request lock around __viommu_add_req().
+ */
+static int viommu_add_req(struct viommu_dev *viommu, void *buf, size_t len)
+{
+	unsigned long irqflags;
+	int err;
+
+	spin_lock_irqsave(&viommu->request_lock, irqflags);
+
+	err = __viommu_add_req(viommu, buf, len, false);
+	if (err != 0)
+		dev_dbg(viommu->dev, "could not add request: %d\n", err);
+
+	spin_unlock_irqrestore(&viommu->request_lock, irqflags);
+
+	return err;
+}
+
+/*
+ * Queue a request, wait for the queue to drain, then report the status the
+ * device wrote into the request tail (as an errno). If the request could not
+ * be queued, the queueing error is returned instead.
+ */
+static int viommu_send_req_sync(struct viommu_dev *viommu, void *buf,
+				size_t len)
+{
+	unsigned long irqflags;
+	int err;
+
+	spin_lock_irqsave(&viommu->request_lock, irqflags);
+
+	err = __viommu_add_req(viommu, buf, len, true);
+	if (err) {
+		dev_dbg(viommu->dev, "could not add request (%d)\n", err);
+	} else {
+		err = __viommu_sync_req(viommu);
+		if (err)
+			dev_dbg(viommu->dev,
+				"could not sync requests (%d)\n", err);
+
+		/* Whatever the sync result, the request status is what counts */
+		err = viommu_get_req_errno(buf, len);
+	}
+
+	spin_unlock_irqrestore(&viommu->request_lock, irqflags);
+	return err;
+}
+
+/*
+ * viommu_add_mapping - add a mapping to the internal tree
+ *
+ * Record [iova, iova + size - 1] -> paddr in the domain's interval tree. The
+ * device is not involved.
+ *
+ * Return 0 on success, or -ENOMEM if the tree node cannot be allocated.
+ */
+static int viommu_add_mapping(struct viommu_domain *vdomain, unsigned long iova,
+			      phys_addr_t paddr, size_t size, u32 flags)
+{
+	struct viommu_mapping *entry;
+	unsigned long irqflags;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->iova.start = iova;
+	entry->iova.last = iova + size - 1;
+	entry->paddr = paddr;
+	entry->flags = flags;
+
+	spin_lock_irqsave(&vdomain->mappings_lock, irqflags);
+	interval_tree_insert(&entry->iova, &vdomain->mappings);
+	spin_unlock_irqrestore(&vdomain->mappings_lock, irqflags);
+
+	return 0;
+}
+
+/*
+ * viommu_del_mappings - remove mappings from the internal tree
+ *
+ * @vdomain: the domain
+ * @iova: start of the range
+ * @size: size of the range. A size of 0 corresponds to the entire address
+ *	space, because the last address is computed as iova + size - 1.
+ *
+ * Every mapping intersecting the range is removed in full. The walk stops at
+ * the first mapping that starts below @iova, since removing it would split it.
+ *
+ * Returns the number of bytes removed from the tree.
+ */
+static size_t viommu_del_mappings(struct viommu_domain *vdomain,
+				  unsigned long iova, size_t size)
+{
+	struct interval_tree_node *cur, *following;
+	struct viommu_mapping *entry;
+	unsigned long end = iova + size - 1;
+	unsigned long irqflags;
+	size_t total = 0;
+
+	spin_lock_irqsave(&vdomain->mappings_lock, irqflags);
+
+	for (cur = interval_tree_iter_first(&vdomain->mappings, iova, end);
+	     cur; cur = following) {
+		entry = container_of(cur, struct viommu_mapping, iova);
+		/* Look up the successor before the current node is freed */
+		following = interval_tree_iter_next(cur, iova, end);
+
+		/* Trying to split a mapping? */
+		if (entry->iova.start < iova)
+			break;
+
+		/*
+		 * Virtio-iommu doesn't allow UNMAP to split a mapping created
+		 * with a single MAP request, so remove the full mapping.
+		 */
+		total += entry->iova.last - entry->iova.start + 1;
+
+		interval_tree_remove(cur, &vdomain->mappings);
+		kfree(entry);
+	}
+
+	spin_unlock_irqrestore(&vdomain->mappings_lock, irqflags);
+
+	return total;
+}
+
+/*
+ * viommu_replay_mappings - re-send MAP requests
+ *
+ * When reattaching a domain that was previously detached from all endpoints,
+ * mappings were deleted from the device. Walk the internal tree and send one
+ * synchronous MAP request per recorded mapping, stopping at the first error.
+ *
+ * Return 0 on success, or the errno of the first failed request.
+ */
+static int viommu_replay_mappings(struct viommu_domain *vdomain)
+{
+	struct virtio_iommu_req_map req;
+	struct interval_tree_node *it;
+	struct viommu_mapping *entry;
+	unsigned long irqflags;
+	int err = 0;
+
+	spin_lock_irqsave(&vdomain->mappings_lock, irqflags);
+
+	for (it = interval_tree_iter_first(&vdomain->mappings, 0, -1UL); it;
+	     it = interval_tree_iter_next(it, 0, -1UL)) {
+		entry = container_of(it, struct viommu_mapping, iova);
+
+		req = (struct virtio_iommu_req_map) {
+			.head.type	= VIRTIO_IOMMU_T_MAP,
+			.domain		= cpu_to_le32(vdomain->id),
+			.virt_start	= cpu_to_le64(entry->iova.start),
+			.virt_end	= cpu_to_le64(entry->iova.last),
+			.phys_start	= cpu_to_le64(entry->paddr),
+			.flags		= cpu_to_le32(entry->flags),
+		};
+
+		err = viommu_send_req_sync(vdomain->viommu, &req, sizeof(req));
+		if (err)
+			break;
+	}
+
+	spin_unlock_irqrestore(&vdomain->mappings_lock, irqflags);
+
+	return err;
+}
+
+/*
+ * viommu_add_resv_mem - record a RESV_MEM probe property for an endpoint
+ * @vdev: the endpoint the property belongs to
+ * @mem: the property, as returned by the device (little-endian fields)
+ * @len: size of the property in bytes, header included
+ *
+ * Allocate an iommu_resv_region describing [mem->start, mem->end] and add it
+ * to the endpoint's resv_regions list. Unknown subtypes are reported and
+ * treated as reserved regions.
+ *
+ * Return 0 on success, -EINVAL if the property is too short, -EOVERFLOW if the
+ * range doesn't fit the host types, -ENOMEM if the region can't be allocated.
+ */
+static int viommu_add_resv_mem(struct viommu_endpoint *vdev,
+			       struct virtio_iommu_probe_resv_mem *mem,
+			       size_t len)
+{
+	size_t size;
+	u64 start64, end64;
+	phys_addr_t start, end;
+	struct iommu_resv_region *region = NULL;
+	unsigned long prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+
+	/* Validate the property size before reading any of its fields */
+	if (len < sizeof(*mem))
+		return -EINVAL;
+
+	start = start64 = le64_to_cpu(mem->start);
+	end = end64 = le64_to_cpu(mem->end);
+	size = end64 - start64 + 1;
+
+	/* Catch any overflow, including the unlikely end64 - start64 + 1 = 0 */
+	if (start != start64 || end != end64 || size < end64 - start64)
+		return -EOVERFLOW;
+
+	switch (mem->subtype) {
+	default:
+		dev_warn(vdev->dev, "unknown resv mem subtype 0x%x\n",
+			 mem->subtype);
+		/* Fall-through */
+	case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
+		region = iommu_alloc_resv_region(start, size, 0,
+						 IOMMU_RESV_RESERVED);
+		break;
+	case VIRTIO_IOMMU_RESV_MEM_T_MSI:
+		region = iommu_alloc_resv_region(start, size, prot,
+						 IOMMU_RESV_MSI);
+		break;
+	}
+	if (!region)
+		return -ENOMEM;
+
+	/*
+	 * list_add(new, head): the new entry is the region, the head is the
+	 * endpoint's list. Passing them the other way round re-links the list
+	 * head into each new region and drops previously added regions.
+	 */
+	list_add(&region->list, &vdev->resv_regions);
+	return 0;
+}
+
+/*
+ * viommu_probe_endpoint - send a PROBE request and parse the reply
+ * @viommu: the device that manages the endpoint
+ * @dev: the endpoint; its fwspec->iommu_priv must already point to the
+ *	 struct viommu_endpoint
+ *
+ * The reply holds up to viommu->probe_size bytes of properties, each made of
+ * a header (type, length) followed by length bytes of value. Parsing stops at
+ * a property of type NONE, at the end of the property area, or at a property
+ * that claims to extend past the end of the area.
+ *
+ * Return 0 on success or a negative errno. A property that fails to parse is
+ * logged and parsing continues, so the value returned is the result of the
+ * last property that was parsed.
+ */
+static int viommu_probe_endpoint(struct viommu_dev *viommu, struct device *dev)
+{
+	int ret;
+	u16 type;
+	/* Wider than 16 bits so that header size + length cannot wrap to 0 */
+	size_t len;
+	size_t cur = 0;
+	size_t probe_len;
+	struct virtio_iommu_req_probe *probe;
+	struct virtio_iommu_probe_property *prop;
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct viommu_endpoint *vdev = fwspec->iommu_priv;
+
+	if (!fwspec->num_ids)
+		return -EINVAL;
+
+	probe_len = sizeof(*probe) + viommu->probe_size +
+		    sizeof(struct virtio_iommu_req_tail);
+	probe = kzalloc(probe_len, GFP_KERNEL);
+	if (!probe)
+		return -ENOMEM;
+
+	probe->head.type = VIRTIO_IOMMU_T_PROBE;
+	/*
+	 * For now, assume that properties of an endpoint that outputs multiple
+	 * IDs are consistent. Only probe the first one.
+	 */
+	probe->endpoint = cpu_to_le32(fwspec->ids[0]);
+
+	ret = viommu_send_req_sync(viommu, probe, probe_len);
+	if (ret)
+		goto out_free;
+
+	prop = (void *)probe->properties;
+	type = le16_to_cpu(prop->type) & VIRTIO_IOMMU_PROBE_T_MASK;
+
+	while (type != VIRTIO_IOMMU_PROBE_T_NONE &&
+	       cur < viommu->probe_size) {
+		len = le16_to_cpu(prop->length) + sizeof(*prop);
+
+		/*
+		 * The length comes from the device. Don't let a property run
+		 * past the end of the property area (cur < probe_size here, so
+		 * the subtraction cannot underflow).
+		 */
+		if (len > viommu->probe_size - cur) {
+			dev_err(dev, "truncated viommu prop 0x%x\n", type);
+			ret = -EINVAL;
+			break;
+		}
+
+		switch (type) {
+		case VIRTIO_IOMMU_PROBE_T_RESV_MEM:
+			ret = viommu_add_resv_mem(vdev, (void *)prop, len);
+			break;
+		default:
+			dev_err(dev, "unknown viommu prop 0x%x\n", type);
+		}
+
+		if (ret)
+			dev_err(dev, "failed to parse viommu prop 0x%x\n", type);
+
+		cur += len;
+		if (cur >= viommu->probe_size)
+			break;
+
+		prop = (void *)probe->properties + cur;
+		type = le16_to_cpu(prop->type) & VIRTIO_IOMMU_PROBE_T_MASK;
+	}
+
+out_free:
+	kfree(probe);
+	return ret;
+}
+
+/*
+ * Log a fault reported by the device on the event queue. The message names
+ * the fault reason and the endpoint; the faulting address and access type
+ * are added when the fault carries the ADDRESS flag.
+ *
+ * Always returns 0.
+ */
+static int viommu_fault_handler(struct viommu_dev *viommu,
+				struct virtio_iommu_fault *fault)
+{
+	u32 flags	= le32_to_cpu(fault->flags);
+	u32 endpoint	= le32_to_cpu(fault->endpoint);
+	u64 address	= le64_to_cpu(fault->address);
+	char *what;
+
+	if (fault->reason == VIRTIO_IOMMU_FAULT_R_DOMAIN)
+		what = "domain";
+	else if (fault->reason == VIRTIO_IOMMU_FAULT_R_MAPPING)
+		what = "page";
+	else
+		what = "unknown";
+
+	/* TODO: find EP by ID and report_iommu_fault */
+	if (!(flags & VIRTIO_IOMMU_FAULT_F_ADDRESS)) {
+		dev_err_ratelimited(viommu->dev, "%s fault from EP %u\n",
+				    what, endpoint);
+		return 0;
+	}
+
+	dev_err_ratelimited(viommu->dev, "%s fault from EP %u at %#llx [%s%s%s]\n",
+			    what, endpoint, address,
+			    flags & VIRTIO_IOMMU_FAULT_F_READ ? "R" : "",
+			    flags & VIRTIO_IOMMU_FAULT_F_WRITE ? "W" : "",
+			    flags & VIRTIO_IOMMU_FAULT_F_EXEC ? "X" : "");
+	return 0;
+}
+
+/*
+ * Event virtqueue callback: handle every buffer the device has returned, then
+ * give each buffer back to the device and kick the queue once at the end.
+ */
+static void viommu_event_handler(struct virtqueue *vq)
+{
+	struct viommu_dev *viommu = vq->vdev->priv;
+	struct viommu_event *evt;
+	struct scatterlist sg[1];
+	unsigned int len;
+	int err;
+
+	for (;;) {
+		evt = virtqueue_get_buf(vq, &len);
+		if (evt == NULL)
+			break;
+
+		if (len > sizeof(*evt)) {
+			dev_err(viommu->dev,
+				"invalid event buffer (len %u != %zu)\n",
+				len, sizeof(*evt));
+		} else if (!(evt->head & VIOMMU_FAULT_RESV_MASK)) {
+			/* No reserved bit set in the head: treat as a fault */
+			viommu_fault_handler(viommu, &evt->fault);
+		}
+
+		/* Recycle the buffer */
+		sg_init_one(sg, evt, sizeof(*evt));
+		err = virtqueue_add_inbuf(vq, sg, 1, evt, GFP_ATOMIC);
+		if (err)
+			dev_err(viommu->dev, "could not add event buffer\n");
+	}
+
+	virtqueue_kick(vq);
+}
+
+/* IOMMU API */
+
+/*
+ * Allocate a domain of type UNMANAGED or DMA; other types are refused. The
+ * domain isn't bound to a viommu_dev yet, that happens on first attach. DMA
+ * domains additionally get a DMA cookie.
+ *
+ * Return the new domain, or NULL on failure.
+ */
+static struct iommu_domain *viommu_domain_alloc(unsigned type)
+{
+	struct viommu_domain *vdomain;
+
+	if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA)
+		return NULL;
+
+	vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL);
+	if (vdomain == NULL)
+		return NULL;
+
+	vdomain->mappings = RB_ROOT_CACHED;
+	spin_lock_init(&vdomain->mappings_lock);
+	mutex_init(&vdomain->mutex);
+
+	if (type != IOMMU_DOMAIN_DMA)
+		return &vdomain->domain;
+
+	if (iommu_get_dma_cookie(&vdomain->domain) != 0) {
+		kfree(vdomain);
+		return NULL;
+	}
+
+	return &vdomain->domain;
+}
+
+/*
+ * viommu_domain_finalise - bind a domain to the vIOMMU that will own it
+ * @viommu: the owning device
+ * @domain: the domain to finalise
+ *
+ * Allocate a domain ID within the range allowed by viommu->domain_bits and
+ * copy the device's page sizes and geometry into the domain.
+ *
+ * vdomain->viommu is only set once the ID has been allocated. That pointer is
+ * what viommu_domain_free() tests before releasing vdomain->id, and what
+ * viommu_attach_dev() tests to decide whether the domain still needs to be
+ * finalised, so it must not be set for a domain that has no ID.
+ *
+ * Return 0 on success, or the negative errno from the ID allocation.
+ */
+static int viommu_domain_finalise(struct viommu_dev *viommu,
+				  struct iommu_domain *domain)
+{
+	int ret;
+	struct viommu_domain *vdomain = to_viommu_domain(domain);
+	unsigned int max_domain = viommu->domain_bits > 31 ? ~0 :
+				  (1U << viommu->domain_bits) - 1;
+
+	ret = ida_alloc_max(&viommu->domain_ids, max_domain, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+
+	vdomain->id		= (unsigned int)ret;
+
+	domain->pgsize_bitmap	= viommu->pgsize_bitmap;
+	domain->geometry	= viommu->geometry;
+
+	vdomain->viommu		= viommu;
+
+	return 0;
+}
+
+/*
+ * Release a domain: drop its DMA cookie, empty the mapping tree, give the
+ * domain ID back if the domain was bound to a vIOMMU, and free the structure.
+ */
+static void viommu_domain_free(struct iommu_domain *domain)
+{
+	struct viommu_domain *vdomain = to_viommu_domain(domain);
+	struct viommu_dev *owner;
+
+	iommu_put_dma_cookie(domain);
+
+	/* iova 0 with size 0 selects the whole address space (size 2^64) */
+	viommu_del_mappings(vdomain, 0, 0);
+
+	owner = vdomain->viommu;
+	if (owner)
+		ida_free(&owner->domain_ids, vdomain->id);
+
+	kfree(vdomain);
+}
+
+/*
+ * viommu_attach_dev - attach an endpoint to a domain
+ *
+ * Finalise the domain on first use, send one ATTACH request per endpoint ID,
+ * and replay the domain's mappings if this is the first endpoint attached to
+ * it.
+ *
+ * Return 0 on success, -EXDEV if the domain belongs to another vIOMMU, or the
+ * error returned by finalisation or by a request.
+ */
+static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+	int i;
+	int ret = 0;
+	struct virtio_iommu_req_attach req;
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct viommu_endpoint *vdev = fwspec->iommu_priv;
+	struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+	mutex_lock(&vdomain->mutex);
+	if (!vdomain->viommu) {
+		/*
+		 * Properly initialize the domain now that we know which viommu
+		 * owns it.
+		 */
+		ret = viommu_domain_finalise(vdev->viommu, domain);
+	} else if (vdomain->viommu != vdev->viommu) {
+		dev_err(dev, "cannot attach to foreign vIOMMU\n");
+		ret = -EXDEV;
+	}
+	mutex_unlock(&vdomain->mutex);
+
+	if (ret)
+		return ret;
+
+	/*
+	 * In the virtio-iommu device, when attaching the endpoint to a new
+	 * domain, it is detached from the old one and, if as a result the
+	 * old domain isn't attached to any endpoint, all mappings are removed
+	 * from the old domain and it is freed.
+	 *
+	 * In the driver the old domain still exists, and its mappings will be
+	 * recreated if it gets reattached to an endpoint. Otherwise it will be
+	 * freed explicitly.
+	 *
+	 * vdev->vdomain is protected by group->mutex
+	 */
+	/*
+	 * NOTE(review): the count of the old domain is dropped before any
+	 * request is sent, and is not restored on the error returns below.
+	 */
+	if (vdev->vdomain)
+		vdev->vdomain->nr_endpoints--;
+
+	req = (struct virtio_iommu_req_attach) {
+		.head.type	= VIRTIO_IOMMU_T_ATTACH,
+		.domain		= cpu_to_le32(vdomain->id),
+	};
+
+	/* One ATTACH request per ID the endpoint outputs */
+	for (i = 0; i < fwspec->num_ids; i++) {
+		req.endpoint = cpu_to_le32(fwspec->ids[i]);
+
+		ret = viommu_send_req_sync(vdomain->viommu, &req, sizeof(req));
+		if (ret)
+			return ret;
+	}
+
+	if (!vdomain->nr_endpoints) {
+		/*
+		 * This endpoint is the first to be attached to the domain.
+		 * Replay existing mappings (e.g. SW MSI).
+		 */
+		ret = viommu_replay_mappings(vdomain);
+		if (ret)
+			return ret;
+	}
+
+	vdomain->nr_endpoints++;
+	vdev->vdomain = vdomain;
+
+	return 0;
+}
+
+/*
+ * viommu_map - map [iova, iova + size - 1] to paddr in a domain
+ *
+ * The mapping is always recorded in the internal tree. It is only sent to the
+ * device when at least one endpoint is attached to the domain; otherwise it
+ * is left for viommu_replay_mappings(). If the device refuses the request the
+ * tree entry is removed again.
+ *
+ * Return 0 on success or a negative errno.
+ */
+static int viommu_map(struct iommu_domain *domain, unsigned long iova,
+		      phys_addr_t paddr, size_t size, int prot)
+{
+	struct viommu_domain *vdomain = to_viommu_domain(domain);
+	struct virtio_iommu_req_map req;
+	int flags = 0;
+	int err;
+
+	if (prot & IOMMU_READ)
+		flags |= VIRTIO_IOMMU_MAP_F_READ;
+	if (prot & IOMMU_WRITE)
+		flags |= VIRTIO_IOMMU_MAP_F_WRITE;
+	if (prot & IOMMU_MMIO)
+		flags |= VIRTIO_IOMMU_MAP_F_MMIO;
+
+	err = viommu_add_mapping(vdomain, iova, paddr, size, flags);
+	if (err)
+		return err;
+
+	/* Nothing attached: the device doesn't know this domain yet */
+	if (!vdomain->nr_endpoints)
+		return 0;
+
+	req = (struct virtio_iommu_req_map) {
+		.head.type	= VIRTIO_IOMMU_T_MAP,
+		.domain		= cpu_to_le32(vdomain->id),
+		.virt_start	= cpu_to_le64(iova),
+		.phys_start	= cpu_to_le64(paddr),
+		.virt_end	= cpu_to_le64(iova + size - 1),
+		.flags		= cpu_to_le32(flags),
+	};
+
+	err = viommu_send_req_sync(vdomain->viommu, &req, sizeof(req));
+	if (err)
+		viommu_del_mappings(vdomain, iova, size);
+
+	return err;
+}
+
+/*
+ * viommu_unmap - remove mappings from a domain
+ *
+ * Mappings are first removed from the internal tree. When endpoints are
+ * attached, an UNMAP request covering the removed bytes is queued without
+ * waiting for completion; viommu_iotlb_sync() drains the queue.
+ *
+ * Return the number of bytes unmapped, or 0 if fewer than @size bytes were
+ * removed from the tree or the request could not be queued.
+ */
+static size_t viommu_unmap(struct iommu_domain *domain, unsigned long iova,
+			   size_t size)
+{
+	struct viommu_domain *vdomain = to_viommu_domain(domain);
+	struct virtio_iommu_req_unmap req;
+	size_t removed;
+
+	removed = viommu_del_mappings(vdomain, iova, size);
+	if (removed < size)
+		return 0;
+
+	/* Device already removed all mappings after detach. */
+	if (!vdomain->nr_endpoints)
+		return removed;
+
+	req = (struct virtio_iommu_req_unmap) {
+		.head.type	= VIRTIO_IOMMU_T_UNMAP,
+		.domain		= cpu_to_le32(vdomain->id),
+		.virt_start	= cpu_to_le64(iova),
+		.virt_end	= cpu_to_le64(iova + removed - 1),
+	};
+
+	if (viommu_add_req(vdomain->viommu, &req, sizeof(req)))
+		return 0;
+
+	return removed;
+}
+
+/*
+ * Translate an IOVA using the internal mapping tree. Return the physical
+ * address, or 0 when no recorded mapping contains @iova.
+ */
+static phys_addr_t viommu_iova_to_phys(struct iommu_domain *domain,
+				       dma_addr_t iova)
+{
+	struct viommu_domain *vdomain = to_viommu_domain(domain);
+	struct interval_tree_node *hit;
+	struct viommu_mapping *entry;
+	unsigned long irqflags;
+	u64 result = 0;
+
+	spin_lock_irqsave(&vdomain->mappings_lock, irqflags);
+
+	hit = interval_tree_iter_first(&vdomain->mappings, iova, iova);
+	if (hit) {
+		entry = container_of(hit, struct viommu_mapping, iova);
+		/* Same offset into the physical range as into the IOVA range */
+		result = entry->paddr + (iova - entry->iova.start);
+	}
+
+	spin_unlock_irqrestore(&vdomain->mappings_lock, irqflags);
+
+	return result;
+}
+
+/* Wait for the requests queued on the domain's vIOMMU to complete */
+static void viommu_iotlb_sync(struct iommu_domain *domain)
+{
+	viommu_sync_req(to_viommu_domain(domain)->viommu);
+}
+
+/*
+ * Fill @head with copies of the endpoint's reserved regions. When none of
+ * them is of type IOMMU_RESV_MSI, a software MSI region is added at
+ * MSI_IOVA_BASE. Regions from iommu_dma_get_resv_regions() are added last.
+ *
+ * On allocation failure the function returns early, leaving in @head whatever
+ * was added up to that point.
+ */
+static void viommu_get_resv_regions(struct device *dev, struct list_head *head)
+{
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct viommu_endpoint *vdev = fwspec->iommu_priv;
+	struct iommu_resv_region *region, *copy, *sw_msi;
+	int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+	bool has_msi = false;
+
+	list_for_each_entry(region, &vdev->resv_regions, list) {
+		if (region->type == IOMMU_RESV_MSI)
+			has_msi = true;
+
+		copy = kmemdup(region, sizeof(*region), GFP_KERNEL);
+		if (!copy)
+			return;
+		list_add_tail(&copy->list, head);
+	}
+
+	/*
+	 * If the device didn't register any bypass MSI window, add a
+	 * software-mapped region.
+	 */
+	if (!has_msi) {
+		sw_msi = iommu_alloc_resv_region(MSI_IOVA_BASE,
+						 MSI_IOVA_LENGTH, prot,
+						 IOMMU_RESV_SW_MSI);
+		if (!sw_msi)
+			return;
+
+		list_add_tail(&sw_msi->list, head);
+	}
+
+	iommu_dma_get_resv_regions(dev, head);
+}
+
+/*
+ * Free every region on @head. Entries are freed without being unlinked, so
+ * the list head must not be walked again afterwards.
+ */
+static void viommu_put_resv_regions(struct device *dev, struct list_head *head)
+{
+	struct iommu_resv_region *region, *tmp;
+
+	list_for_each_entry_safe(region, tmp, head, list) {
+		kfree(region);
+	}
+}
+
+static struct iommu_ops viommu_ops;
+static struct virtio_driver virtio_iommu_drv;
+
+/* driver_find_device() match: does the device's parent have this fwnode? */
+static int viommu_match_node(struct device *dev, const void *data)
+{
+	const struct fwnode_handle *wanted = data;
+
+	return dev->parent->fwnode == wanted;
+}
+
+/*
+ * Find the viommu_dev bound to this driver whose parent device has the given
+ * fwnode. The device reference taken by the lookup is dropped before
+ * returning. Return NULL when no device matches.
+ */
+static struct viommu_dev *viommu_get_by_fwnode(struct fwnode_handle *fwnode)
+{
+	struct device *found;
+
+	found = driver_find_device(&virtio_iommu_drv.driver, NULL, fwnode,
+				   viommu_match_node);
+	put_device(found);
+
+	if (!found)
+		return NULL;
+
+	return dev_to_virtio(found)->priv;
+}
+
+/*
+ * viommu_add_device - set up an endpoint managed by a virtio-iommu
+ *
+ * Look up the vIOMMU from the endpoint's fwspec, allocate the per-endpoint
+ * data, probe the endpoint's properties if the device supports it, link the
+ * endpoint to the IOMMU device and get its IOMMU group.
+ *
+ * Return 0 on success, -ENODEV if the endpoint isn't managed by this driver
+ * or the vIOMMU isn't found, or another negative errno.
+ */
+static int viommu_add_device(struct device *dev)
+{
+	int ret;
+	struct iommu_group *group;
+	struct viommu_endpoint *vdev;
+	struct viommu_dev *viommu = NULL;
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+	if (!fwspec || fwspec->ops != &viommu_ops)
+		return -ENODEV;
+
+	viommu = viommu_get_by_fwnode(fwspec->iommu_fwnode);
+	if (!viommu)
+		return -ENODEV;
+
+	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+	if (!vdev)
+		return -ENOMEM;
+
+	vdev->dev = dev;
+	vdev->viommu = viommu;
+	INIT_LIST_HEAD(&vdev->resv_regions);
+	/* viommu_probe_endpoint() reads the endpoint back from iommu_priv */
+	fwspec->iommu_priv = vdev;
+
+	if (viommu->probe_size) {
+		/* Get additional information for this endpoint */
+		ret = viommu_probe_endpoint(viommu, dev);
+		if (ret)
+			goto err_free_dev;
+	}
+
+	ret = iommu_device_link(&viommu->iommu, dev);
+	if (ret)
+		goto err_free_dev;
+
+	/*
+	 * Last step creates a default domain and attaches to it. Everything
+	 * must be ready.
+	 */
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR(group)) {
+		ret = PTR_ERR(group);
+		goto err_unlink_dev;
+	}
+
+	iommu_group_put(group);
+
+	/* group is not an error pointer here, so this evaluates to 0 */
+	return PTR_ERR_OR_ZERO(group);
+
+err_unlink_dev:
+	iommu_device_unlink(&viommu->iommu, dev);
+err_free_dev:
+	/*
+	 * NOTE(review): fwspec->iommu_priv still points to vdev after it is
+	 * freed here.
+	 */
+	viommu_put_resv_regions(dev, &vdev->resv_regions);
+	kfree(vdev);
+
+	return ret;
+}
+
+/*
+ * Undo viommu_add_device(): take the endpoint out of its IOMMU group, unlink
+ * it from the IOMMU device and free the per-endpoint data. Endpoints that are
+ * not managed by this driver are ignored.
+ */
+static void viommu_remove_device(struct device *dev)
+{
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct viommu_endpoint *ep;
+
+	if (fwspec && fwspec->ops == &viommu_ops) {
+		ep = fwspec->iommu_priv;
+
+		iommu_group_remove_device(dev);
+		iommu_device_unlink(&ep->viommu->iommu, dev);
+		viommu_put_resv_regions(dev, &ep->resv_regions);
+		kfree(ep);
+	}
+}
+
+/* Pick the grouping helper: PCI devices use the PCI one, others the generic */
+static struct iommu_group *viommu_device_group(struct device *dev)
+{
+	return dev_is_pci(dev) ? pci_device_group(dev) :
+				 generic_device_group(dev);
+}
+
+/* Add the first cell of the specifier to the device's fwspec as its ID */
+static int viommu_of_xlate(struct device *dev, struct of_phandle_args *args)
+{
+	return iommu_fwspec_add_ids(dev, &args->args[0], 1);
+}
+
+/*
+ * IOMMU API callbacks. The structure is not const because viommu_probe()
+ * writes the device's page size mask into viommu_ops.pgsize_bitmap.
+ */
+static struct iommu_ops viommu_ops = {
+	.domain_alloc		= viommu_domain_alloc,
+	.domain_free		= viommu_domain_free,
+	.attach_dev		= viommu_attach_dev,
+	.map			= viommu_map,
+	.unmap			= viommu_unmap,
+	.iova_to_phys		= viommu_iova_to_phys,
+	.iotlb_sync		= viommu_iotlb_sync,
+	.add_device		= viommu_add_device,
+	.remove_device		= viommu_remove_device,
+	.device_group		= viommu_device_group,
+	.get_resv_regions	= viommu_get_resv_regions,
+	.put_resv_regions	= viommu_put_resv_regions,
+	.of_xlate		= viommu_of_xlate,
+};
+
+/*
+ * Find the request and event virtqueues and store them in viommu->vqs. Only
+ * the event queue has a callback; requests are completed by polling.
+ */
+static int viommu_init_vqs(struct viommu_dev *viommu)
+{
+	struct virtio_device *vdev = dev_to_virtio(viommu->dev);
+	vq_callback_t *callbacks[] = {
+		[VIOMMU_REQUEST_VQ]	= NULL, /* No async requests */
+		[VIOMMU_EVENT_VQ]	= viommu_event_handler,
+	};
+	const char *names[] = {
+		[VIOMMU_REQUEST_VQ]	= "request",
+		[VIOMMU_EVENT_VQ]	= "event",
+	};
+
+	return virtio_find_vqs(vdev, VIOMMU_NR_VQS, viommu->vqs, callbacks,
+			       names, NULL);
+}
+
+/*
+ * Allocate one event buffer per free slot of the event virtqueue and post
+ * them all as device-writable buffers. The array is device-managed memory
+ * and is remembered in viommu->evts.
+ *
+ * Return 0 on success, -ENOMEM if the array can't be allocated, or the error
+ * from adding a buffer to the queue.
+ */
+static int viommu_fill_evtq(struct viommu_dev *viommu)
+{
+	struct virtqueue *vq = viommu->vqs[VIOMMU_EVENT_VQ];
+	size_t nr_evts = vq->num_free;
+	struct viommu_event *evts;
+	struct scatterlist sg[1];
+	int i, ret;
+
+	evts = devm_kmalloc_array(viommu->dev, nr_evts, sizeof(*evts),
+				  GFP_KERNEL);
+	viommu->evts = evts;
+	if (!evts)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_evts; i++) {
+		struct viommu_event *evt = &evts[i];
+
+		sg_init_one(sg, evt, sizeof(*evt));
+		ret = virtqueue_add_inbuf(vq, sg, 1, evt, GFP_KERNEL);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * viommu_probe - set up a virtio-iommu device
+ *
+ * Check the mandatory features, find the virtqueues, read the device
+ * configuration, fill the event queue, register the IOMMU device and install
+ * viommu_ops on the PCI, AMBA and platform buses.
+ *
+ * Return 0 on success, -ENODEV if a mandatory feature is missing, -EINVAL if
+ * the device reports an empty page size mask, or another negative errno.
+ */
+static int viommu_probe(struct virtio_device *vdev)
+{
+	struct device *parent_dev = vdev->dev.parent;
+	struct viommu_dev *viommu = NULL;
+	struct device *dev = &vdev->dev;
+	u64 input_start = 0;
+	u64 input_end = -1UL;
+	int ret;
+
+	if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1) ||
+	    !virtio_has_feature(vdev, VIRTIO_IOMMU_F_MAP_UNMAP))
+		return -ENODEV;
+
+	viommu = devm_kzalloc(dev, sizeof(*viommu), GFP_KERNEL);
+	if (!viommu)
+		return -ENOMEM;
+
+	spin_lock_init(&viommu->request_lock);
+	ida_init(&viommu->domain_ids);
+	viommu->dev = dev;
+	viommu->vdev = vdev;
+	INIT_LIST_HEAD(&viommu->requests);
+
+	ret = viommu_init_vqs(viommu);
+	if (ret)
+		return ret;
+
+	virtio_cread(vdev, struct virtio_iommu_config, page_size_mask,
+		     &viommu->pgsize_bitmap);
+
+	if (!viommu->pgsize_bitmap) {
+		ret = -EINVAL;
+		goto err_free_vqs;
+	}
+
+	/*
+	 * Defaults for the optional configuration fields: 32 domain bits and
+	 * the input range initialised above. Presumably they are left as is
+	 * when the matching feature isn't offered - confirm against
+	 * virtio_cread_feature().
+	 */
+	viommu->domain_bits = 32;
+
+	/* Optional features */
+	virtio_cread_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE,
+			     struct virtio_iommu_config, input_range.start,
+			     &input_start);
+
+	virtio_cread_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE,
+			     struct virtio_iommu_config, input_range.end,
+			     &input_end);
+
+	virtio_cread_feature(vdev, VIRTIO_IOMMU_F_DOMAIN_BITS,
+			     struct virtio_iommu_config, domain_bits,
+			     &viommu->domain_bits);
+
+	virtio_cread_feature(vdev, VIRTIO_IOMMU_F_PROBE,
+			     struct virtio_iommu_config, probe_size,
+			     &viommu->probe_size);
+
+	viommu->geometry = (struct iommu_domain_geometry) {
+		.aperture_start	= input_start,
+		.aperture_end	= input_end,
+		.force_aperture	= true,
+	};
+
+	/*
+	 * viommu_ops is a single file-scope object: this overwrites the mask
+	 * set by any previously probed device.
+	 */
+	viommu_ops.pgsize_bitmap = viommu->pgsize_bitmap;
+
+	virtio_device_ready(vdev);
+
+	/* Populate the event queue with buffers */
+	ret = viommu_fill_evtq(viommu);
+	if (ret)
+		goto err_free_vqs;
+
+	ret = iommu_device_sysfs_add(&viommu->iommu, dev, NULL, "%s",
+				     virtio_bus_name(vdev));
+	if (ret)
+		goto err_free_vqs;
+
+	iommu_device_set_ops(&viommu->iommu, &viommu_ops);
+	iommu_device_set_fwnode(&viommu->iommu, parent_dev->fwnode);
+
+	iommu_device_register(&viommu->iommu);
+
+	/* Install the ops on each bus, unless this driver already did */
+#ifdef CONFIG_PCI
+	if (pci_bus_type.iommu_ops != &viommu_ops) {
+		pci_request_acs();
+		ret = bus_set_iommu(&pci_bus_type, &viommu_ops);
+		if (ret)
+			goto err_unregister;
+	}
+#endif
+#ifdef CONFIG_ARM_AMBA
+	if (amba_bustype.iommu_ops != &viommu_ops) {
+		ret = bus_set_iommu(&amba_bustype, &viommu_ops);
+		if (ret)
+			goto err_unregister;
+	}
+#endif
+	if (platform_bus_type.iommu_ops != &viommu_ops) {
+		ret = bus_set_iommu(&platform_bus_type, &viommu_ops);
+		if (ret)
+			goto err_unregister;
+	}
+
+	/* viommu_get_by_fwnode() and the event handler read it from here */
+	vdev->priv = viommu;
+
+	dev_info(dev, "input address: %u bits\n",
+		 order_base_2(viommu->geometry.aperture_end));
+	dev_info(dev, "page mask: %#llx\n", viommu->pgsize_bitmap);
+
+	return 0;
+
+err_unregister:
+	iommu_device_sysfs_remove(&viommu->iommu);
+	iommu_device_unregister(&viommu->iommu);
+err_free_vqs:
+	vdev->config->del_vqs(vdev);
+
+	return ret;
+}
+
+/*
+ * Tear down a virtio-iommu device: remove the IOMMU device from sysfs and
+ * unregister it, then reset the virtio device and delete its virtqueues.
+ */
+static void viommu_remove(struct virtio_device *vdev)
+{
+	struct viommu_dev *viommu = vdev->priv;
+	struct iommu_device *iommu = &viommu->iommu;
+
+	iommu_device_sysfs_remove(iommu);
+	iommu_device_unregister(iommu);
+
+	/* Stop all virtqueues */
+	vdev->config->reset(vdev);
+	vdev->config->del_vqs(vdev);
+
+	dev_info(&vdev->dev, "device removed\n");
+}
+
+/* Configuration changes are not handled: only log that one happened */
+static void viommu_config_changed(struct virtio_device *vdev)
+{
+	struct device *dev = &vdev->dev;
+
+	dev_warn(dev, "config changed\n");
+}
+
+/* Feature bits listed in the driver's feature table */
+static unsigned int features[] = {
+	VIRTIO_IOMMU_F_MAP_UNMAP,
+	VIRTIO_IOMMU_F_DOMAIN_BITS,
+	VIRTIO_IOMMU_F_INPUT_RANGE,
+	VIRTIO_IOMMU_F_PROBE,
+};
+
+/* Match virtio IOMMU devices from any vendor; the zero entry ends the table */
+static struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_IOMMU, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+/* The virtio driver, registered by module_virtio_driver() below */
+static struct virtio_driver virtio_iommu_drv = {
+	.driver.name		= KBUILD_MODNAME,
+	.driver.owner		= THIS_MODULE,
+	.id_table		= id_table,
+	.feature_table		= features,
+	.feature_table_size	= ARRAY_SIZE(features),
+	.probe			= viommu_probe,
+	.remove			= viommu_remove,
+	.config_changed		= viommu_config_changed,
+};
+
+module_virtio_driver(virtio_iommu_drv);
+
+MODULE_DESCRIPTION("Virtio IOMMU driver");
+MODULE_AUTHOR("Jean-Philippe Brucker <jean-philippe.brucker@arm.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/of/base.c b/drivers/of/base.c
index 20e0e7ee4edf..55e7f5bb0549 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -2294,8 +2294,12 @@ int of_map_rid(struct device_node *np, u32 rid,
 		return 0;
 	}
 
-	pr_err("%pOF: Invalid %s translation - no match for rid 0x%x on %pOF\n",
-		np, map_name, rid, target && *target ? *target : NULL);
-	return -EFAULT;
+	pr_info("%pOF: no %s translation for rid 0x%x on %pOF\n", np, map_name,
+		rid, target && *target ? *target : NULL);
+
+	/* Bypasses translation */
+	if (id_out)
+		*id_out = rid;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(of_map_rid);
diff --git a/drivers/pci/of.c b/drivers/pci/of.c
index 73d5adec0a28..bc7b27a28795 100644
--- a/drivers/pci/of.c
+++ b/drivers/pci/of.c
@@ -22,12 +22,15 @@ void pci_set_of_node(struct pci_dev *dev)
 		return;
 	dev->dev.of_node = of_pci_find_child_device(dev->bus->dev.of_node,
 						    dev->devfn);
+	if (dev->dev.of_node)
+		dev->dev.fwnode = &dev->dev.of_node->fwnode;
 }
 
 void pci_release_of_node(struct pci_dev *dev)
 {
 	of_node_put(dev->dev.of_node);
 	dev->dev.of_node = NULL;
+	dev->dev.fwnode = NULL;
 }
 
 void pci_set_bus_of_node(struct pci_bus *bus)
@@ -41,13 +44,18 @@ void pci_set_bus_of_node(struct pci_bus *bus)
 		if (node && of_property_read_bool(node, "external-facing"))
 			bus->self->untrusted = true;
 	}
+
 	bus->dev.of_node = node;
+
+	if (bus->dev.of_node)
+		bus->dev.fwnode = &bus->dev.of_node->fwnode;
 }
 
 void pci_release_bus_of_node(struct pci_bus *bus)
 {
 	of_node_put(bus->dev.of_node);
 	bus->dev.of_node = NULL;
+	bus->dev.fwnode = NULL;
 }
 
 struct device_node * __weak pcibios_get_phb_of_node(struct pci_bus *bus)
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 1705398b026a..297e1076e571 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -792,7 +792,7 @@ static int virtscsi_probe(struct virtio_device *vdev)
 	num_targets = virtscsi_config_get(vdev, max_target) + 1;
 
 	shost = scsi_host_alloc(&virtscsi_host_template,
-		sizeof(*vscsi) + sizeof(vscsi->req_vqs[0]) * num_queues);
+				struct_size(vscsi, req_vqs, num_queues));
 	if (!shost)
 		return -ENOMEM;
 
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 247e5585af5d..1a2dd53caade 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -956,7 +956,7 @@ static void handle_tx(struct vhost_net *net)
 	if (!sock)
 		goto out;
 
-	if (!vq_iotlb_prefetch(vq))
+	if (!vq_meta_prefetch(vq))
 		goto out;
 
 	vhost_disable_notify(&net->dev, vq);
@@ -1125,7 +1125,7 @@ static void handle_rx(struct vhost_net *net)
 	if (!sock)
 		goto out;
 
-	if (!vq_iotlb_prefetch(vq))
+	if (!vq_meta_prefetch(vq))
 		goto out;
 
 	vhost_disable_notify(&net->dev, vq);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index ff8892c38666..0536f8526359 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -298,6 +298,160 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
 		__vhost_vq_meta_reset(d->vqs[i]);
 }
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+/* Free @map and its page array; callers unpublish it from vq->maps first. */
+static void vhost_map_unprefetch(struct vhost_map *map)
+{
+	kfree(map->pages);
+	/* The map itself came from kmalloc() in vhost_map_prefetch(). */
+	kfree(map);
+}
+
+static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
+{
+	struct vhost_map *map[VHOST_NUM_ADDRS];
+	int i;
+
+	/* Detach every published map while holding mmu_lock. */
+	spin_lock(&vq->mmu_lock);
+	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
+		map[i] = rcu_dereference_protected(vq->maps[i],
+				  lockdep_is_held(&vq->mmu_lock));
+		if (map[i])
+			rcu_assign_pointer(vq->maps[i], NULL);
+	}
+	spin_unlock(&vq->mmu_lock);
+
+	synchronize_rcu();
+	/* Grace period is over; tear down the maps detached above. */
+	for (i = 0; i < VHOST_NUM_ADDRS; i++)
+		if (map[i])
+			vhost_map_unprefetch(map[i]);
+}
+
+static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
+{
+	int i;
+
+	vhost_uninit_vq_maps(vq);
+	for (i = 0; i < VHOST_NUM_ADDRS; i++)
+		vq->uaddrs[i].size = 0;	/* zero size: range never overlaps */
+}
+
+/* True when [start, end] touches @uaddr's range; empty ranges never do. */
+static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
+				     unsigned long start, unsigned long end)
+{
+	if (unlikely(!uaddr->size))
+		return false;
+
+	return end >= uaddr->uaddr && start <= uaddr->uaddr - 1 + uaddr->size;
+}
+
+static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
+				      int index,
+				      unsigned long start,
+				      unsigned long end)
+{
+	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
+	struct vhost_map *map;
+	int i;
+
+	if (!vhost_map_range_overlap(uaddr, start, end))
+		return;
+	/* While the count is non-zero vhost_map_prefetch() refuses to remap. */
+	spin_lock(&vq->mmu_lock);
+	++vq->invalidate_count;
+	/* Unpublish the map, dirtying its pages first if it was writable. */
+	map = rcu_dereference_protected(vq->maps[index],
+					lockdep_is_held(&vq->mmu_lock));
+	if (map) {
+		if (uaddr->write) {
+			for (i = 0; i < map->npages; i++)
+				set_page_dirty(map->pages[i]);
+		}
+		rcu_assign_pointer(vq->maps[index], NULL);
+	}
+	spin_unlock(&vq->mmu_lock);
+	/* Wait for RCU readers of the old map before tearing it down. */
+	if (map) {
+		synchronize_rcu();
+		vhost_map_unprefetch(map);
+	}
+}
+
+static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
+				    int index,
+				    unsigned long start,
+				    unsigned long end)
+{
+	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
+		return;
+	/* Balances the increment done in vhost_invalidate_vq_start(). */
+	spin_lock(&vq->mmu_lock);
+	--vq->invalidate_count;
+	spin_unlock(&vq->mmu_lock);
+}
+
+static int vhost_invalidate_range_start(struct mmu_notifier *mn,
+					const struct mmu_notifier_range *range)
+{
+	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
+					     mmu_notifier);
+	int i, j;
+
+	if (!mmu_notifier_range_blockable(range))
+		return -EAGAIN;	/* NOTE(review): even when nothing overlaps */
+	/* Drop every cached vq mapping that overlaps the range. */
+	for (i = 0; i < dev->nvqs; i++) {
+		struct vhost_virtqueue *vq = dev->vqs[i];
+
+		for (j = 0; j < VHOST_NUM_ADDRS; j++)
+			vhost_invalidate_vq_start(vq, j,
+						  range->start,
+						  range->end);
+	}
+
+	return 0;
+}
+
+static void vhost_invalidate_range_end(struct mmu_notifier *mn,
+				       const struct mmu_notifier_range *range)
+{
+	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
+					     mmu_notifier);
+	int i, j;
+
+	for (i = 0; i < dev->nvqs; i++) {	/* every vq of the device */
+		struct vhost_virtqueue *vq = dev->vqs[i];
+
+		for (j = 0; j < VHOST_NUM_ADDRS; j++)	/* every uaddrs[] slot */
+			vhost_invalidate_vq_end(vq, j,
+						range->start,
+						range->end);
+	}
+}
+
+static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {	/* set in vhost_init_maps() */
+	.invalidate_range_start = vhost_invalidate_range_start,
+	.invalidate_range_end = vhost_invalidate_range_end,
+};
+
+/* Hook up the MMU notifier ops and start with no cached vq mappings. */
+static void vhost_init_maps(struct vhost_dev *dev)
+{
+	int q, slot;
+
+	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
+	for (q = 0; q < dev->nvqs; ++q) {
+		struct vhost_virtqueue *cur = dev->vqs[q];
+
+		for (slot = 0; slot < VHOST_NUM_ADDRS; slot++)
+			RCU_INIT_POINTER(cur->maps[slot], NULL);
+	}
+}
+#endif
+
 static void vhost_vq_reset(struct vhost_dev *dev,
 			   struct vhost_virtqueue *vq)
 {
@@ -326,7 +480,11 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->busyloop_timeout = 0;
 	vq->umem = NULL;
 	vq->iotlb = NULL;
+	vq->invalidate_count = 0;
 	__vhost_vq_meta_reset(vq);
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	vhost_reset_vq_maps(vq);
+#endif
 }
 
 static int vhost_worker(void *data)
@@ -427,6 +585,32 @@ bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_exceeds_weight);
 
+/* Size in bytes of the avail ring for @num entries; two more bytes are
+ * counted when vhost_has_feature() reports VIRTIO_RING_F_EVENT_IDX. */
+static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
+				   unsigned int num)
+{
+	size_t ring = sizeof(*vq->avail) + sizeof(*vq->avail->ring) * num;
+
+	return ring + (vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0);
+}
+
+/* Size in bytes of the used ring for @num entries; two more bytes are
+ * counted when vhost_has_feature() reports VIRTIO_RING_F_EVENT_IDX. */
+static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
+				  unsigned int num)
+{
+	size_t ring = sizeof(*vq->used) + sizeof(*vq->used->ring) * num;
+
+	return ring + (vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0);
+}
+
+/* Size in bytes of a descriptor table holding @num entries. */
+static size_t vhost_get_desc_size(struct vhost_virtqueue *vq, unsigned int num)
+{
+	return num * sizeof(*vq->desc);
+}
+
 void vhost_dev_init(struct vhost_dev *dev,
 		    struct vhost_virtqueue **vqs, int nvqs,
 		    int iov_limit, int weight, int byte_weight)
@@ -450,7 +634,9 @@ void vhost_dev_init(struct vhost_dev *dev,
 	INIT_LIST_HEAD(&dev->read_list);
 	INIT_LIST_HEAD(&dev->pending_list);
 	spin_lock_init(&dev->iotlb_lock);
-
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	vhost_init_maps(dev);
+#endif
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		vq = dev->vqs[i];
@@ -459,6 +645,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 		vq->heads = NULL;
 		vq->dev = dev;
 		mutex_init(&vq->mutex);
+		spin_lock_init(&vq->mmu_lock);
 		vhost_vq_reset(dev, vq);
 		if (vq->handle_kick)
 			vhost_poll_init(&vq->poll, vq->handle_kick,
@@ -538,7 +725,18 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 	if (err)
 		goto err_cgroup;
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
+	if (err)
+		goto err_mmu_notifier;
+#endif
+
 	return 0;
+
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+err_mmu_notifier:
+	vhost_dev_free_iovecs(dev);
+#endif
 err_cgroup:
 	kthread_stop(worker);
 	dev->worker = NULL;
@@ -629,6 +827,107 @@ static void vhost_clear_msg(struct vhost_dev *dev)
 	spin_unlock(&dev->iotlb_lock);
 }
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+/* Record the user address, length and write flag of slot @index in
+ * vq->uaddrs[]. */
+static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
+			      int index, unsigned long uaddr,
+			      size_t size, bool write)
+{
+	vq->uaddrs[index].uaddr = uaddr;
+	vq->uaddrs[index].size = size;
+	vq->uaddrs[index].write = write;
+}
+
+static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
+{
+	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
+			  (unsigned long)vq->desc,
+			  vhost_get_desc_size(vq, vq->num),
+			  false);	/* write flag off */
+	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
+			  (unsigned long)vq->avail,
+			  vhost_get_avail_size(vq, vq->num),
+			  false);	/* write flag off */
+	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
+			  (unsigned long)vq->used,
+			  vhost_get_used_size(vq, vq->num),
+			  true);	/* write flag on: pages dirtied on invalidate */
+}
+
+/* Build a kernel mapping for vq->uaddrs[index] and publish it in
+ * vq->maps[index]. Returns 0 on success or a negative errno.
+ */
+static int vhost_map_prefetch(struct vhost_virtqueue *vq, int index)
+{
+	struct vhost_map *map;
+	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
+	unsigned long offset = uaddr->uaddr & (PAGE_SIZE - 1);
+	/* The range may start mid-page, so count from the page start. */
+	int npages = DIV_ROUND_UP(offset + uaddr->size, PAGE_SIZE);
+	struct page **pages;
+	void *vaddr, *v;
+	int npinned, err, i;
+
+	spin_lock(&vq->mmu_lock);
+
+	err = -EFAULT;
+	if (vq->invalidate_count || !uaddr->size) /* busy or not set up */
+		goto err;
+
+	err = -ENOMEM;
+	map = kmalloc(sizeof(*map), GFP_ATOMIC);
+	if (!map)
+		goto err;
+
+	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
+	if (!pages)
+		goto err_pages;
+
+	err = -EFAULT;
+	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
+					uaddr->write, pages);
+	if (npinned > 0)
+		release_pages(pages, npinned);
+	if (npinned != npages)
+		goto err_gup;
+
+	for (i = 0; i < npinned; i++)
+		if (PageHighMem(pages[i]))
+			goto err_gup;
+
+	vaddr = v = page_address(pages[0]);
+
+	/* For simplicity, fall back to the userspace address if the
+	 * VA is not contiguous. */
+	for (i = 1; i < npinned; i++) {
+		v += PAGE_SIZE;
+		if (v != page_address(pages[i]))
+			goto err_gup;
+	}
+
+	map->addr = vaddr + offset;
+	map->npages = npages;
+	map->pages = pages;
+
+	rcu_assign_pointer(vq->maps[index], map);
+	/* No need for a synchronize_rcu(). This function should be
+	 * called by dev->worker so we are serialized with all
+	 * readers. */
+	spin_unlock(&vq->mmu_lock);
+
+	return 0;
+
+err_gup:
+	kfree(pages);
+err_pages:
+	kfree(map);
+err:
+	spin_unlock(&vq->mmu_lock);
+	return err;
+}
+#endif
+
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
 	int i;
@@ -658,8 +957,16 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 		kthread_stop(dev->worker);
 		dev->worker = NULL;
 	}
-	if (dev->mm)
+	if (dev->mm) {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+		mmu_notifier_unregister(&dev->mmu_notifier, dev->mm);
+#endif
 		mmput(dev->mm);
+	}
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	for (i = 0; i < dev->nvqs; i++)
+		vhost_uninit_vq_maps(dev->vqs[i]);
+#endif
 	dev->mm = NULL;
 }
 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
@@ -886,6 +1193,113 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 	ret; \
 })
 
+static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		rcu_read_lock();
+		/* Fast path: store via the kernel mapping of the used ring. */
+		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		if (likely(map)) {
+			used = map->addr;
+			*((__virtio16 *)&used->ring[vq->num]) =
+				cpu_to_vhost16(vq, vq->avail_idx);
+			rcu_read_unlock();
+			return 0;
+		}
+
+		rcu_read_unlock();
+	}
+#endif
+	/* No mapping, or an IOTLB is in use: go through the user pointer. */
+	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
+			      vhost_avail_event(vq));
+}
+
+static inline int vhost_put_used(struct vhost_virtqueue *vq,
+				 struct vring_used_elem *head, int idx,
+				 int count)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+	size_t size;
+
+	if (!vq->iotlb) {
+		rcu_read_lock();
+		/* Fast path: copy @count elements into the mapped used ring. */
+		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		if (likely(map)) {
+			used = map->addr;
+			size = count * sizeof(*head);
+			memcpy(used->ring + idx, head, size);
+			rcu_read_unlock();
+			return 0;
+		}
+
+		rcu_read_unlock();
+	}
+#endif
+	/* No mapping, or an IOTLB is in use: go through the user pointer. */
+	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
+				  count * sizeof(*head));
+}
+
+static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
+
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		rcu_read_lock();
+		/* Fast path: store vq->used_flags via the mapped used ring. */
+		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		if (likely(map)) {
+			used = map->addr;
+			used->flags = cpu_to_vhost16(vq, vq->used_flags);
+			rcu_read_unlock();
+			return 0;
+		}
+
+		rcu_read_unlock();
+	}
+#endif
+	/* No mapping, or an IOTLB is in use: go through the user pointer. */
+	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
+			      &vq->used->flags);
+}
+
+static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
+
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		rcu_read_lock();
+		/* Fast path: store last_used_idx via the mapped used ring. */
+		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		if (likely(map)) {
+			used = map->addr;
+			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
+			rcu_read_unlock();
+			return 0;
+		}
+
+		rcu_read_unlock();
+	}
+#endif
+	/* No mapping, or an IOTLB is in use: go through the user pointer. */
+	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
+			      &vq->used->idx);
+}
+
 #define vhost_get_user(vq, x, ptr, type)		\
 ({ \
 	int ret; \
@@ -924,6 +1338,155 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
 		mutex_unlock(&d->vqs[i]->mutex);
 }
 
+static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
+				      __virtio16 *idx)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		rcu_read_lock();
+		/* Fast path: load avail->idx via the mapped avail ring. */
+		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+		if (likely(map)) {
+			avail = map->addr;
+			*idx = avail->idx;
+			rcu_read_unlock();
+			return 0;
+		}
+
+		rcu_read_unlock();
+	}
+#endif
+	/* No mapping, or an IOTLB is in use: go through the user pointer. */
+	return vhost_get_avail(vq, *idx, &vq->avail->idx);
+}
+
+static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
+				       __virtio16 *head, int idx)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		rcu_read_lock();
+		/* Fast path: load the ring slot via the mapped avail ring. */
+		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+		if (likely(map)) {
+			avail = map->addr;
+			*head = avail->ring[idx & (vq->num - 1)];
+			rcu_read_unlock();
+			return 0;
+		}
+
+		rcu_read_unlock();
+	}
+#endif
+	/* No mapping, or an IOTLB is in use: go through the user pointer. */
+	return vhost_get_avail(vq, *head,
+			       &vq->avail->ring[idx & (vq->num - 1)]);
+}
+
+static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
+					__virtio16 *flags)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		rcu_read_lock();
+		/* Fast path: load avail->flags via the mapped avail ring. */
+		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+		if (likely(map)) {
+			avail = map->addr;
+			*flags = avail->flags;
+			rcu_read_unlock();
+			return 0;
+		}
+
+		rcu_read_unlock();
+	}
+#endif
+	/* No mapping, or an IOTLB is in use: go through the user pointer. */
+	return vhost_get_avail(vq, *flags, &vq->avail->flags);
+}
+
+static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
+				       __virtio16 *event)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		rcu_read_lock();
+		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+		if (likely(map)) {
+			avail = map->addr;
+			*event = (__virtio16)avail->ring[vq->num];
+			rcu_read_unlock();
+			return 0;
+		}
+		rcu_read_unlock();
+	}
+#endif
+	/* No mapping, or an IOTLB is in use: go through the user pointer. */
+	return vhost_get_avail(vq, *event, vhost_used_event(vq));
+}
+
+static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
+				     __virtio16 *idx)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		rcu_read_lock();
+		/* Fast path: load used->idx via the mapped used ring. */
+		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		if (likely(map)) {
+			used = map->addr;
+			*idx = used->idx;
+			rcu_read_unlock();
+			return 0;
+		}
+
+		rcu_read_unlock();
+	}
+#endif
+	/* No mapping, or an IOTLB is in use: go through the user pointer. */
+	return vhost_get_used(vq, *idx, &vq->used->idx);
+}
+
+static inline int vhost_get_desc(struct vhost_virtqueue *vq,
+				 struct vring_desc *desc, int idx)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_desc *d;
+
+	if (!vq->iotlb) {
+		rcu_read_lock();
+		/* Fast path: copy descriptor @idx out of the mapped table. */
+		map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]);
+		if (likely(map)) {
+			d = map->addr;
+			*desc = *(d + idx);
+			rcu_read_unlock();
+			return 0;
+		}
+
+		rcu_read_unlock();
+	}
+#endif
+	/* No mapping, or an IOTLB is in use: go through the user pointer. */
+	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
+}
+
 static int vhost_new_umem_range(struct vhost_umem *umem,
 				u64 start, u64 size, u64 end,
 				u64 userspace_addr, int perm)
@@ -1209,13 +1772,9 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
 			 struct vring_used __user *used)
 
 {
-	size_t s __maybe_unused = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
-
-	return access_ok(desc, num * sizeof *desc) &&
-	       access_ok(avail,
-			 sizeof *avail + num * sizeof *avail->ring + s) &&
-	       access_ok(used,
-			sizeof *used + num * sizeof *used->ring + s);
+	return access_ok(desc, vhost_get_desc_size(vq, num)) &&
+	       access_ok(avail, vhost_get_avail_size(vq, num)) &&
+	       access_ok(used, vhost_get_used_size(vq, num));
 }
 
 static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
@@ -1265,26 +1824,42 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
 	return true;
 }
 
-int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
+{
+	struct vhost_map *map;	/* plain pointer: rcu_dereference() result */
+	int i;
+
+	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
+		rcu_read_lock();
+		map = rcu_dereference(vq->maps[i]);
+		rcu_read_unlock();
+		if (unlikely(!map))	/* only tested, never dereferenced */
+			vhost_map_prefetch(vq, i);	/* best effort */
+	}
+}
+#endif
+
+int vq_meta_prefetch(struct vhost_virtqueue *vq)
 {
-	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 	unsigned int num = vq->num;
 
-	if (!vq->iotlb)
+	if (!vq->iotlb) {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+		vhost_vq_map_prefetch(vq);
+#endif
 		return 1;
+	}
 
 	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
-			       num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+			       vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
 	       iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
-			       sizeof *vq->avail +
-			       num * sizeof(*vq->avail->ring) + s,
+			       vhost_get_avail_size(vq, num),
 			       VHOST_ADDR_AVAIL) &&
 	       iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used,
-			       sizeof *vq->used +
-			       num * sizeof(*vq->used->ring) + s,
-			       VHOST_ADDR_USED);
+			       vhost_get_used_size(vq, num), VHOST_ADDR_USED);
 }
-EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);
+EXPORT_SYMBOL_GPL(vq_meta_prefetch);
 
 /* Can we log writes? */
 /* Caller should have device mutex but not vq mutex */
@@ -1299,13 +1874,10 @@ EXPORT_SYMBOL_GPL(vhost_log_access_ok);
 static bool vq_log_access_ok(struct vhost_virtqueue *vq,
 			     void __user *log_base)
 {
-	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
-
 	return vq_memory_access_ok(log_base, vq->umem,
 				   vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
 		(!vq->log_used || log_access_ok(log_base, vq->log_addr,
-					sizeof *vq->used +
-					vq->num * sizeof *vq->used->ring + s));
+				  vhost_get_used_size(vq, vq->num)));
 }
 
 /* Can we start vq? */
@@ -1405,6 +1977,121 @@ err:
 	return -EFAULT;
 }
 
+static long vhost_vring_set_num(struct vhost_dev *d,
+				struct vhost_virtqueue *vq,
+				void __user *argp)
+{
+	struct vhost_vring_state s;
+
+	/* Resizing ring with an active backend?
+	 * You don't want to do that. */
+	if (vq->private_data)
+		return -EBUSY;
+
+	if (copy_from_user(&s, argp, sizeof s))
+		return -EFAULT;
+	/* Ring size: a non-zero power of two no larger than 0xffff. */
+	if (!s.num || s.num > 0xffff || (s.num & (s.num - 1)))
+		return -EINVAL;
+	vq->num = s.num;
+
+	return 0;
+}
+
+/* Handle VHOST_SET_VRING_ADDR; called with vq->mutex held. */
+static long vhost_vring_set_addr(struct vhost_dev *d,
+				 struct vhost_virtqueue *vq, void __user *argp)
+{
+	struct vhost_vring_addr a;
+
+	if (copy_from_user(&a, argp, sizeof a))
+		return -EFAULT;
+	if (a.flags & ~(0x1 << VHOST_VRING_F_LOG))
+		return -EOPNOTSUPP;
+
+	/* For 32bit, verify that the top 32bits of the user
+	   data are set to zero. */
+	if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
+	    (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
+	    (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr)
+		return -EFAULT;
+
+	/* Make sure it's safe to cast pointers to vring types. */
+	BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);
+	BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
+	if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
+	    (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
+	    (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1)))
+		return -EINVAL;
+
+	/* We only verify access here if backend is configured.
+	 * If it is not, we don't as size might not have been setup.
+	 * We will verify when backend is configured. */
+	if (vq->private_data) {
+		if (!vq_access_ok(vq, vq->num,
+			(void __user *)(unsigned long)a.desc_user_addr,
+			(void __user *)(unsigned long)a.avail_user_addr,
+			(void __user *)(unsigned long)a.used_user_addr))
+			return -EINVAL;
+
+		/* Also validate log access for used ring if enabled,
+		 * sized the same way vq_log_access_ok() sizes it. */
+		if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) &&
+		    !log_access_ok(vq->log_base, a.log_guest_addr,
+				   vhost_get_used_size(vq, vq->num)))
+			return -EINVAL;
+	}
+
+	vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
+	vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
+	vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
+	vq->log_addr = a.log_guest_addr;
+	vq->used = (void __user *)(unsigned long)a.used_user_addr;
+
+	return 0;
+}
+
+/* Serve VHOST_SET_VRING_NUM/ADDR with the cached vq mappings torn down. */
+static long vhost_vring_set_num_addr(struct vhost_dev *d,
+				     struct vhost_virtqueue *vq,
+				     unsigned int ioctl, void __user *argp)
+{
+	long r;
+
+	mutex_lock(&vq->mutex);
+
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	/* Unregister the MMU notifier so that the invalidation callbacks
+	 * can access vq->uaddrs[] without holding a lock. */
+	if (d->mm)
+		mmu_notifier_unregister(&d->mmu_notifier, d->mm);
+	/* An unregister between range_start and range_end would leave this stuck. */
+	vq->invalidate_count = 0;
+	vhost_uninit_vq_maps(vq);
+#endif
+
+	switch (ioctl) {
+	case VHOST_SET_VRING_NUM:
+		r = vhost_vring_set_num(d, vq, argp);
+		break;
+	case VHOST_SET_VRING_ADDR:
+		r = vhost_vring_set_addr(d, vq, argp);
+		break;
+	default:
+		BUG();
+	}
+
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	if (r == 0)	/* keep the old uaddrs[] when the ioctl failed */
+		vhost_setup_vq_uaddr(vq);
+	if (d->mm)
+		mmu_notifier_register(&d->mmu_notifier, d->mm);
+#endif
+
+	mutex_unlock(&vq->mutex);
+
+	return r;
+}
 long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
 {
 	struct file *eventfp, *filep = NULL;
@@ -1414,7 +2101,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
 	struct vhost_virtqueue *vq;
 	struct vhost_vring_state s;
 	struct vhost_vring_file f;
-	struct vhost_vring_addr a;
 	u32 idx;
 	long r;
 
@@ -1427,26 +2113,14 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
 	idx = array_index_nospec(idx, d->nvqs);
 	vq = d->vqs[idx];
 
+	if (ioctl == VHOST_SET_VRING_NUM ||
+	    ioctl == VHOST_SET_VRING_ADDR) {
+		return vhost_vring_set_num_addr(d, vq, ioctl, argp);
+	}
+
 	mutex_lock(&vq->mutex);
 
 	switch (ioctl) {
-	case VHOST_SET_VRING_NUM:
-		/* Resizing ring with an active backend?
-		 * You don't want to do that. */
-		if (vq->private_data) {
-			r = -EBUSY;
-			break;
-		}
-		if (copy_from_user(&s, argp, sizeof s)) {
-			r = -EFAULT;
-			break;
-		}
-		if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) {
-			r = -EINVAL;
-			break;
-		}
-		vq->num = s.num;
-		break;
 	case VHOST_SET_VRING_BASE:
 		/* Moving base with an active backend?
 		 * You don't want to do that. */
@@ -1472,62 +2146,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
 		if (copy_to_user(argp, &s, sizeof s))
 			r = -EFAULT;
 		break;
-	case VHOST_SET_VRING_ADDR:
-		if (copy_from_user(&a, argp, sizeof a)) {
-			r = -EFAULT;
-			break;
-		}
-		if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) {
-			r = -EOPNOTSUPP;
-			break;
-		}
-		/* For 32bit, verify that the top 32bits of the user
-		   data are set to zero. */
-		if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
-		    (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
-		    (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) {
-			r = -EFAULT;
-			break;
-		}
-
-		/* Make sure it's safe to cast pointers to vring types. */
-		BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);
-		BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
-		if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
-		    (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
-		    (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1))) {
-			r = -EINVAL;
-			break;
-		}
-
-		/* We only verify access here if backend is configured.
-		 * If it is not, we don't as size might not have been setup.
-		 * We will verify when backend is configured. */
-		if (vq->private_data) {
-			if (!vq_access_ok(vq, vq->num,
-				(void __user *)(unsigned long)a.desc_user_addr,
-				(void __user *)(unsigned long)a.avail_user_addr,
-				(void __user *)(unsigned long)a.used_user_addr)) {
-				r = -EINVAL;
-				break;
-			}
-
-			/* Also validate log access for used ring if enabled. */
-			if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) &&
-			    !log_access_ok(vq->log_base, a.log_guest_addr,
-					   sizeof *vq->used +
-					   vq->num * sizeof *vq->used->ring)) {
-				r = -EINVAL;
-				break;
-			}
-		}
-
-		vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
-		vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
-		vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
-		vq->log_addr = a.log_guest_addr;
-		vq->used = (void __user *)(unsigned long)a.used_user_addr;
-		break;
 	case VHOST_SET_VRING_KICK:
 		if (copy_from_user(&f, argp, sizeof f)) {
 			r = -EFAULT;
@@ -1861,8 +2479,7 @@ EXPORT_SYMBOL_GPL(vhost_log_write);
 static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 {
 	void __user *used;
-	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
-			   &vq->used->flags) < 0)
+	if (vhost_put_used_flags(vq))
 		return -EFAULT;
 	if (unlikely(vq->log_used)) {
 		/* Make sure the flag is seen before log. */
@@ -1879,8 +2496,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
 {
-	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
-			   vhost_avail_event(vq)))
+	if (vhost_put_avail_event(vq))
 		return -EFAULT;
 	if (unlikely(vq->log_used)) {
 		void __user *used;
@@ -1916,7 +2532,7 @@ int vhost_vq_init_access(struct vhost_virtqueue *vq)
 		r = -EFAULT;
 		goto err;
 	}
-	r = vhost_get_used(vq, last_used_idx, &vq->used->idx);
+	r = vhost_get_used_idx(vq, &last_used_idx);
 	if (r) {
 		vq_err(vq, "Can't access used idx at %p\n",
 		       &vq->used->idx);
@@ -2115,7 +2731,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 	last_avail_idx = vq->last_avail_idx;
 
 	if (vq->avail_idx == vq->last_avail_idx) {
-		if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
+		if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
 			vq_err(vq, "Failed to access avail idx at %p\n",
 				&vq->avail->idx);
 			return -EFAULT;
@@ -2142,8 +2758,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 
 	/* Grab the next descriptor number they're advertising, and increment
 	 * the index we've seen. */
-	if (unlikely(vhost_get_avail(vq, ring_head,
-		     &vq->avail->ring[last_avail_idx & (vq->num - 1)]))) {
+	if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
 		vq_err(vq, "Failed to read head: idx %d address %p\n",
 		       last_avail_idx,
 		       &vq->avail->ring[last_avail_idx % vq->num]);
@@ -2178,8 +2793,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 			       i, vq->num, head);
 			return -EINVAL;
 		}
-		ret = vhost_copy_from_user(vq, &desc, vq->desc + i,
-					   sizeof desc);
+		ret = vhost_get_desc(vq, &desc, i);
 		if (unlikely(ret)) {
 			vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
 			       i, vq->desc + i);
@@ -2272,16 +2886,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 
 	start = vq->last_used_idx & (vq->num - 1);
 	used = vq->used->ring + start;
-	if (count == 1) {
-		if (vhost_put_user(vq, heads[0].id, &used->id)) {
-			vq_err(vq, "Failed to write used id");
-			return -EFAULT;
-		}
-		if (vhost_put_user(vq, heads[0].len, &used->len)) {
-			vq_err(vq, "Failed to write used len");
-			return -EFAULT;
-		}
-	} else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {
+	if (vhost_put_used(vq, heads, start, count)) {
 		vq_err(vq, "Failed to write used");
 		return -EFAULT;
 	}
@@ -2323,8 +2928,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
 
 	/* Make sure buffer is written before we update index. */
 	smp_wmb();
-	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
-			   &vq->used->idx)) {
+	if (vhost_put_used_idx(vq)) {
 		vq_err(vq, "Failed to increment used idx");
 		return -EFAULT;
 	}
@@ -2357,7 +2961,7 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 
 	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
 		__virtio16 flags;
-		if (vhost_get_avail(vq, flags, &vq->avail->flags)) {
+		if (vhost_get_avail_flags(vq, &flags)) {
 			vq_err(vq, "Failed to get flags");
 			return true;
 		}
@@ -2371,7 +2975,7 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	if (unlikely(!v))
 		return true;
 
-	if (vhost_get_avail(vq, event, vhost_used_event(vq))) {
+	if (vhost_get_used_event(vq, &event)) {
 		vq_err(vq, "Failed to get used event idx");
 		return true;
 	}
@@ -2416,7 +3020,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	if (vq->avail_idx != vq->last_avail_idx)
 		return false;
 
-	r = vhost_get_avail(vq, avail_idx, &vq->avail->idx);
+	r = vhost_get_avail_idx(vq, &avail_idx);
 	if (unlikely(r))
 		return false;
 	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
@@ -2452,7 +3056,7 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	/* They could have slipped one in as we were doing that: make
 	 * sure it's written, then check again. */
 	smp_mb();
-	r = vhost_get_avail(vq, avail_idx, &vq->avail->idx);
+	r = vhost_get_avail_idx(vq, &avail_idx);
 	if (r) {
 		vq_err(vq, "Failed to check avail idx at %p: %d\n",
 		       &vq->avail->idx, r);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 27a78a9b8cc7..819296332913 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -12,6 +12,9 @@
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
 #include <linux/atomic.h>
+#include <linux/pagemap.h>
+#include <linux/mmu_notifier.h>
+#include <asm/cacheflush.h>
 
 struct vhost_work;
 typedef void (*vhost_work_fn_t)(struct vhost_work *work);
@@ -80,6 +83,24 @@ enum vhost_uaddr_type {
 	VHOST_NUM_ADDRS = 3,
 };
 
+struct vhost_map {
+	int npages;
+	void *addr;
+	struct page **pages;
+};
+
+struct vhost_uaddr {
+	unsigned long uaddr;
+	size_t size;
+	bool write;
+};
+
+#if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
+#define VHOST_ARCH_CAN_ACCEL_UACCESS 1
+#else
+#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
+#endif
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -90,7 +111,22 @@ struct vhost_virtqueue {
 	struct vring_desc __user *desc;
 	struct vring_avail __user *avail;
 	struct vring_used __user *used;
+
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	/* Read by memory accessors, modified by meta data
+	 * prefetching, MMU notifier and vring ioctl().
+	 * Synchronized through mmu_lock (writers) and RCU (writers
+	 * and readers).
+	 */
+	struct vhost_map __rcu *maps[VHOST_NUM_ADDRS];
+	/* Read by MMU notifier, modified by vring ioctl(),
+	 * synchronized through MMU notifier
+	 * registering/unregistering.
+	 */
+	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
+#endif
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
+
 	struct file *kick;
 	struct eventfd_ctx *call_ctx;
 	struct eventfd_ctx *error_ctx;
@@ -145,6 +181,8 @@ struct vhost_virtqueue {
 	bool user_be;
 #endif
 	u32 busyloop_timeout;
+	spinlock_t mmu_lock;
+	int invalidate_count;
 };
 
 struct vhost_msg_node {
@@ -158,6 +196,9 @@ struct vhost_msg_node {
 
 struct vhost_dev {
 	struct mm_struct *mm;
+#ifdef CONFIG_MMU_NOTIFIER
+	struct mmu_notifier mmu_notifier;
+#endif
 	struct mutex mutex;
 	struct vhost_virtqueue **vqs;
 	int nvqs;
@@ -212,7 +253,7 @@ bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len,
 		    struct iovec *iov, int count);
-int vq_iotlb_prefetch(struct vhost_virtqueue *vq);
+int vq_meta_prefetch(struct vhost_virtqueue *vq);
 
 struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type);
 void vhost_enqueue_msg(struct vhost_dev *dev,
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index f363fbeb5ab0..e09edb5c5e06 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -463,9 +463,14 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 		       struct irq_affinity *desc)
 {
 	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
-	unsigned int irq = platform_get_irq(vm_dev->pdev, 0);
+	int irq = platform_get_irq(vm_dev->pdev, 0);
 	int i, err, queue_idx = 0;
 
+	if (irq < 0) {
+		dev_err(&vdev->dev, "Cannot get IRQ resource\n");
+		return irq;
+	}
+
 	err = request_irq(irq, vm_interrupt, IRQF_SHARED,
 			dev_name(&vdev->dev), vm_dev);
 	if (err)
author	Linus Torvalds <torvalds@linux-foundation.org>	2019-07-17 11:26:09 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-07-17 11:26:09 -0700
commit	3a1d5384b7decbff6519daa9c65a35665e227323 (patch)
tree	7442f1b74d452d82d6702f8cd25173cc81c0c634 /drivers
parent	37d4607ebbbf5d8b74cbcb9434a5ce6897a51864 (diff)
parent	5e663f0410fa2f355042209154029842ba1abd43 (diff)