diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-07-09 11:06:29 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-07-09 11:06:29 -0700 |
commit | 1eb8df18677d197d7538583823c373d7f13cbebc (patch) | |
tree | 32b76b7f2b84fea99ec64257bb4f5d95f6303866 /drivers | |
parent | d8dc121eeab9abfbc510097f8db83e87560f753b (diff) | |
parent | db7b337709a15d33cc5e901d2ee35d3bb3e42b2f (diff) |
Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
Pull virtio,vhost,vdpa updates from Michael Tsirkin:
- Doorbell remapping for ifcvf, mlx5
- virtio_vdpa support for mlx5
- Validate device input in several drivers (for SEV and friends)
- ZONE_MOVABLE aware handling in virtio-mem
- Misc fixes, cleanups
* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: (48 commits)
virtio-mem: prioritize unplug from ZONE_MOVABLE in Big Block Mode
virtio-mem: simplify high-level unplug handling in Big Block Mode
virtio-mem: prioritize unplug from ZONE_MOVABLE in Sub Block Mode
virtio-mem: simplify high-level unplug handling in Sub Block Mode
virtio-mem: simplify high-level plug handling in Sub Block Mode
virtio-mem: use page_zonenum() in virtio_mem_fake_offline()
virtio-mem: don't read big block size in Sub Block Mode
virtio/vdpa: clear the virtqueue state during probe
vp_vdpa: allow set vq state to initial state after reset
virtio-pci library: introduce vp_modern_get_driver_features()
vdpa: support packed virtqueue for set/get_vq_state()
virtio-ring: store DMA metadata in desc_extra for split virtqueue
virtio: use err label in __vring_new_virtqueue()
virtio_ring: introduce virtqueue_desc_add_split()
virtio_ring: secure handling of mapping errors
virtio-ring: factor out desc_extra allocation
virtio_ring: rename vring_desc_extra_packed
virtio-ring: maintain next in extra state for packed virtqueue
vdpa/mlx5: Clear vq ready indication upon device reset
vdpa/mlx5: Add support for doorbell bypassing
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/block/virtio_blk.c | 17 | ||||
-rw-r--r-- | drivers/char/virtio_console.c | 4 | ||||
-rw-r--r-- | drivers/net/virtio_net.c | 53 | ||||
-rw-r--r-- | drivers/vdpa/ifcvf/ifcvf_base.c | 4 | ||||
-rw-r--r-- | drivers/vdpa/ifcvf/ifcvf_base.h | 14 | ||||
-rw-r--r-- | drivers/vdpa/ifcvf/ifcvf_main.c | 43 | ||||
-rw-r--r-- | drivers/vdpa/mlx5/core/mlx5_vdpa.h | 2 | ||||
-rw-r--r-- | drivers/vdpa/mlx5/core/mr.c | 95 | ||||
-rw-r--r-- | drivers/vdpa/mlx5/core/resources.c | 7 | ||||
-rw-r--r-- | drivers/vdpa/mlx5/net/mlx5_vnet.c | 67 | ||||
-rw-r--r-- | drivers/vdpa/vdpa_sim/vdpa_sim.c | 4 | ||||
-rw-r--r-- | drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 1 | ||||
-rw-r--r-- | drivers/vdpa/virtio_pci/vp_vdpa.c | 43 | ||||
-rw-r--r-- | drivers/vhost/iotlb.c | 2 | ||||
-rw-r--r-- | drivers/vhost/scsi.c | 21 | ||||
-rw-r--r-- | drivers/vhost/vdpa.c | 4 | ||||
-rw-r--r-- | drivers/vhost/vhost.c | 8 | ||||
-rw-r--r-- | drivers/vhost/vhost.h | 21 | ||||
-rw-r--r-- | drivers/vhost/vsock.c | 2 | ||||
-rw-r--r-- | drivers/virtio/virtio_mem.c | 338 | ||||
-rw-r--r-- | drivers/virtio/virtio_pci_modern_dev.c | 21 | ||||
-rw-r--r-- | drivers/virtio/virtio_ring.c | 229 | ||||
-rw-r--r-- | drivers/virtio/virtio_vdpa.c | 15 |
23 files changed, 669 insertions, 346 deletions
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index e4bd3b1fc3c2..4b49df2dfd23 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -21,6 +21,9 @@ #define VQ_NAME_LEN 16 #define MAX_DISCARD_SEGMENTS 256u +/* The maximum number of sg elements that fit into a virtqueue */ +#define VIRTIO_BLK_MAX_SG_ELEMS 32768 + static int major; static DEFINE_IDA(vd_index_ida); @@ -447,13 +450,6 @@ static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize) /* Host must always specify the capacity. */ virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity); - /* If capacity is too big, truncate with warning. */ - if ((sector_t)capacity != capacity) { - dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n", - (unsigned long long)capacity); - capacity = (sector_t)-1; - } - nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9); string_get_size(nblocks, queue_logical_block_size(q), @@ -728,7 +724,10 @@ static int virtblk_probe(struct virtio_device *vdev) if (err || !sg_elems) sg_elems = 1; - /* We need an extra sg elements at head and tail. */ + /* Prevent integer overflows and honor max vq size */ + sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2); + + /* We need extra sg elements at head and tail. */ sg_elems += 2; vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); if (!vblk) { @@ -936,6 +935,8 @@ static int virtblk_freeze(struct virtio_device *vdev) blk_mq_quiesce_queue(vblk->disk->queue); vdev->config->del_vqs(vdev); + kfree(vblk->vqs); + return 0; } diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 59dfd9c421a1..7eaf303a7a86 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -475,7 +475,7 @@ static struct port_buffer *get_inbuf(struct port *port) buf = virtqueue_get_buf(port->in_vq, &len); if (buf) { - buf->len = len; + buf->len = min_t(size_t, len, buf->size); buf->offset = 0; port->stats.bytes_received += len; } @@ -1709,7 +1709,7 @@ static void control_work_handler(struct work_struct *work) while ((buf = virtqueue_get_buf(vq, &len))) { spin_unlock(&portdev->c_ivq_lock); - buf->len = len; + buf->len = min_t(size_t, len, buf->size); buf->offset = 0; handle_control_message(vq->vdev, portdev, buf); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b0b81458ca94..8a58a2f013af 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1516,12 +1516,16 @@ static void virtnet_poll_cleantx(struct receive_queue *rq) return; if (__netif_tx_trylock(txq)) { - free_old_xmit_skbs(sq, true); + do { + virtqueue_disable_cb(sq->vq); + free_old_xmit_skbs(sq, true); + } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); + + if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) + netif_tx_wake_queue(txq); + __netif_tx_unlock(txq); } - - if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) - netif_tx_wake_queue(txq); } static int virtnet_poll(struct napi_struct *napi, int budget) @@ -1592,6 +1596,8 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) struct virtnet_info *vi = sq->vq->vdev->priv; unsigned int index = vq2txq(sq->vq); struct netdev_queue *txq; + int opaque; + bool done; if (unlikely(is_xdp_raw_buffer_queue(vi, index))) { /* We don't need to enable cb for XDP */ @@ -1601,14 +1607,32 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) txq = netdev_get_tx_queue(vi->dev, index); __netif_tx_lock(txq, raw_smp_processor_id()); + virtqueue_disable_cb(sq->vq); free_old_xmit_skbs(sq, true); - __netif_tx_unlock(txq); - - virtqueue_napi_complete(napi, sq->vq, 0); if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) netif_tx_wake_queue(txq); + opaque = virtqueue_enable_cb_prepare(sq->vq); + + done = napi_complete_done(napi, 0); + + if (!done) + virtqueue_disable_cb(sq->vq); + + __netif_tx_unlock(txq); + + if (done) { + if (unlikely(virtqueue_poll(sq->vq, opaque))) { + if (napi_schedule_prep(napi)) { + __netif_tx_lock(txq, raw_smp_processor_id()); + virtqueue_disable_cb(sq->vq); + __netif_tx_unlock(txq); + __napi_schedule(napi); + } + } + } + return 0; } @@ -1670,10 +1694,14 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) bool use_napi = sq->napi.weight; /* Free up any pending old buffers before queueing new ones. */ - free_old_xmit_skbs(sq, false); + do { + if (use_napi) + virtqueue_disable_cb(sq->vq); + + free_old_xmit_skbs(sq, false); - if (use_napi && kick) - virtqueue_enable_cb_delayed(sq->vq); + } while (use_napi && kick && + unlikely(!virtqueue_enable_cb_delayed(sq->vq))); /* timestamp packet in software */ skb_tx_timestamp(skb); @@ -3310,8 +3338,11 @@ static __maybe_unused int virtnet_restore(struct virtio_device *vdev) virtnet_set_queues(vi, vi->curr_queue_pairs); err = virtnet_cpu_notif_add(vi); - if (err) + if (err) { + virtnet_freeze_down(vdev); + remove_vq_common(vi); return err; + } return 0; } diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c index 1a661ab45af5..6e197fe0fcf9 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.c +++ b/drivers/vdpa/ifcvf/ifcvf_base.c @@ -133,6 +133,8 @@ int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *pdev) &hw->notify_off_multiplier); hw->notify_bar = cap.bar; hw->notify_base = get_cap_addr(hw, &cap); + hw->notify_base_pa = pci_resource_start(pdev, cap.bar) + + le32_to_cpu(cap.offset); IFCVF_DBG(pdev, "hw->notify_base = %p\n", hw->notify_base); break; @@ -161,6 +163,8 @@ next: notify_off = ifc_ioread16(&hw->common_cfg->queue_notify_off); hw->vring[i].notify_addr = hw->notify_base + notify_off * hw->notify_off_multiplier; + hw->vring[i].notify_pa = hw->notify_base_pa + + notify_off * hw->notify_off_multiplier; } hw->lm_cfg = hw->base[IFCVF_LM_BAR]; diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h index 0111bfdeb342..2996db0da490 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.h +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -19,21 +19,9 @@ #include <uapi/linux/virtio_config.h> #include <uapi/linux/virtio_pci.h> -#define N3000_VENDOR_ID 0x1AF4 #define N3000_DEVICE_ID 0x1041 -#define N3000_SUBSYS_VENDOR_ID 0x8086 #define N3000_SUBSYS_DEVICE_ID 0x001A -#define C5000X_PL_VENDOR_ID 0x1AF4 -#define C5000X_PL_DEVICE_ID 0x1000 -#define C5000X_PL_SUBSYS_VENDOR_ID 0x8086 -#define C5000X_PL_SUBSYS_DEVICE_ID 0x0001 - -#define C5000X_PL_BLK_VENDOR_ID 0x1AF4 -#define C5000X_PL_BLK_DEVICE_ID 0x1001 -#define C5000X_PL_BLK_SUBSYS_VENDOR_ID 0x8086 -#define C5000X_PL_BLK_SUBSYS_DEVICE_ID 0x0002 - #define IFCVF_NET_SUPPORTED_FEATURES \ ((1ULL << VIRTIO_NET_F_MAC) | \ (1ULL << VIRTIO_F_ANY_LAYOUT) | \ @@ -73,6 +61,7 @@ struct vring_info { u16 last_avail_idx; bool ready; void __iomem *notify_addr; + phys_addr_t notify_pa; u32 irq; struct vdpa_callback cb; char msix_name[256]; @@ -87,6 +76,7 @@ struct ifcvf_hw { u8 notify_bar; /* Notificaiton bar address */ void __iomem *notify_base; + phys_addr_t notify_base_pa; u32 notify_off_multiplier; u64 req_features; u64 hw_features; diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index ab0ab5cf0f6e..21b78f1cd521 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -264,7 +264,7 @@ static int ifcvf_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid, { struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); - state->avail_index = ifcvf_get_vq_state(vf, qid); + state->split.avail_index = ifcvf_get_vq_state(vf, qid); return 0; } @@ -273,7 +273,7 @@ static int ifcvf_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid, { struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); - return ifcvf_set_vq_state(vf, qid, state->avail_index); + return ifcvf_set_vq_state(vf, qid, state->split.avail_index); } static void ifcvf_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid, @@ -413,6 +413,21 @@ static int ifcvf_vdpa_get_vq_irq(struct vdpa_device *vdpa_dev, return vf->vring[qid].irq; } +static struct vdpa_notification_area ifcvf_get_vq_notification(struct vdpa_device *vdpa_dev, + u16 idx) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + struct vdpa_notification_area area; + + area.addr = vf->vring[idx].notify_pa; + if (!vf->notify_off_multiplier) + area.size = PAGE_SIZE; + else + area.size = vf->notify_off_multiplier; + + return area; +} + /* * IFCVF currently does't have on-chip IOMMU, so not * implemented set_map()/dma_map()/dma_unmap() @@ -440,6 +455,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = { .get_config = ifcvf_vdpa_get_config, .set_config = ifcvf_vdpa_set_config, .set_config_cb = ifcvf_vdpa_set_config_cb, + .get_vq_notification = ifcvf_get_vq_notification, }; static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) @@ -536,18 +552,21 @@ static void ifcvf_remove(struct pci_dev *pdev) } static struct pci_device_id ifcvf_pci_ids[] = { - { PCI_DEVICE_SUB(N3000_VENDOR_ID, + /* N3000 network device */ + { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, N3000_DEVICE_ID, - N3000_SUBSYS_VENDOR_ID, + PCI_VENDOR_ID_INTEL, N3000_SUBSYS_DEVICE_ID) }, - { PCI_DEVICE_SUB(C5000X_PL_VENDOR_ID, - C5000X_PL_DEVICE_ID, - C5000X_PL_SUBSYS_VENDOR_ID, - C5000X_PL_SUBSYS_DEVICE_ID) }, - { PCI_DEVICE_SUB(C5000X_PL_BLK_VENDOR_ID, - C5000X_PL_BLK_DEVICE_ID, - C5000X_PL_BLK_SUBSYS_VENDOR_ID, - C5000X_PL_BLK_SUBSYS_DEVICE_ID) }, + /* C5000X-PL network device */ + { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, + VIRTIO_TRANS_ID_NET, + PCI_VENDOR_ID_INTEL, + VIRTIO_ID_NET) }, + /* C5000X-PL block device */ + { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, + VIRTIO_TRANS_ID_BLOCK, + PCI_VENDOR_ID_INTEL, + VIRTIO_ID_BLOCK) }, { 0 }, }; diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index b6cc53ba980c..0002b2136b48 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -35,12 +35,14 @@ struct mlx5_vdpa_mr { /* serialize mkey creation and destruction */ struct mutex mkey_mtx; + bool user_mr; }; struct mlx5_vdpa_resources { u32 pdn; struct mlx5_uars_page *uar; void __iomem *kick_addr; + u64 phys_kick_addr; u16 uid; u32 null_mkey; bool valid; diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index 800cfd1967ad..dcee6039e966 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -219,11 +219,6 @@ static void destroy_indirect_key(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_m mlx5_vdpa_destroy_mkey(mvdev, &mkey->mkey); } -static struct device *get_dma_device(struct mlx5_vdpa_dev *mvdev) -{ - return &mvdev->mdev->pdev->dev; -} - static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr, struct vhost_iotlb *iotlb) { @@ -239,7 +234,7 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr u64 pa; u64 paend; struct scatterlist *sg; - struct device *dma = get_dma_device(mvdev); + struct device *dma = mvdev->vdev.dma_dev; for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1); map; map = vhost_iotlb_itree_next(map, start, mr->end - 1)) { @@ -298,7 +293,7 @@ err_map: static void unmap_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr) { - struct device *dma = get_dma_device(mvdev); + struct device *dma = mvdev->vdev.dma_dev; destroy_direct_mr(mvdev, mr); dma_unmap_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); @@ -360,7 +355,7 @@ err_alloc: * indirect memory key that provides access to the enitre address space given * by iotlb. */ -static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) +static int create_user_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) { struct mlx5_vdpa_mr *mr = &mvdev->mr; struct mlx5_vdpa_direct_mr *dmr; @@ -374,9 +369,6 @@ static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb int err = 0; int nnuls; - if (mr->initialized) - return 0; - INIT_LIST_HEAD(&mr->head); for (map = vhost_iotlb_itree_first(iotlb, start, last); map; map = vhost_iotlb_itree_next(map, start, last)) { @@ -414,7 +406,7 @@ static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb if (err) goto err_chain; - mr->initialized = true; + mr->user_mr = true; return 0; err_chain: @@ -426,33 +418,94 @@ err_chain: return err; } -int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) +static int create_dma_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + void *mkc; + u32 *in; + int err; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, pd, mvdev->res.pdn); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_vdpa_create_mkey(mvdev, &mr->mkey, in, inlen); + if (!err) + mr->user_mr = false; + + kfree(in); + return err; +} + +static void destroy_dma_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) +{ + mlx5_vdpa_destroy_mkey(mvdev, &mr->mkey); +} + +static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) { struct mlx5_vdpa_mr *mr = &mvdev->mr; int err; - mutex_lock(&mr->mkey_mtx); + if (mr->initialized) + return 0; + + if (iotlb) + err = create_user_mr(mvdev, iotlb); + else + err = create_dma_mr(mvdev, mr); + + if (!err) + mr->initialized = true; + + return err; +} + +int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) +{ + int err; + + mutex_lock(&mvdev->mr.mkey_mtx); err = _mlx5_vdpa_create_mr(mvdev, iotlb); - mutex_unlock(&mr->mkey_mtx); + mutex_unlock(&mvdev->mr.mkey_mtx); return err; } -void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev) +static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) { - struct mlx5_vdpa_mr *mr = &mvdev->mr; struct mlx5_vdpa_direct_mr *dmr; struct mlx5_vdpa_direct_mr *n; - mutex_lock(&mr->mkey_mtx); - if (!mr->initialized) - goto out; - destroy_indirect_key(mvdev, mr); list_for_each_entry_safe_reverse(dmr, n, &mr->head, list) { list_del_init(&dmr->list); unmap_direct_mr(mvdev, dmr); kfree(dmr); } +} + +void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev) +{ + struct mlx5_vdpa_mr *mr = &mvdev->mr; + + mutex_lock(&mr->mkey_mtx); + if (!mr->initialized) + goto out; + + if (mr->user_mr) + destroy_user_mr(mvdev, mr); + else + destroy_dma_mr(mvdev, mr); + memset(mr, 0, sizeof(*mr)); mr->initialized = false; out: diff --git a/drivers/vdpa/mlx5/core/resources.c b/drivers/vdpa/mlx5/core/resources.c index 6521cbd0f5c2..d4606213f88a 100644 --- a/drivers/vdpa/mlx5/core/resources.c +++ b/drivers/vdpa/mlx5/core/resources.c @@ -54,6 +54,9 @@ static int create_uctx(struct mlx5_vdpa_dev *mvdev, u16 *uid) void *in; int err; + if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) + return 0; + /* 0 means not supported */ if (!MLX5_CAP_GEN(mvdev->mdev, log_max_uctx)) return -EOPNOTSUPP; @@ -79,6 +82,9 @@ static void destroy_uctx(struct mlx5_vdpa_dev *mvdev, u32 uid) u32 out[MLX5_ST_SZ_DW(destroy_uctx_out)] = {}; u32 in[MLX5_ST_SZ_DW(destroy_uctx_in)] = {}; + if (!uid) + return; + MLX5_SET(destroy_uctx_in, in, opcode, MLX5_CMD_OP_DESTROY_UCTX); MLX5_SET(destroy_uctx_in, in, uid, uid); @@ -247,6 +253,7 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev) goto err_key; kick_addr = mdev->bar_addr + offset; + res->phys_kick_addr = kick_addr; res->kick_addr = ioremap(kick_addr, PAGE_SIZE); if (!res->kick_addr) { diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index dda5dc6f7737..2a31467f7ac5 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -611,8 +611,8 @@ static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx) mlx5_db_free(ndev->mvdev.mdev, &vcq->db); } -static int umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num, - struct mlx5_vdpa_umem **umemp) +static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num, + struct mlx5_vdpa_umem **umemp) { struct mlx5_core_dev *mdev = ndev->mvdev.mdev; int p_a; @@ -635,7 +635,7 @@ static int umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq *umemp = &mvq->umem3; break; } - return p_a * mvq->num_ent + p_b; + (*umemp)->size = p_a * mvq->num_ent + p_b; } static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem) @@ -651,15 +651,10 @@ static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *m void *in; int err; __be64 *pas; - int size; struct mlx5_vdpa_umem *umem; - size = umem_size(ndev, mvq, num, &umem); - if (size < 0) - return size; - - umem->size = size; - err = umem_frag_buf_alloc(ndev, umem, size); + set_umem_size(ndev, mvq, num, &umem); + err = umem_frag_buf_alloc(ndev, umem, umem->size); if (err) return err; @@ -829,9 +824,9 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id); MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size); MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id); - MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem1.size); + MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size); MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id); - MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem1.size); + MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size); MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn); if (MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, eth_frame_offload_type)) MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, 1); @@ -1428,8 +1423,8 @@ static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx, return -EINVAL; } - mvq->used_idx = state->avail_index; - mvq->avail_idx = state->avail_index; + mvq->used_idx = state->split.avail_index; + mvq->avail_idx = state->split.avail_index; return 0; } @@ -1450,7 +1445,7 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa * Since both values should be identical, we take the value of * used_idx which is reported correctly. */ - state->avail_index = mvq->used_idx; + state->split.avail_index = mvq->used_idx; return 0; } @@ -1459,7 +1454,7 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n"); return err; } - state->avail_index = attr.used_index; + state->split.avail_index = attr.used_index; return 0; } @@ -1772,6 +1767,14 @@ out: mutex_unlock(&ndev->reslock); } +static void clear_vqs_ready(struct mlx5_vdpa_net *ndev) +{ + int i; + + for (i = 0; i < ndev->mvdev.max_vqs; i++) + ndev->vqs[i].ready = false; +} + static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); @@ -1782,10 +1785,15 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) if (!status) { mlx5_vdpa_info(mvdev, "performing device reset\n"); teardown_driver(ndev); + clear_vqs_ready(ndev); mlx5_vdpa_destroy_mr(&ndev->mvdev); ndev->mvdev.status = 0; ndev->mvdev.mlx_features = 0; ++mvdev->generation; + if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { + if (mlx5_vdpa_create_mr(mvdev, NULL)) + mlx5_vdpa_warn(mvdev, "create MR failed\n"); + } return; } @@ -1866,6 +1874,7 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev) ndev = to_mlx5_vdpa_ndev(mvdev); free_resources(ndev); + mlx5_vdpa_destroy_mr(mvdev); if (!is_zero_ether_addr(ndev->config.mac)) { pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev)); mlx5_mpfs_del_mac(pfmdev, ndev->config.mac); @@ -1876,8 +1885,22 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev) static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx) { + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct vdpa_notification_area ret = {}; + struct mlx5_vdpa_net *ndev; + phys_addr_t addr; + + /* If SF BAR size is smaller than PAGE_SIZE, do not use direct + * notification to avoid the risk of mapping pages that contain BAR of more + * than one SF + */ + if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT) + return ret; + ndev = to_mlx5_vdpa_ndev(mvdev); + addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr; + ret.addr = addr; + ret.size = PAGE_SIZE; return ret; } @@ -2037,14 +2060,20 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name) goto err_mtu; } - mvdev->vdev.dma_dev = mdev->device; + mvdev->vdev.dma_dev = &mdev->pdev->dev; err = mlx5_vdpa_alloc_resources(&ndev->mvdev); if (err) goto err_mpfs; + if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { + err = mlx5_vdpa_create_mr(mvdev, NULL); + if (err) + goto err_res; + } + err = alloc_resources(ndev); if (err) - goto err_res; + goto err_mr; mvdev->vdev.mdev = &mgtdev->mgtdev; err = _vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs)); @@ -2056,6 +2085,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name) err_reg: free_resources(ndev); +err_mr: + mlx5_vdpa_destroy_mr(mvdev); err_res: mlx5_vdpa_free_resources(&ndev->mvdev); err_mpfs: diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 98f793bc9376..14e024de5cbf 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -374,7 +374,7 @@ static int vdpasim_set_vq_state(struct vdpa_device *vdpa, u16 idx, struct vringh *vrh = &vq->vring; spin_lock(&vdpasim->lock); - vrh->last_avail_idx = state->avail_index; + vrh->last_avail_idx = state->split.avail_index; spin_unlock(&vdpasim->lock); return 0; @@ -387,7 +387,7 @@ static int vdpasim_get_vq_state(struct vdpa_device *vdpa, u16 idx, struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; struct vringh *vrh = &vq->vring; - state->avail_index = vrh->last_avail_idx; + state->split.avail_index = vrh->last_avail_idx; return 0; } diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index 5bfe1c281645..a790903f243e 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -15,7 +15,6 @@ #include <linux/blkdev.h> #include <linux/vringh.h> #include <linux/vdpa.h> -#include <linux/blkdev.h> #include <uapi/linux/virtio_blk.h> #include "vdpa_sim.h" diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index c76ebb531212..7b4a6396c553 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -210,13 +210,49 @@ static int vp_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 qid, return -EOPNOTSUPP; } +static int vp_vdpa_set_vq_state_split(struct vdpa_device *vdpa, + const struct vdpa_vq_state *state) +{ + const struct vdpa_vq_state_split *split = &state->split; + + if (split->avail_index == 0) + return 0; + + return -EOPNOTSUPP; +} + +static int vp_vdpa_set_vq_state_packed(struct vdpa_device *vdpa, + const struct vdpa_vq_state *state) +{ + const struct vdpa_vq_state_packed *packed = &state->packed; + + if (packed->last_avail_counter == 1 && + packed->last_avail_idx == 0 && + packed->last_used_counter == 1 && + packed->last_used_idx == 0) + return 0; + + return -EOPNOTSUPP; +} + static int vp_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 qid, const struct vdpa_vq_state *state) { - /* Note that this is not supported by virtio specification, so - * we return -ENOPOTSUPP here. This means we can't support live - * migration, vhost device start/stop. + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + /* Note that this is not supported by virtio specification. + * But if the state is by chance equal to the device initial + * state, we can let it go. */ + if ((vp_modern_get_status(mdev) & VIRTIO_CONFIG_S_FEATURES_OK) && + !vp_modern_get_queue_enable(mdev, qid)) { + if (vp_modern_get_driver_features(mdev) & + BIT_ULL(VIRTIO_F_RING_PACKED)) + return vp_vdpa_set_vq_state_packed(vdpa, state); + else + return vp_vdpa_set_vq_state_split(vdpa, state); + } + return -EOPNOTSUPP; } @@ -442,6 +478,7 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) vp_modern_map_vq_notify(mdev, i, &vp_vdpa->vring[i].notify_pa); if (!vp_vdpa->vring[i].notify) { + ret = -EINVAL; dev_warn(&pdev->dev, "Fail to map vq notify %d\n", i); goto err; } diff --git a/drivers/vhost/iotlb.c b/drivers/vhost/iotlb.c index 0fd3f87e913c..0582079e4bcc 100644 --- a/drivers/vhost/iotlb.c +++ b/drivers/vhost/iotlb.c @@ -83,7 +83,7 @@ int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, EXPORT_SYMBOL_GPL(vhost_iotlb_add_range); /** - * vring_iotlb_del_range - delete overlapped ranges from vhost IOTLB + * vhost_iotlb_del_range - delete overlapped ranges from vhost IOTLB * @iotlb: the IOTLB * @start: start of the IOVA range * @last: last of IOVA range diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index d16c04dcc144..46f897e41217 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1430,11 +1430,6 @@ static void vhost_scsi_handle_kick(struct vhost_work *work) vhost_scsi_handle_vq(vs, vq); } -static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index) -{ - vhost_poll_flush(&vs->vqs[index].vq.poll); -} - /* Callers must hold dev mutex */ static void vhost_scsi_flush(struct vhost_scsi *vs) { @@ -1453,10 +1448,7 @@ static void vhost_scsi_flush(struct vhost_scsi *vs) kref_put(&old_inflight[i]->kref, vhost_scsi_done_inflight); /* Flush both the vhost poll and vhost work */ - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) - vhost_scsi_flush_vq(vs, i); - vhost_work_flush(&vs->dev, &vs->vs_completion_work); - vhost_work_flush(&vs->dev, &vs->vs_event_work); + vhost_work_dev_flush(&vs->dev); /* Wait for all reqs issued before the flush to be finished */ for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) @@ -1740,11 +1732,12 @@ vhost_scsi_clear_endpoint(struct vhost_scsi *vs, mutex_lock(&vq->mutex); vhost_vq_set_backend(vq, NULL); mutex_unlock(&vq->mutex); - /* - * Make sure cmds are not running before tearing them - * down. - */ - vhost_scsi_flush(vs); + } + /* Make sure cmds are not running before tearing them down. */ + vhost_scsi_flush(vs); + + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { + vq = &vs->vqs[i].vq; vhost_scsi_destroy_vq_cmds(vq); } } diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index fb41db3da611..210ab35a7ebf 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -383,7 +383,7 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, if (r) return r; - vq->last_avail_idx = vq_state.avail_index; + vq->last_avail_idx = vq_state.split.avail_index; break; } @@ -401,7 +401,7 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, break; case VHOST_SET_VRING_BASE: - vq_state.avail_index = vq->last_avail_idx; + vq_state.split.avail_index = vq->last_avail_idx; if (ops->set_vq_state(vdpa, idx, &vq_state)) r = -EINVAL; break; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 5ccb0705beae..b9e853e6094d 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -231,7 +231,7 @@ void vhost_poll_stop(struct vhost_poll *poll) } EXPORT_SYMBOL_GPL(vhost_poll_stop); -void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) +void vhost_work_dev_flush(struct vhost_dev *dev) { struct vhost_flush_struct flush; @@ -243,13 +243,13 @@ void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) wait_for_completion(&flush.wait_event); } } -EXPORT_SYMBOL_GPL(vhost_work_flush); +EXPORT_SYMBOL_GPL(vhost_work_dev_flush); /* Flush any work that has been scheduled. When calling this, don't hold any * locks that are also used by the callback. */ void vhost_poll_flush(struct vhost_poll *poll) { - vhost_work_flush(poll->dev, &poll->work); + vhost_work_dev_flush(poll->dev); } EXPORT_SYMBOL_GPL(vhost_poll_flush); @@ -538,7 +538,7 @@ static int vhost_attach_cgroups(struct vhost_dev *dev) attach.owner = current; vhost_work_init(&attach.work, vhost_attach_cgroups_work); vhost_work_queue(dev, &attach.work); - vhost_work_flush(dev, &attach.work); + vhost_work_dev_flush(dev); return attach.ret; } diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index b063324c7669..638bb640d6b4 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -20,20 +20,20 @@ typedef void (*vhost_work_fn_t)(struct vhost_work *work); #define VHOST_WORK_QUEUED 1 struct vhost_work { - struct llist_node node; - vhost_work_fn_t fn; - unsigned long flags; + struct llist_node node; + vhost_work_fn_t fn; + unsigned long flags; }; /* Poll a file (eventfd or socket) */ /* Note: there's nothing vhost specific about this structure. */ struct vhost_poll { - poll_table table; - wait_queue_head_t *wqh; - wait_queue_entry_t wait; - struct vhost_work work; - __poll_t mask; - struct vhost_dev *dev; + poll_table table; + wait_queue_head_t *wqh; + wait_queue_entry_t wait; + struct vhost_work work; + __poll_t mask; + struct vhost_dev *dev; }; void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn); @@ -46,8 +46,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file); void vhost_poll_stop(struct vhost_poll *poll); void vhost_poll_flush(struct vhost_poll *poll); void vhost_poll_queue(struct vhost_poll *poll); -void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work); -long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); +void vhost_work_dev_flush(struct vhost_dev *dev); struct vhost_log { u64 addr; diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index d38c996b4f46..f249622ef11b 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -708,7 +708,7 @@ static void vhost_vsock_flush(struct vhost_vsock *vsock) for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) if (vsock->vqs[i].handle_kick) vhost_poll_flush(&vsock->vqs[i].poll); - vhost_work_flush(&vsock->dev, &vsock->send_pkt_work); + vhost_work_dev_flush(&vsock->dev); } static void vhost_vsock_reset_orphans(struct sock *sk) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index dc2a2e2b2ff8..09ed55de07d7 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -75,10 +75,14 @@ enum virtio_mem_sbm_mb_state { VIRTIO_MEM_SBM_MB_OFFLINE, /* Partially plugged, fully added to Linux, offline. */ VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, - /* Fully plugged, fully added to Linux, online. */ - VIRTIO_MEM_SBM_MB_ONLINE, - /* Partially plugged, fully added to Linux, online. */ - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL, + /* Fully plugged, fully added to Linux, onlined to a kernel zone. */ + VIRTIO_MEM_SBM_MB_KERNEL, + /* Partially plugged, fully added to Linux, online to a kernel zone */ + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, + /* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ + VIRTIO_MEM_SBM_MB_MOVABLE, + /* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, VIRTIO_MEM_SBM_MB_COUNT }; @@ -699,18 +703,6 @@ static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id) } /* - * See virtio_mem_remove_memory(): Try to remove all Linux memory blocks covered - * by the big block. - */ -static int virtio_mem_bbm_remove_bb(struct virtio_mem *vm, unsigned long bb_id) -{ - const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); - const uint64_t size = vm->bbm.bb_size; - - return virtio_mem_remove_memory(vm, addr, size); -} - -/* * Try offlining and removing memory from Linux. * * Must not be called with the vm->hotplug_mutex held (possible deadlock with @@ -832,11 +824,13 @@ static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, unsigned long mb_id) { switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { - case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL: + case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: + case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: virtio_mem_sbm_set_mb_state(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); break; - case VIRTIO_MEM_SBM_MB_ONLINE: + case VIRTIO_MEM_SBM_MB_KERNEL: + case VIRTIO_MEM_SBM_MB_MOVABLE: virtio_mem_sbm_set_mb_state(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE); break; @@ -847,21 +841,29 @@ static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, } static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, - unsigned long mb_id) + unsigned long mb_id, + unsigned long start_pfn) { + const bool is_movable = page_zonenum(pfn_to_page(start_pfn)) == + ZONE_MOVABLE; + int new_state; + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: - virtio_mem_sbm_set_mb_state(vm, mb_id, - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL); + new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL; + if (is_movable) + new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL; break; case VIRTIO_MEM_SBM_MB_OFFLINE: - virtio_mem_sbm_set_mb_state(vm, mb_id, - VIRTIO_MEM_SBM_MB_ONLINE); + new_state = VIRTIO_MEM_SBM_MB_KERNEL; + if (is_movable) + new_state = VIRTIO_MEM_SBM_MB_MOVABLE; break; default: BUG(); break; } + virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); } static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, @@ -1015,7 +1017,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, break; case MEM_ONLINE: if (vm->in_sbm) - virtio_mem_sbm_notify_online(vm, id); + virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn); atomic64_sub(size, &vm->offline_size); /* @@ -1137,7 +1139,7 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) */ static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) { - const bool is_movable = zone_idx(page_zone(pfn_to_page(pfn))) == + const bool is_movable = page_zonenum(pfn_to_page(pfn)) == ZONE_MOVABLE; int rc, retry_count; @@ -1455,8 +1457,8 @@ static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id) * * Note: can fail after some subblocks were unplugged. */ -static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, - unsigned long mb_id, uint64_t *nb_sb) +static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm, + unsigned long mb_id, uint64_t *nb_sb) { int sb_id, count; int rc; @@ -1498,7 +1500,7 @@ static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id) { uint64_t nb_sb = vm->sbm.sbs_per_mb; - return virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); + return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb); } /* @@ -1585,9 +1587,9 @@ static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, * Note: Can fail after some subblocks were successfully plugged. */ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, - unsigned long mb_id, uint64_t *nb_sb, - bool online) + unsigned long mb_id, uint64_t *nb_sb) { + const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); unsigned long pfn, nr_pages; int sb_id, count; int rc; @@ -1609,7 +1611,7 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, if (rc) return rc; *nb_sb -= count; - if (!online) + if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) continue; /* fake-online the pages if the memory block is online */ @@ -1619,23 +1621,22 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, virtio_mem_fake_online(pfn, nr_pages); } - if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { - if (online) - virtio_mem_sbm_set_mb_state(vm, mb_id, - VIRTIO_MEM_SBM_MB_ONLINE); - else - virtio_mem_sbm_set_mb_state(vm, mb_id, - VIRTIO_MEM_SBM_MB_OFFLINE); - } + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) + virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1); return 0; } static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) { + const int mb_states[] = { + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, + }; uint64_t nb_sb = diff / vm->sbm.sb_size; unsigned long mb_id; - int rc; + int rc, i; if (!nb_sb) return 0; @@ -1643,22 +1644,13 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) /* Don't race with onlining/offlining */ mutex_lock(&vm->hotplug_mutex); - /* Try to plug subblocks of partially plugged online blocks. */ - virtio_mem_sbm_for_each_mb(vm, mb_id, - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) { - rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, true); - if (rc || !nb_sb) - goto out_unlock; - cond_resched(); - } - - /* Try to plug subblocks of partially plugged offline blocks. */ - virtio_mem_sbm_for_each_mb(vm, mb_id, - VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { - rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, false); - if (rc || !nb_sb) - goto out_unlock; - cond_resched(); + for (i = 0; i < ARRAY_SIZE(mb_states); i++) { + virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) { + rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb); + if (rc || !nb_sb) + goto out_unlock; + cond_resched(); + } } /* @@ -1819,7 +1811,7 @@ static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, { int rc; - rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, nb_sb); + rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb); /* some subblocks might have been unplugged even on failure */ if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) @@ -1856,6 +1848,7 @@ static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, int count) { const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; + const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); unsigned long start_pfn; int rc; @@ -1874,8 +1867,17 @@ static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, return rc; } - virtio_mem_sbm_set_mb_state(vm, mb_id, - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL); + switch (old_state) { + case VIRTIO_MEM_SBM_MB_KERNEL: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL); + break; + case VIRTIO_MEM_SBM_MB_MOVABLE: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL); + break; + } + return 0; } @@ -1942,11 +1944,50 @@ unplugged: return 0; } +/* + * Unplug the desired number of plugged subblocks of a memory block that is + * already added to Linux. Will skip subblock of online memory blocks that are + * busy (by the OS). Will fail if any subblock that's not busy cannot get + * unplugged. + * + * Will modify the state of the memory block. Might temporarily drop the + * hotplug_mutex. + * + * Note: Can fail after some subblocks were successfully unplugged. Can + * return 0 even if subblocks were busy and could not get unplugged. + */ +static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, + unsigned long mb_id, + uint64_t *nb_sb) +{ + const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); + + switch (old_state) { + case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: + case VIRTIO_MEM_SBM_MB_KERNEL: + case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: + case VIRTIO_MEM_SBM_MB_MOVABLE: + return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb); + case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: + case VIRTIO_MEM_SBM_MB_OFFLINE: + return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb); + } + return -EINVAL; +} + static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) { + const int mb_states[] = { + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, + VIRTIO_MEM_SBM_MB_OFFLINE, + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, + VIRTIO_MEM_SBM_MB_MOVABLE, + VIRTIO_MEM_SBM_MB_KERNEL, + }; uint64_t nb_sb = diff / vm->sbm.sb_size; unsigned long mb_id; - int rc; + int rc, i; if (!nb_sb) return 0; @@ -1958,47 +1999,26 @@ static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) */ mutex_lock(&vm->hotplug_mutex); - /* Try to unplug subblocks of partially plugged offline blocks. */ - virtio_mem_sbm_for_each_mb_rev(vm, mb_id, - VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { - rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb); - if (rc || !nb_sb) - goto out_unlock; - cond_resched(); - } - - /* Try to unplug subblocks of plugged offline blocks. */ - virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE) { - rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb); - if (rc || !nb_sb) - goto out_unlock; - cond_resched(); - } - - if (!unplug_online) { - mutex_unlock(&vm->hotplug_mutex); - return 0; - } - - /* Try to unplug subblocks of partially plugged online blocks. */ - virtio_mem_sbm_for_each_mb_rev(vm, mb_id, - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) { - rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb); - if (rc || !nb_sb) - goto out_unlock; - mutex_unlock(&vm->hotplug_mutex); - cond_resched(); - mutex_lock(&vm->hotplug_mutex); - } - - /* Try to unplug subblocks of plugged online blocks. */ - virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE) { - rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb); - if (rc || !nb_sb) - goto out_unlock; - mutex_unlock(&vm->hotplug_mutex); - cond_resched(); - mutex_lock(&vm->hotplug_mutex); + /* + * We try unplug from partially plugged blocks first, to try removing + * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE + * as it's more reliable to unplug memory and remove whole memory + * blocks, and we don't want to trigger a zone imbalances by + * accidentially removing too much kernel memory. + */ + for (i = 0; i < ARRAY_SIZE(mb_states); i++) { + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) { + rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); + if (rc || !nb_sb) + goto out_unlock; + mutex_unlock(&vm->hotplug_mutex); + cond_resched(); + mutex_lock(&vm->hotplug_mutex); + } + if (!unplug_online && i == 1) { + mutex_unlock(&vm->hotplug_mutex); + return 0; + } } mutex_unlock(&vm->hotplug_mutex); @@ -2085,47 +2105,41 @@ rollback_safe_unplug: } /* - * Try to remove a big block from Linux and unplug it. Will fail with - * -EBUSY if some memory is online. - * - * Will modify the state of the memory block. + * Test if a big block is completely offline. */ -static int virtio_mem_bbm_remove_and_unplug_bb(struct virtio_mem *vm, - unsigned long bb_id) +static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm, + unsigned long bb_id) { - int rc; - - if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != - VIRTIO_MEM_BBM_BB_ADDED)) - return -EINVAL; + const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); + const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); + unsigned long pfn; - rc = virtio_mem_bbm_remove_bb(vm, bb_id); - if (rc) - return -EBUSY; + for (pfn = start_pfn; pfn < start_pfn + nr_pages; + pfn += PAGES_PER_SECTION) { + if (pfn_to_online_page(pfn)) + return false; + } - rc = virtio_mem_bbm_unplug_bb(vm, bb_id); - if (rc) - virtio_mem_bbm_set_bb_state(vm, bb_id, - VIRTIO_MEM_BBM_BB_PLUGGED); - else - virtio_mem_bbm_set_bb_state(vm, bb_id, - VIRTIO_MEM_BBM_BB_UNUSED); - return rc; + return true; } /* - * Test if a big block is completely offline. + * Test if a big block is completely onlined to ZONE_MOVABLE (or offline). */ -static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm, +static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm, unsigned long bb_id) { const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); + struct page *page; unsigned long pfn; for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn += PAGES_PER_SECTION) { - if (pfn_to_online_page(pfn)) + page = pfn_to_online_page(pfn); + if (!page) + continue; + if (page_zonenum(page) != ZONE_MOVABLE) return false; } @@ -2136,42 +2150,37 @@ static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) { uint64_t nb_bb = diff / vm->bbm.bb_size; uint64_t bb_id; - int rc; + int rc, i; if (!nb_bb) return 0; - /* Try to unplug completely offline big blocks first. */ - virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { - cond_resched(); - /* - * As we're holding no locks, this check is racy as memory - * can get onlined in the meantime - but we'll fail gracefully. - */ - if (!virtio_mem_bbm_bb_is_offline(vm, bb_id)) - continue; - rc = virtio_mem_bbm_remove_and_unplug_bb(vm, bb_id); - if (rc == -EBUSY) - continue; - if (!rc) - nb_bb--; - if (rc || !nb_bb) - return rc; - } - - if (!unplug_online) - return 0; + /* + * Try to unplug big blocks. Similar to SBM, start with offline + * big blocks. + */ + for (i = 0; i < 3; i++) { + virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { + cond_resched(); - /* Try to unplug any big blocks. */ - virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { - cond_resched(); - rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); - if (rc == -EBUSY) - continue; - if (!rc) - nb_bb--; - if (rc || !nb_bb) - return rc; + /* + * As we're holding no locks, these checks are racy, + * but we don't care. + */ + if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id)) + continue; + if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id)) + continue; + rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); + if (rc == -EBUSY) + continue; + if (!rc) + nb_bb--; + if (rc || !nb_bb) + return rc; + } + if (i == 0 && !unplug_online) + return 0; } return nb_bb ? -EBUSY : 0; @@ -2422,6 +2431,10 @@ static int virtio_mem_init(struct virtio_mem *vm) dev_warn(&vm->vdev->dev, "Some device memory is not addressable/pluggable. This can make some memory unusable.\n"); + /* Prepare the offline threshold - make sure we can add two blocks. */ + vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), + VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); + /* * We want subblocks to span at least MAX_ORDER_NR_PAGES and * pageblock_nr_pages pages. This: @@ -2468,14 +2481,11 @@ static int virtio_mem_init(struct virtio_mem *vm) vm->bbm.bb_size - 1; vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr); vm->bbm.next_bb_id = vm->bbm.first_bb_id; - } - /* Prepare the offline threshold - make sure we can add two blocks. */ - vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), - VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); - /* In BBM, we also want at least two big blocks. */ - vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, - vm->offline_threshold); + /* Make sure we can add two big blocks. */ + vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, + vm->offline_threshold); + } dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index 54f297028586..e11ed748e661 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -384,6 +384,27 @@ u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev) EXPORT_SYMBOL_GPL(vp_modern_get_features); /* + * vp_modern_get_driver_features - get driver features from device + * @mdev: the modern virtio-pci device + * + * Returns the driver features read from the device + */ +u64 vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + u64 features; + + vp_iowrite32(0, &cfg->guest_feature_select); + features = vp_ioread32(&cfg->guest_feature); + vp_iowrite32(1, &cfg->guest_feature_select); + features |= ((u64)vp_ioread32(&cfg->guest_feature) << 32); + + return features; +} +EXPORT_SYMBOL_GPL(vp_modern_get_driver_features); + +/* * vp_modern_set_features - set features to device * @mdev: the modern virtio-pci device * @features: the features set to device diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 71e16b53e9c1..89bfe46a8a7f 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -74,14 +74,14 @@ struct vring_desc_state_packed { void *data; /* Data for callback. */ struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */ u16 num; /* Descriptor list length. */ - u16 next; /* The next desc state in a list. */ u16 last; /* The last desc state in a list. */ }; -struct vring_desc_extra_packed { +struct vring_desc_extra { dma_addr_t addr; /* Buffer DMA addr. */ u32 len; /* Buffer length. */ u16 flags; /* Descriptor flags. */ + u16 next; /* The next desc state in a list. */ }; struct vring_virtqueue { @@ -113,6 +113,9 @@ struct vring_virtqueue { /* Last used index we've seen. */ u16 last_used_idx; + /* Hint for event idx: already triggered no need to disable. */ + bool event_triggered; + union { /* Available for split ring */ struct { @@ -130,6 +133,7 @@ struct vring_virtqueue { /* Per-descriptor state. */ struct vring_desc_state_split *desc_state; + struct vring_desc_extra *desc_extra; /* DMA address and size information */ dma_addr_t queue_dma_addr; @@ -166,7 +170,7 @@ struct vring_virtqueue { /* Per-descriptor state. */ struct vring_desc_state_packed *desc_state; - struct vring_desc_extra_packed *desc_extra; + struct vring_desc_extra *desc_extra; /* DMA address and size information */ dma_addr_t ring_dma_addr; @@ -364,8 +368,8 @@ static int vring_mapping_error(const struct vring_virtqueue *vq, * Split ring specific functions - *_split(). */ -static void vring_unmap_one_split(const struct vring_virtqueue *vq, - struct vring_desc *desc) +static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq, + struct vring_desc *desc) { u16 flags; @@ -389,6 +393,35 @@ static void vring_unmap_one_split(const struct vring_virtqueue *vq, } } +static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, + unsigned int i) +{ + struct vring_desc_extra *extra = vq->split.desc_extra; + u16 flags; + + if (!vq->use_dma_api) + goto out; + + flags = extra[i].flags; + + if (flags & VRING_DESC_F_INDIRECT) { + dma_unmap_single(vring_dma_dev(vq), + extra[i].addr, + extra[i].len, + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); + } else { + dma_unmap_page(vring_dma_dev(vq), + extra[i].addr, + extra[i].len, + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); + } + +out: + return extra[i].next; +} + static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, unsigned int total_sg, gfp_t gfp) @@ -412,6 +445,35 @@ static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, return desc; } +static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, + struct vring_desc *desc, + unsigned int i, + dma_addr_t addr, + unsigned int len, + u16 flags, + bool indirect) +{ + struct vring_virtqueue *vring = to_vvq(vq); + struct vring_desc_extra *extra = vring->split.desc_extra; + u16 next; + + desc[i].flags = cpu_to_virtio16(vq->vdev, flags); + desc[i].addr = cpu_to_virtio64(vq->vdev, addr); + desc[i].len = cpu_to_virtio32(vq->vdev, len); + + if (!indirect) { + next = extra[i].next; + desc[i].next = cpu_to_virtio16(vq->vdev, next); + + extra[i].addr = addr; + extra[i].len = len; + extra[i].flags = flags; + } else + next = virtio16_to_cpu(vq->vdev, desc[i].next); + + return next; +} + static inline int virtqueue_add_split(struct virtqueue *_vq, struct scatterlist *sgs[], unsigned int total_sg, @@ -484,11 +546,13 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, if (vring_mapping_error(vq, addr)) goto unmap_release; - desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT); - desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); - desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); prev = i; - i = virtio16_to_cpu(_vq->vdev, desc[i].next); + /* Note that we trust indirect descriptor + * table since it use stream DMA mapping. + */ + i = virtqueue_add_desc_split(_vq, desc, i, addr, sg->length, + VRING_DESC_F_NEXT, + indirect); } } for (; n < (out_sgs + in_sgs); n++) { @@ -497,15 +561,22 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, if (vring_mapping_error(vq, addr)) goto unmap_release; - desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE); - desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); - desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); prev = i; - i = virtio16_to_cpu(_vq->vdev, desc[i].next); + /* Note that we trust indirect descriptor + * table since it use stream DMA mapping. + */ + i = virtqueue_add_desc_split(_vq, desc, i, addr, + sg->length, + VRING_DESC_F_NEXT | + VRING_DESC_F_WRITE, + indirect); } } /* Last one doesn't continue. */ desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); + if (!indirect && vq->use_dma_api) + vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags = + ~VRING_DESC_F_NEXT; if (indirect) { /* Now that the indirect table is filled in, map it. */ @@ -515,13 +586,11 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, if (vring_mapping_error(vq, addr)) goto unmap_release; - vq->split.vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, - VRING_DESC_F_INDIRECT); - vq->split.vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, - addr); - - vq->split.vring.desc[head].len = cpu_to_virtio32(_vq->vdev, - total_sg * sizeof(struct vring_desc)); + virtqueue_add_desc_split(_vq, vq->split.vring.desc, + head, addr, + total_sg * sizeof(struct vring_desc), + VRING_DESC_F_INDIRECT, + false); } /* We're using some buffers from the free list. */ @@ -529,8 +598,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, /* Update free pointer */ if (indirect) - vq->free_head = virtio16_to_cpu(_vq->vdev, - vq->split.vring.desc[head].next); + vq->free_head = vq->split.desc_extra[head].next; else vq->free_head = i; @@ -575,8 +643,11 @@ unmap_release: for (n = 0; n < total_sg; n++) { if (i == err_idx) break; - vring_unmap_one_split(vq, &desc[i]); - i = virtio16_to_cpu(_vq->vdev, desc[i].next); + if (indirect) { + vring_unmap_one_split_indirect(vq, &desc[i]); + i = virtio16_to_cpu(_vq->vdev, desc[i].next); + } else + i = vring_unmap_one_split(vq, i); } if (indirect) @@ -630,14 +701,13 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, i = head; while (vq->split.vring.desc[i].flags & nextflag) { - vring_unmap_one_split(vq, &vq->split.vring.desc[i]); - i = virtio16_to_cpu(vq->vq.vdev, vq->split.vring.desc[i].next); + vring_unmap_one_split(vq, i); + i = vq->split.desc_extra[i].next; vq->vq.num_free++; } - vring_unmap_one_split(vq, &vq->split.vring.desc[i]); - vq->split.vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, - vq->free_head); + vring_unmap_one_split(vq, i); + vq->split.desc_extra[i].next = vq->free_head; vq->free_head = head; /* Plus final descriptor */ @@ -652,15 +722,14 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, if (!indir_desc) return; - len = virtio32_to_cpu(vq->vq.vdev, - vq->split.vring.desc[head].len); + len = vq->split.desc_extra[head].len; - BUG_ON(!(vq->split.vring.desc[head].flags & - cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))); + BUG_ON(!(vq->split.desc_extra[head].flags & + VRING_DESC_F_INDIRECT)); BUG_ON(len == 0 || len % sizeof(struct vring_desc)); for (j = 0; j < len / sizeof(struct vring_desc); j++) - vring_unmap_one_split(vq, &indir_desc[j]); + vring_unmap_one_split_indirect(vq, &indir_desc[j]); kfree(indir_desc); vq->split.desc_state[head].indir_desc = NULL; @@ -739,7 +808,10 @@ static void virtqueue_disable_cb_split(struct virtqueue *_vq) if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) { vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; - if (!vq->event) + if (vq->event) + /* TODO: this is a hack. Figure out a cleaner value to write. */ + vring_used_event(&vq->split.vring) = 0x0; + else vq->split.vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->split.avail_flags_shadow); @@ -912,7 +984,7 @@ static struct virtqueue *vring_create_virtqueue_split( */ static void vring_unmap_state_packed(const struct vring_virtqueue *vq, - struct vring_desc_extra_packed *state) + struct vring_desc_extra *state) { u16 flags; @@ -1061,7 +1133,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, 1 << VRING_PACKED_DESC_F_USED; } vq->packed.next_avail_idx = n; - vq->free_head = vq->packed.desc_state[id].next; + vq->free_head = vq->packed.desc_extra[id].next; /* Store token and indirect buffer state. */ vq->packed.desc_state[id].num = 1; @@ -1169,7 +1241,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, le16_to_cpu(flags); } prev = curr; - curr = vq->packed.desc_state[curr].next; + curr = vq->packed.desc_extra[curr].next; if ((unlikely(++i >= vq->packed.vring.num))) { i = 0; @@ -1213,13 +1285,16 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, unmap_release: err_idx = i; i = head; + curr = vq->free_head; vq->packed.avail_used_flags = avail_used_flags; for (n = 0; n < total_sg; n++) { if (i == err_idx) break; - vring_unmap_desc_packed(vq, &desc[i]); + vring_unmap_state_packed(vq, + &vq->packed.desc_extra[curr]); + curr = vq->packed.desc_extra[curr].next; i++; if (i >= vq->packed.vring.num) i = 0; @@ -1290,7 +1365,7 @@ static void detach_buf_packed(struct vring_virtqueue *vq, /* Clear data ptr. */ state->data = NULL; - vq->packed.desc_state[state->last].next = vq->free_head; + vq->packed.desc_extra[state->last].next = vq->free_head; vq->free_head = id; vq->vq.num_free += state->num; @@ -1299,7 +1374,7 @@ static void detach_buf_packed(struct vring_virtqueue *vq, for (i = 0; i < state->num; i++) { vring_unmap_state_packed(vq, &vq->packed.desc_extra[curr]); - curr = vq->packed.desc_state[curr].next; + curr = vq->packed.desc_extra[curr].next; } } @@ -1550,6 +1625,25 @@ static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq) return NULL; } +static struct vring_desc_extra *vring_alloc_desc_extra(struct vring_virtqueue *vq, + unsigned int num) +{ + struct vring_desc_extra *desc_extra; + unsigned int i; + + desc_extra = kmalloc_array(num, sizeof(struct vring_desc_extra), + GFP_KERNEL); + if (!desc_extra) + return NULL; + + memset(desc_extra, 0, num * sizeof(struct vring_desc_extra)); + + for (i = 0; i < num - 1; i++) + desc_extra[i].next = i + 1; + + return desc_extra; +} + static struct virtqueue *vring_create_virtqueue_packed( unsigned int index, unsigned int num, @@ -1567,7 +1661,6 @@ static struct virtqueue *vring_create_virtqueue_packed( struct vring_packed_desc_event *driver, *device; dma_addr_t ring_dma_addr, driver_event_dma_addr, device_event_dma_addr; size_t ring_size_in_bytes, event_size_in_bytes; - unsigned int i; ring_size_in_bytes = num * sizeof(struct vring_packed_desc); @@ -1605,6 +1698,7 @@ static struct virtqueue *vring_create_virtqueue_packed( vq->weak_barriers = weak_barriers; vq->broken = false; vq->last_used_idx = 0; + vq->event_triggered = false; vq->num_added = 0; vq->packed_ring = true; vq->use_dma_api = vring_use_dma_api(vdev); @@ -1649,18 +1743,11 @@ static struct virtqueue *vring_create_virtqueue_packed( /* Put everything in free lists. */ vq->free_head = 0; - for (i = 0; i < num-1; i++) - vq->packed.desc_state[i].next = i + 1; - vq->packed.desc_extra = kmalloc_array(num, - sizeof(struct vring_desc_extra_packed), - GFP_KERNEL); + vq->packed.desc_extra = vring_alloc_desc_extra(vq, num); if (!vq->packed.desc_extra) goto err_desc_extra; - memset(vq->packed.desc_extra, 0, - num * sizeof(struct vring_desc_extra_packed)); - /* No callback? Tell other side not to bother us. */ if (!callback) { vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; @@ -1875,7 +1962,7 @@ bool virtqueue_kick(struct virtqueue *vq) EXPORT_SYMBOL_GPL(virtqueue_kick); /** - * virtqueue_get_buf - get the next used buffer + * virtqueue_get_buf_ctx - get the next used buffer * @_vq: the struct virtqueue we're talking about. * @len: the length written into the buffer * @ctx: extra context for the token @@ -1919,6 +2006,12 @@ void virtqueue_disable_cb(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); + /* If device triggered an event already it won't trigger one again: + * no need to disable. + */ + if (vq->event_triggered) + return; + if (vq->packed_ring) virtqueue_disable_cb_packed(_vq); else @@ -1942,6 +2035,9 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); + if (vq->event_triggered) + vq->event_triggered = false; + return vq->packed_ring ? virtqueue_enable_cb_prepare_packed(_vq) : virtqueue_enable_cb_prepare_split(_vq); } @@ -2005,6 +2101,9 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); + if (vq->event_triggered) + vq->event_triggered = false; + return vq->packed_ring ? virtqueue_enable_cb_delayed_packed(_vq) : virtqueue_enable_cb_delayed_split(_vq); } @@ -2044,6 +2143,10 @@ irqreturn_t vring_interrupt(int irq, void *_vq) if (unlikely(vq->broken)) return IRQ_HANDLED; + /* Just a hint for performance: so it's ok that this can be racy! */ + if (vq->event) + vq->event_triggered = true; + pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); if (vq->vq.callback) vq->vq.callback(&vq->vq); @@ -2062,7 +2165,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, void (*callback)(struct virtqueue *), const char *name) { - unsigned int i; struct vring_virtqueue *vq; if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) @@ -2083,6 +2185,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, vq->weak_barriers = weak_barriers; vq->broken = false; vq->last_used_idx = 0; + vq->event_triggered = false; vq->num_added = 0; vq->use_dma_api = vring_use_dma_api(vdev); #ifdef DEBUG @@ -2114,20 +2217,26 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, vq->split.desc_state = kmalloc_array(vring.num, sizeof(struct vring_desc_state_split), GFP_KERNEL); - if (!vq->split.desc_state) { - kfree(vq); - return NULL; - } + if (!vq->split.desc_state) + goto err_state; + + vq->split.desc_extra = vring_alloc_desc_extra(vq, vring.num); + if (!vq->split.desc_extra) + goto err_extra; /* Put everything in free lists. */ vq->free_head = 0; - for (i = 0; i < vring.num-1; i++) - vq->split.vring.desc[i].next = cpu_to_virtio16(vdev, i + 1); memset(vq->split.desc_state, 0, vring.num * sizeof(struct vring_desc_state_split)); list_add_tail(&vq->vq.list, &vdev->vqs); return &vq->vq; + +err_extra: + kfree(vq->split.desc_state); +err_state: + kfree(vq); + return NULL; } EXPORT_SYMBOL_GPL(__vring_new_virtqueue); @@ -2208,8 +2317,10 @@ void vring_del_virtqueue(struct virtqueue *_vq) vq->split.queue_dma_addr); } } - if (!vq->packed_ring) + if (!vq->packed_ring) { kfree(vq->split.desc_state); + kfree(vq->split.desc_extra); + } list_del(&_vq->list); kfree(vq); } diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index e28acf482e0c..e1a141135992 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -142,6 +142,8 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, struct vdpa_callback cb; struct virtqueue *vq; u64 desc_addr, driver_addr, device_addr; + /* Assume split virtqueue, switch to packed if necessary */ + struct vdpa_vq_state state = {0}; unsigned long flags; u32 align, num; int err; @@ -191,6 +193,19 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, goto err_vq; } + /* reset virtqueue state index */ + if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + struct vdpa_vq_state_packed *s = &state.packed; + + s->last_avail_counter = 1; + s->last_avail_idx = 0; + s->last_used_counter = 1; + s->last_used_idx = 0; + } + err = ops->set_vq_state(vdpa, index, &state); + if (err) + goto err_vq; + ops->set_vq_ready(vdpa, index, 1); vq->priv = info; |