diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-11-01 13:55:40 -1000 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-11-01 13:55:40 -1000 |
commit | deefd5024f0772cf56052ace9a8c347dc70bcaf3 (patch) | |
tree | 3e0ff80b073c18c45103c5acbd318e62567cb989 | |
parent | 009fbfc97b6367762efa257f1478ec86d37949f9 (diff) | |
parent | 2b88119e35b00d8cb418d86abbace3b90a993bd7 (diff) |
Merge tag 'vfio-v6.7-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson:
- Add support for "chunk mode" in the mlx5-vfio-pci variant driver,
which allows both larger device image sizes for migration, beyond the
previous 4GB limit, and also read-ahead support for improved
migration performance (Yishai Hadas)
- A new bus master control interface for the CDX bus driver where there
is no in-band mechanism to toggle device DMA as there is through
config space on PCI devices (Nipun Gupta)
- Add explicit alignment directives to vfio data structures to reduce
the chance of breaking 32-bit userspace. In most cases this is
transparent and the remaining cases where data structures are padded
work within the existing rules for extending data structures within
vfio (Stefan Hajnoczi)
- Resolve a bug in the cdx bus driver noted when compiled with clang
where missing parenthesis result in the wrong operation (Nathan
Chancellor)
- Resolve errors reported by smatch for a function when dealing with
invalid inputs (Alex Williamson)
- Add migration support to the mtty vfio/mdev sample driver for testing
and integration purposes, allowing CI of migration without specific
hardware requirements. Also resolve many of the short- comings of
this driver relative to implementation of the vfio interrupt ioctl
along the way (Alex Williamson)
* tag 'vfio-v6.7-rc1' of https://github.com/awilliam/linux-vfio:
vfio/mtty: Enable migration support
vfio/mtty: Overhaul mtty interrupt handling
vfio: Fix smatch errors in vfio_combine_iova_ranges()
vfio/cdx: Add parentheses between bitwise AND expression and logical NOT
vfio/mlx5: Activate the chunk mode functionality
vfio/mlx5: Add support for READING in chunk mode
vfio/mlx5: Add support for SAVING in chunk mode
vfio/mlx5: Pre-allocate chunks for the STOP_COPY phase
vfio/mlx5: Rename some stuff to match chunk mode
vfio/mlx5: Enable querying state size which is > 4GB
vfio/mlx5: Refactor the SAVE callback to activate a work only upon an error
vfio/mlx5: Wake up the reader post of disabling the SAVING migration file
vfio: use __aligned_u64 in struct vfio_device_ioeventfd
vfio: use __aligned_u64 in struct vfio_device_gfx_plane_info
vfio: trivially use __aligned_u64 for ioctl structs
vfio-cdx: add bus mastering device feature support
vfio: add bus master feature to device feature ioctl
cdx: add support for bus mastering
-rw-r--r-- | drivers/cdx/cdx.c | 32 | ||||
-rw-r--r-- | drivers/cdx/controller/cdx_controller.c | 4 | ||||
-rw-r--r-- | drivers/cdx/controller/mcdi_functions.c | 58 | ||||
-rw-r--r-- | drivers/cdx/controller/mcdi_functions.h | 13 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gvt/kvmgt.c | 2 | ||||
-rw-r--r-- | drivers/vfio/cdx/main.c | 57 | ||||
-rw-r--r-- | drivers/vfio/cdx/private.h | 2 | ||||
-rw-r--r-- | drivers/vfio/pci/mlx5/cmd.c | 103 | ||||
-rw-r--r-- | drivers/vfio/pci/mlx5/cmd.h | 28 | ||||
-rw-r--r-- | drivers/vfio/pci/mlx5/main.c | 283 | ||||
-rw-r--r-- | drivers/vfio/vfio_main.c | 10 | ||||
-rw-r--r-- | include/linux/cdx/cdx_bus.h | 18 | ||||
-rw-r--r-- | include/uapi/linux/vfio.h | 47 | ||||
-rw-r--r-- | samples/vfio-mdev/mbochs.c | 2 | ||||
-rw-r--r-- | samples/vfio-mdev/mdpy.c | 2 | ||||
-rw-r--r-- | samples/vfio-mdev/mtty.c | 829 |
16 files changed, 1298 insertions, 192 deletions
diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c index d2cad4c670a0..9efb7584f952 100644 --- a/drivers/cdx/cdx.c +++ b/drivers/cdx/cdx.c @@ -182,6 +182,38 @@ cdx_match_id(const struct cdx_device_id *ids, struct cdx_device *dev) return NULL; } +int cdx_set_master(struct cdx_device *cdx_dev) +{ + struct cdx_controller *cdx = cdx_dev->cdx; + struct cdx_device_config dev_config; + int ret = -EOPNOTSUPP; + + dev_config.type = CDX_DEV_BUS_MASTER_CONF; + dev_config.bus_master_enable = true; + if (cdx->ops->dev_configure) + ret = cdx->ops->dev_configure(cdx, cdx_dev->bus_num, + cdx_dev->dev_num, &dev_config); + + return ret; +} +EXPORT_SYMBOL_GPL(cdx_set_master); + +int cdx_clear_master(struct cdx_device *cdx_dev) +{ + struct cdx_controller *cdx = cdx_dev->cdx; + struct cdx_device_config dev_config; + int ret = -EOPNOTSUPP; + + dev_config.type = CDX_DEV_BUS_MASTER_CONF; + dev_config.bus_master_enable = false; + if (cdx->ops->dev_configure) + ret = cdx->ops->dev_configure(cdx, cdx_dev->bus_num, + cdx_dev->dev_num, &dev_config); + + return ret; +} +EXPORT_SYMBOL_GPL(cdx_clear_master); + /** * cdx_bus_match - device to driver matching callback * @dev: the cdx device to match against diff --git a/drivers/cdx/controller/cdx_controller.c b/drivers/cdx/controller/cdx_controller.c index bb4ae7970e21..7828dac8edb1 100644 --- a/drivers/cdx/controller/cdx_controller.c +++ b/drivers/cdx/controller/cdx_controller.c @@ -56,6 +56,10 @@ static int cdx_configure_device(struct cdx_controller *cdx, case CDX_DEV_RESET_CONF: ret = cdx_mcdi_reset_device(cdx->priv, bus_num, dev_num); break; + case CDX_DEV_BUS_MASTER_CONF: + ret = cdx_mcdi_bus_master_enable(cdx->priv, bus_num, dev_num, + dev_config->bus_master_enable); + break; default: ret = -EINVAL; } diff --git a/drivers/cdx/controller/mcdi_functions.c b/drivers/cdx/controller/mcdi_functions.c index 0158f26533dd..fc82435d5dea 100644 --- a/drivers/cdx/controller/mcdi_functions.c +++ b/drivers/cdx/controller/mcdi_functions.c @@ -137,3 +137,61 @@ int cdx_mcdi_reset_device(struct cdx_mcdi *cdx, u8 bus_num, u8 dev_num) return ret; } + +static int cdx_mcdi_ctrl_flag_get(struct cdx_mcdi *cdx, u8 bus_num, + u8 dev_num, u32 *flags) +{ + MCDI_DECLARE_BUF(inbuf, MC_CMD_CDX_DEVICE_CONTROL_GET_IN_LEN); + MCDI_DECLARE_BUF(outbuf, MC_CMD_CDX_DEVICE_CONTROL_GET_OUT_LEN); + size_t outlen; + int ret; + + MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_GET_IN_BUS, bus_num); + MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_GET_IN_DEVICE, dev_num); + ret = cdx_mcdi_rpc(cdx, MC_CMD_CDX_DEVICE_CONTROL_GET, inbuf, + sizeof(inbuf), outbuf, sizeof(outbuf), &outlen); + if (ret) + return ret; + + if (outlen != MC_CMD_CDX_DEVICE_CONTROL_GET_OUT_LEN) + return -EIO; + + *flags = MCDI_DWORD(outbuf, CDX_DEVICE_CONTROL_GET_OUT_FLAGS); + + return 0; +} + +static int cdx_mcdi_ctrl_flag_set(struct cdx_mcdi *cdx, u8 bus_num, + u8 dev_num, bool enable, int bit_pos) +{ + MCDI_DECLARE_BUF(inbuf, MC_CMD_CDX_DEVICE_CONTROL_SET_IN_LEN); + u32 flags; + int ret; + + /* + * Get flags and then set/reset bit at bit_pos according to + * the input params. + */ + ret = cdx_mcdi_ctrl_flag_get(cdx, bus_num, dev_num, &flags); + if (ret) + return ret; + + flags = flags & (u32)(~(BIT(bit_pos))); + if (enable) + flags |= (1 << bit_pos); + + MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_BUS, bus_num); + MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_DEVICE, dev_num); + MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_FLAGS, flags); + ret = cdx_mcdi_rpc(cdx, MC_CMD_CDX_DEVICE_CONTROL_SET, inbuf, + sizeof(inbuf), NULL, 0, NULL); + + return ret; +} + +int cdx_mcdi_bus_master_enable(struct cdx_mcdi *cdx, u8 bus_num, + u8 dev_num, bool enable) +{ + return cdx_mcdi_ctrl_flag_set(cdx, bus_num, dev_num, enable, + MC_CMD_CDX_DEVICE_CONTROL_SET_IN_BUS_MASTER_ENABLE_LBN); +} diff --git a/drivers/cdx/controller/mcdi_functions.h b/drivers/cdx/controller/mcdi_functions.h index 7440ace5539a..a448d6581eb4 100644 --- a/drivers/cdx/controller/mcdi_functions.h +++ b/drivers/cdx/controller/mcdi_functions.h @@ -58,4 +58,17 @@ int cdx_mcdi_get_dev_config(struct cdx_mcdi *cdx, int cdx_mcdi_reset_device(struct cdx_mcdi *cdx, u8 bus_num, u8 dev_num); +/** + * cdx_mcdi_bus_master_enable - Set/Reset bus mastering for cdx device + * represented by bus_num:dev_num + * @cdx: pointer to MCDI interface. + * @bus_num: Bus number. + * @dev_num: Device number. + * @enable: Enable bus mastering if set, disable otherwise. + * + * Return: 0 on success, <0 on failure + */ +int cdx_mcdi_bus_master_enable(struct cdx_mcdi *cdx, u8 bus_num, + u8 dev_num, bool enable); + #endif /* CDX_MCDI_FUNCTIONS_H */ diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 42ce20e72db7..faf21be724c3 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1379,7 +1379,7 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, intel_gvt_reset_vgpu(vgpu); return 0; } else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) { - struct vfio_device_gfx_plane_info dmabuf; + struct vfio_device_gfx_plane_info dmabuf = {}; int ret = 0; minsz = offsetofend(struct vfio_device_gfx_plane_info, diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index de56686581ae..a63744302b5e 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -14,7 +14,7 @@ static int vfio_cdx_open_device(struct vfio_device *core_vdev) container_of(core_vdev, struct vfio_cdx_device, vdev); struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev); int count = cdx_dev->res_count; - int i; + int i, ret; vdev->regions = kcalloc(count, sizeof(struct vfio_cdx_region), GFP_KERNEL_ACCOUNT); @@ -39,6 +39,17 @@ static int vfio_cdx_open_device(struct vfio_device *core_vdev) if (!(cdx_dev->res[i].flags & IORESOURCE_READONLY)) vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_WRITE; } + ret = cdx_dev_reset(core_vdev->dev); + if (ret) { + kfree(vdev->regions); + vdev->regions = NULL; + return ret; + } + ret = cdx_clear_master(cdx_dev); + if (ret) + vdev->flags &= ~BME_SUPPORT; + else + vdev->flags |= BME_SUPPORT; return 0; } @@ -52,6 +63,49 @@ static void vfio_cdx_close_device(struct vfio_device *core_vdev) cdx_dev_reset(core_vdev->dev); } +static int vfio_cdx_bm_ctrl(struct vfio_device *core_vdev, u32 flags, + void __user *arg, size_t argsz) +{ + size_t minsz = + offsetofend(struct vfio_device_feature_bus_master, op); + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); + struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev); + struct vfio_device_feature_bus_master ops; + int ret; + + if (!(vdev->flags & BME_SUPPORT)) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, + sizeof(ops)); + if (ret != 1) + return ret; + + if (copy_from_user(&ops, arg, minsz)) + return -EFAULT; + + switch (ops.op) { + case VFIO_DEVICE_FEATURE_CLEAR_MASTER: + return cdx_clear_master(cdx_dev); + case VFIO_DEVICE_FEATURE_SET_MASTER: + return cdx_set_master(cdx_dev); + default: + return -EINVAL; + } +} + +static int vfio_cdx_ioctl_feature(struct vfio_device *device, u32 flags, + void __user *arg, size_t argsz) +{ + switch (flags & VFIO_DEVICE_FEATURE_MASK) { + case VFIO_DEVICE_FEATURE_BUS_MASTER: + return vfio_cdx_bm_ctrl(device, flags, arg, argsz); + default: + return -ENOTTY; + } +} + static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, struct vfio_device_info __user *arg) { @@ -169,6 +223,7 @@ static const struct vfio_device_ops vfio_cdx_ops = { .open_device = vfio_cdx_open_device, .close_device = vfio_cdx_close_device, .ioctl = vfio_cdx_ioctl, + .device_feature = vfio_cdx_ioctl_feature, .mmap = vfio_cdx_mmap, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h index 8bdc117ea88e..8e9d25913728 100644 --- a/drivers/vfio/cdx/private.h +++ b/drivers/vfio/cdx/private.h @@ -23,6 +23,8 @@ struct vfio_cdx_region { struct vfio_cdx_device { struct vfio_device vdev; struct vfio_cdx_region *regions; + u32 flags; +#define BME_SUPPORT BIT(0) }; #endif /* VFIO_CDX_PRIVATE_H */ diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 33574b04477d..efd1d252cdc9 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -86,7 +86,8 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) } int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size, u8 query_flags) + size_t *state_size, u64 *total_size, + u8 query_flags) { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; @@ -128,6 +129,7 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); MLX5_SET(query_vhca_migration_state_in, in, incremental, query_flags & MLX5VF_QUERY_INC); + MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode); ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); @@ -139,6 +141,11 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, *state_size = MLX5_GET(query_vhca_migration_state_out, out, required_umem_size); + if (total_size) + *total_size = mvdev->chunk_mode ? + MLX5_GET64(query_vhca_migration_state_out, out, + remaining_total_size) : *state_size; + return 0; } @@ -254,6 +261,9 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, mvdev->core_device.vdev.migration_flags |= VFIO_MIGRATION_PRE_COPY; + if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks)) + mvdev->chunk_mode = 1; + end: mlx5_vf_put_core_dev(mvdev->mdev); } @@ -428,6 +438,7 @@ end: void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) { spin_lock_irq(&buf->migf->list_lock); + buf->stop_copy_chunk_num = 0; list_add_tail(&buf->buf_elm, &buf->migf->avail_list); spin_unlock_irq(&buf->migf->list_lock); } @@ -475,6 +486,15 @@ found: return buf; } +static void +mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf, + struct mlx5vf_async_data *async_data) +{ + kvfree(async_data->out); + complete(&migf->save_comp); + fput(migf->filp); +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, @@ -487,16 +507,15 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mlx5vf_put_data_buffer(async_data->buf); if (async_data->header_buf) mlx5vf_put_data_buffer(async_data->header_buf); - if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) + if (!async_data->stop_copy_chunk && + async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR; else migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); - kvfree(async_data->out); - complete(&migf->save_comp); - fput(migf->filp); + mlx5vf_save_callback_complete(migf, async_data); } static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, @@ -536,13 +555,20 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { + size_t next_required_umem_size = 0; + bool stop_copy_last_chunk; size_t image_size; unsigned long flags; bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY && - !async_data->last_chunk; + !async_data->stop_copy_chunk; image_size = MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size); + if (async_data->buf->stop_copy_chunk_num) + next_required_umem_size = MLX5_GET(save_vhca_state_out, + async_data->out, next_required_umem_size); + stop_copy_last_chunk = async_data->stop_copy_chunk && + !next_required_umem_size; if (async_data->header_buf) { status = add_buf_header(async_data->header_buf, image_size, initial_pre_copy); @@ -554,19 +580,34 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) migf->max_pos += async_data->buf->length; spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); + if (async_data->buf->stop_copy_chunk_num) { + migf->num_ready_chunks++; + if (next_required_umem_size && + migf->num_ready_chunks >= MAX_NUM_CHUNKS) { + /* Delay the next SAVE till one chunk be consumed */ + migf->next_required_umem_size = next_required_umem_size; + next_required_umem_size = 0; + } + } spin_unlock_irqrestore(&migf->list_lock, flags); - if (initial_pre_copy) + if (initial_pre_copy) { migf->pre_copy_initial_bytes += image_size; - migf->state = async_data->last_chunk ? - MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY; + migf->state = MLX5_MIGF_STATE_PRE_COPY; + } + if (stop_copy_last_chunk) + migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); + if (next_required_umem_size) + mlx5vf_mig_file_set_save_work(migf, + /* Picking up the next chunk num */ + (async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1, + next_required_umem_size); + mlx5vf_save_callback_complete(migf, async_data); + return; } err: - /* - * The error and the cleanup flows can't run from an - * interrupt context - */ + /* The error flow can't run from an interrupt context */ if (status == -EREMOTEIO) status = MLX5_GET(save_vhca_state_out, async_data->out, status); async_data->status = status; @@ -610,7 +651,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, async_data = &migf->async_data; async_data->buf = buf; - async_data->last_chunk = !track; + async_data->stop_copy_chunk = !track; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; @@ -618,10 +659,15 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, } if (MLX5VF_PRE_COPY_SUPP(mvdev)) { - if (async_data->last_chunk && migf->buf_header) { - header_buf = migf->buf_header; - migf->buf_header = NULL; - } else { + if (async_data->stop_copy_chunk) { + u8 header_idx = buf->stop_copy_chunk_num ? + buf->stop_copy_chunk_num - 1 : 0; + + header_buf = migf->buf_header[header_idx]; + migf->buf_header[header_idx] = NULL; + } + + if (!header_buf) { header_buf = mlx5vf_get_data_buffer(migf, sizeof(struct mlx5_vf_migration_header), DMA_NONE); if (IS_ERR(header_buf)) { @@ -631,8 +677,8 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, } } - if (async_data->last_chunk) - migf->state = MLX5_MIGF_STATE_SAVE_LAST; + if (async_data->stop_copy_chunk) + migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK; async_data->header_buf = header_buf; get_file(migf->filp); @@ -707,18 +753,21 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) { struct mlx5_vhca_data_buffer *entry; + int i; lockdep_assert_held(&migf->mvdev->state_mutex); WARN_ON(migf->mvdev->mdev_detach); - if (migf->buf) { - mlx5vf_free_data_buffer(migf->buf); - migf->buf = NULL; - } + for (i = 0; i < MAX_NUM_CHUNKS; i++) { + if (migf->buf[i]) { + mlx5vf_free_data_buffer(migf->buf[i]); + migf->buf[i] = NULL; + } - if (migf->buf_header) { - mlx5vf_free_data_buffer(migf->buf_header); - migf->buf_header = NULL; + if (migf->buf_header[i]) { + mlx5vf_free_data_buffer(migf->buf_header[i]); + migf->buf_header[i] = NULL; + } } list_splice(&migf->avail_list, &migf->buf_list); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index aec4c69dd6c1..f2c7227fa683 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -20,7 +20,7 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, MLX5_MIGF_STATE_PRE_COPY_ERROR, MLX5_MIGF_STATE_PRE_COPY, - MLX5_MIGF_STATE_SAVE_LAST, + MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK, MLX5_MIGF_STATE_COMPLETE, }; @@ -64,6 +64,7 @@ struct mlx5_vhca_data_buffer { u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; + u8 stop_copy_chunk_num; struct list_head buf_elm; struct mlx5_vf_migration_file *migf; /* Optimize mlx5vf_get_migration_page() for sequential access */ @@ -78,10 +79,19 @@ struct mlx5vf_async_data { struct mlx5_vhca_data_buffer *buf; struct mlx5_vhca_data_buffer *header_buf; int status; - u8 last_chunk:1; + u8 stop_copy_chunk:1; void *out; }; +struct mlx5vf_save_work_data { + struct mlx5_vf_migration_file *migf; + size_t next_required_umem_size; + struct work_struct work; + u8 chunk_num; +}; + +#define MAX_NUM_CHUNKS 2 + struct mlx5_vf_migration_file { struct file *filp; struct mutex lock; @@ -94,8 +104,12 @@ struct mlx5_vf_migration_file { u32 record_tag; u64 stop_copy_prep_size; u64 pre_copy_initial_bytes; - struct mlx5_vhca_data_buffer *buf; - struct mlx5_vhca_data_buffer *buf_header; + size_t next_required_umem_size; + u8 num_ready_chunks; + /* Upon chunk mode preserve another set of buffers for stop_copy phase */ + struct mlx5_vhca_data_buffer *buf[MAX_NUM_CHUNKS]; + struct mlx5_vhca_data_buffer *buf_header[MAX_NUM_CHUNKS]; + struct mlx5vf_save_work_data save_data[MAX_NUM_CHUNKS]; spinlock_t list_lock; struct list_head buf_list; struct list_head avail_list; @@ -164,6 +178,7 @@ struct mlx5vf_pci_core_device { u8 deferred_reset:1; u8 mdev_detach:1; u8 log_active:1; + u8 chunk_mode:1; struct completion tracker_comp; /* protect migration state */ struct mutex state_mutex; @@ -186,7 +201,8 @@ enum { int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size, u8 query_flags); + size_t *state_size, u64 *total_size, + u8 query_flags); void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, const struct vfio_migration_ops *mig_ops, const struct vfio_log_ops *log_ops); @@ -217,6 +233,8 @@ struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); +void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf, + u8 chunk_num, size_t next_required_umem_size); int mlx5vf_start_page_tracker(struct vfio_device *vdev, struct rb_root_cached *ranges, u32 nnodes, u64 *page_size); int mlx5vf_stop_page_tracker(struct vfio_device *vdev); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 42ec574a8622..b6ac66c5008d 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -24,6 +24,8 @@ /* Device specification max LOAD size */ #define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1) +#define MAX_CHUNK_SIZE SZ_8M + static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) { struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); @@ -158,6 +160,41 @@ end: return found ? buf : NULL; } +static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf) +{ + struct mlx5_vf_migration_file *migf = vhca_buf->migf; + + if (vhca_buf->stop_copy_chunk_num) { + bool is_header = vhca_buf->dma_dir == DMA_NONE; + u8 chunk_num = vhca_buf->stop_copy_chunk_num; + size_t next_required_umem_size = 0; + + if (is_header) + migf->buf_header[chunk_num - 1] = vhca_buf; + else + migf->buf[chunk_num - 1] = vhca_buf; + + spin_lock_irq(&migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + if (!is_header) { + next_required_umem_size = + migf->next_required_umem_size; + migf->next_required_umem_size = 0; + migf->num_ready_chunks--; + } + spin_unlock_irq(&migf->list_lock); + if (next_required_umem_size) + mlx5vf_mig_file_set_save_work(migf, chunk_num, + next_required_umem_size); + return; + } + + spin_lock_irq(&migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); + spin_unlock_irq(&migf->list_lock); +} + static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, char __user **buf, size_t *len, loff_t *pos) { @@ -193,12 +230,8 @@ static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, copy_len -= page_len; } - if (*pos >= vhca_buf->start_pos + vhca_buf->length) { - spin_lock_irq(&vhca_buf->migf->list_lock); - list_del_init(&vhca_buf->buf_elm); - list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); - spin_unlock_irq(&vhca_buf->migf->list_lock); - } + if (*pos >= vhca_buf->start_pos + vhca_buf->length) + mlx5vf_buf_read_done(vhca_buf); return done; } @@ -304,7 +337,75 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) wake_up_interruptible(&migf->poll_wait); } -static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf) +void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf, + u8 chunk_num, size_t next_required_umem_size) +{ + migf->save_data[chunk_num - 1].next_required_umem_size = + next_required_umem_size; + migf->save_data[chunk_num - 1].migf = migf; + get_file(migf->filp); + queue_work(migf->mvdev->cb_wq, + &migf->save_data[chunk_num - 1].work); +} + +static struct mlx5_vhca_data_buffer * +mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf, + u8 index, size_t required_length) +{ + struct mlx5_vhca_data_buffer *buf = migf->buf[index]; + u8 chunk_num; + + WARN_ON(!buf); + chunk_num = buf->stop_copy_chunk_num; + buf->migf->buf[index] = NULL; + /* Checking whether the pre-allocated buffer can fit */ + if (buf->allocated_length >= required_length) + return buf; + + mlx5vf_put_data_buffer(buf); + buf = mlx5vf_get_data_buffer(buf->migf, required_length, + DMA_FROM_DEVICE); + if (IS_ERR(buf)) + return buf; + + buf->stop_copy_chunk_num = chunk_num; + return buf; +} + +static void mlx5vf_mig_file_save_work(struct work_struct *_work) +{ + struct mlx5vf_save_work_data *save_data = container_of(_work, + struct mlx5vf_save_work_data, work); + struct mlx5_vf_migration_file *migf = save_data->migf; + struct mlx5vf_pci_core_device *mvdev = migf->mvdev; + struct mlx5_vhca_data_buffer *buf; + + mutex_lock(&mvdev->state_mutex); + if (migf->state == MLX5_MIGF_STATE_ERROR) + goto end; + + buf = mlx5vf_mig_file_get_stop_copy_buf(migf, + save_data->chunk_num - 1, + save_data->next_required_umem_size); + if (IS_ERR(buf)) + goto err; + + if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false)) + goto err_save; + + goto end; + +err_save: + mlx5vf_put_data_buffer(buf); +err: + mlx5vf_mark_err(migf); +end: + mlx5vf_state_mutex_unlock(mvdev); + fput(migf->filp); +} + +static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf, + bool track) { size_t size = sizeof(struct mlx5_vf_migration_header) + sizeof(struct mlx5_vf_migration_tag_stop_copy_data); @@ -331,7 +432,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf) to_buff = kmap_local_page(page); memcpy(to_buff, &header, sizeof(header)); header_buf->length = sizeof(header); - data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length); + data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length); memcpy(to_buff + sizeof(header), &data, sizeof(data)); header_buf->length += sizeof(data); kunmap_local(to_buff); @@ -340,48 +441,86 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf) spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&header_buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); - migf->pre_copy_initial_bytes = size; + if (track) + migf->pre_copy_initial_bytes = size; return 0; err: mlx5vf_put_data_buffer(header_buf); return ret; } -static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf, - size_t state_size) +static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev, + struct mlx5_vf_migration_file *migf, + size_t state_size, u64 full_size, + bool track) { struct mlx5_vhca_data_buffer *buf; size_t inc_state_size; + int num_chunks; int ret; + int i; - /* let's be ready for stop_copy size that might grow by 10 percents */ - if (check_add_overflow(state_size, state_size / 10, &inc_state_size)) - inc_state_size = state_size; + if (mvdev->chunk_mode) { + size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size); - buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE); - if (IS_ERR(buf)) - return PTR_ERR(buf); + /* from firmware perspective at least 'state_size' buffer should be set */ + inc_state_size = max(state_size, chunk_size); + } else { + if (track) { + /* let's be ready for stop_copy size that might grow by 10 percents */ + if (check_add_overflow(state_size, state_size / 10, &inc_state_size)) + inc_state_size = state_size; + } else { + inc_state_size = state_size; + } + } - migf->buf = buf; - buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - goto err; + /* let's not overflow the device specification max SAVE size */ + inc_state_size = min_t(size_t, inc_state_size, + (BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE)); + + num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1; + for (i = 0; i < num_chunks; i++) { + buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + + migf->buf[i] = buf; + buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + migf->buf_header[i] = buf; + if (mvdev->chunk_mode) { + migf->buf[i]->stop_copy_chunk_num = i + 1; + migf->buf_header[i]->stop_copy_chunk_num = i + 1; + INIT_WORK(&migf->save_data[i].work, + mlx5vf_mig_file_save_work); + migf->save_data[i].chunk_num = i + 1; + } } - migf->buf_header = buf; - ret = mlx5vf_add_stop_copy_header(migf); + ret = mlx5vf_add_stop_copy_header(migf, track); if (ret) - goto err_header; + goto err; return 0; -err_header: - mlx5vf_put_data_buffer(migf->buf_header); - migf->buf_header = NULL; err: - mlx5vf_put_data_buffer(migf->buf); - migf->buf = NULL; + for (i = 0; i < num_chunks; i++) { + if (migf->buf[i]) { + mlx5vf_put_data_buffer(migf->buf[i]); + migf->buf[i] = NULL; + } + if (migf->buf_header[i]) { + mlx5vf_put_data_buffer(migf->buf_header[i]); + migf->buf_header[i] = NULL; + } + } + return ret; } @@ -428,7 +567,7 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, * As so, the other code below is safe with the proper locks. */ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length, - MLX5VF_QUERY_INC); + NULL, MLX5VF_QUERY_INC); if (ret) goto err_state_unlock; } @@ -505,21 +644,15 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) if (migf->state == MLX5_MIGF_STATE_ERROR) return -ENODEV; - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL, MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL); if (ret) goto err; - /* Checking whether we have a matching pre-allocated buffer that can fit */ - if (migf->buf && migf->buf->allocated_length >= length) { - buf = migf->buf; - migf->buf = NULL; - } else { - buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - goto err; - } + buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; } ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); @@ -541,6 +674,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) struct mlx5_vf_migration_file *migf; struct mlx5_vhca_data_buffer *buf; size_t length; + u64 full_size; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); @@ -574,20 +708,25 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) INIT_LIST_HEAD(&migf->buf_list); INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0); + if (ret) + goto out_pd; + + ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track); if (ret) goto out_pd; if (track) { - ret = mlx5vf_prep_stop_copy(migf, length); - if (ret) + /* leave the allocated buffer ready for the stop-copy phase */ + buf = mlx5vf_alloc_data_buffer(migf, + migf->buf[0]->allocated_length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); goto out_pd; - } - - buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - goto out_pd; + } + } else { + buf = migf->buf[0]; + migf->buf[0] = NULL; } ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track); @@ -820,8 +959,8 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; - struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; - struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0]; + struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0]; loff_t requested_length; bool has_work = false; ssize_t done = 0; @@ -856,15 +995,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, if (vhca_buf_header->allocated_length < migf->record_size) { mlx5vf_free_data_buffer(vhca_buf_header); - migf->buf_header = mlx5vf_alloc_data_buffer(migf, + migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf, migf->record_size, DMA_NONE); - if (IS_ERR(migf->buf_header)) { - ret = PTR_ERR(migf->buf_header); - migf->buf_header = NULL; + if (IS_ERR(migf->buf_header[0])) { + ret = PTR_ERR(migf->buf_header[0]); + migf->buf_header[0] = NULL; goto out_unlock; } - vhca_buf_header = migf->buf_header; + vhca_buf_header = migf->buf_header[0]; } vhca_buf_header->start_pos = migf->max_pos; @@ -884,15 +1023,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, if (vhca_buf->allocated_length < size) { mlx5vf_free_data_buffer(vhca_buf); - migf->buf = mlx5vf_alloc_data_buffer(migf, + migf->buf[0] = mlx5vf_alloc_data_buffer(migf, size, DMA_TO_DEVICE); - if (IS_ERR(migf->buf)) { - ret = PTR_ERR(migf->buf); - migf->buf = NULL; + if (IS_ERR(migf->buf[0])) { + ret = PTR_ERR(migf->buf[0]); + migf->buf[0] = NULL; goto out_unlock; } - vhca_buf = migf->buf; + vhca_buf = migf->buf[0]; } vhca_buf->start_pos = migf->max_pos; @@ -974,7 +1113,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) goto out_pd; } - migf->buf = buf; + migf->buf[0] = buf; if (MLX5VF_PRE_COPY_SUPP(mvdev)) { buf = mlx5vf_alloc_data_buffer(migf, sizeof(struct mlx5_vf_migration_header), DMA_NONE); @@ -983,7 +1122,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) goto out_buf; } - migf->buf_header = buf; + migf->buf_header[0] = buf; migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; } else { /* Initial state will be to read the image */ @@ -997,7 +1136,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) spin_lock_init(&migf->list_lock); return migf; out_buf: - mlx5vf_free_data_buffer(migf->buf); + mlx5vf_free_data_buffer(migf->buf[0]); out_pd: mlx5vf_cmd_dealloc_pd(migf); out_free: @@ -1019,6 +1158,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); cancel_work_sync(&mvdev->saving_migf->async_data.work); mlx5vf_disable_fd(mvdev->saving_migf); + wake_up_interruptible(&mvdev->saving_migf->poll_wait); mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); fput(mvdev->saving_migf->filp); mvdev->saving_migf = NULL; @@ -1100,7 +1240,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, if (!MLX5VF_PRE_COPY_SUPP(mvdev)) { ret = mlx5vf_cmd_load_vhca_state(mvdev, mvdev->resuming_migf, - mvdev->resuming_migf->buf); + mvdev->resuming_migf->buf[0]); if (ret) return ERR_PTR(ret); } @@ -1194,13 +1334,14 @@ static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, struct mlx5vf_pci_core_device *mvdev = container_of( vdev, struct mlx5vf_pci_core_device, core_device.vdev); size_t state_size; + u64 total_size; int ret; mutex_lock(&mvdev->state_mutex); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, - &state_size, 0); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size, + &total_size, 0); if (!ret) - *stop_copy_length = state_size; + *stop_copy_length = total_size; mlx5vf_state_mutex_unlock(mvdev); return ret; } diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 40732e8ed4c6..e31e1952d7b8 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -946,6 +946,11 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes, unsigned long last; comb_start = interval_tree_iter_first(root, 0, ULONG_MAX); + + /* Empty list */ + if (WARN_ON_ONCE(!comb_start)) + return; + curr = comb_start; while (curr) { last = curr->last; @@ -975,6 +980,11 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes, prev = curr; curr = interval_tree_iter_next(curr, 0, ULONG_MAX); } + + /* Empty list or no nodes to combine */ + if (WARN_ON_ONCE(min_gap == ULONG_MAX)) + break; + comb_start->last = comb_end->last; interval_tree_remove(comb_end, root); cur_nodes--; diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h index bead71b7bc73..8320ec3b9e37 100644 --- a/include/linux/cdx/cdx_bus.h +++ b/include/linux/cdx/cdx_bus.h @@ -21,11 +21,13 @@ struct cdx_controller; enum { + CDX_DEV_BUS_MASTER_CONF, CDX_DEV_RESET_CONF, }; struct cdx_device_config { u8 type; + bool bus_master_enable; }; typedef int (*cdx_scan_cb)(struct cdx_controller *cdx); @@ -170,4 +172,20 @@ extern struct bus_type cdx_bus_type; */ int cdx_dev_reset(struct device *dev); +/** + * cdx_set_master - enables bus-mastering for CDX device + * @cdx_dev: the CDX device to enable + * + * Return: 0 for success, -errno on failure + */ +int cdx_set_master(struct cdx_device *cdx_dev); + +/** + * cdx_clear_master - disables bus-mastering for CDX device + * @cdx_dev: the CDX device to disable + * + * Return: 0 for success, -errno on failure + */ +int cdx_clear_master(struct cdx_device *cdx_dev); + #endif /* _CDX_BUS_H_ */ diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index afc1369216d9..7f5fb010226d 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -277,8 +277,8 @@ struct vfio_region_info { #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ __u32 index; /* Region index */ __u32 cap_offset; /* Offset within info struct of first cap */ - __u64 size; /* Region size (bytes) */ - __u64 offset; /* Region offset from start of device fd */ + __aligned_u64 size; /* Region size (bytes) */ + __aligned_u64 offset; /* Region offset from start of device fd */ }; #define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8) @@ -294,8 +294,8 @@ struct vfio_region_info { #define VFIO_REGION_INFO_CAP_SPARSE_MMAP 1 struct vfio_region_sparse_mmap_area { - __u64 offset; /* Offset of mmap'able area within region */ - __u64 size; /* Size of mmap'able area */ + __aligned_u64 offset; /* Offset of mmap'able area within region */ + __aligned_u64 size; /* Size of mmap'able area */ }; struct vfio_region_info_cap_sparse_mmap { @@ -450,9 +450,9 @@ struct vfio_device_migration_info { VFIO_DEVICE_STATE_V1_RESUMING) __u32 reserved; - __u64 pending_bytes; - __u64 data_offset; - __u64 data_size; + __aligned_u64 pending_bytes; + __aligned_u64 data_offset; + __aligned_u64 data_size; }; /* @@ -476,7 +476,7 @@ struct vfio_device_migration_info { struct vfio_region_info_cap_nvlink2_ssatgt { struct vfio_info_cap_header header; - __u64 tgt; + __aligned_u64 tgt; }; /* @@ -816,7 +816,7 @@ struct vfio_device_gfx_plane_info { __u32 drm_plane_type; /* type of plane: DRM_PLANE_TYPE_* */ /* out */ __u32 drm_format; /* drm format of plane */ - __u64 drm_format_mod; /* tiled mode */ + __aligned_u64 drm_format_mod; /* tiled mode */ __u32 width; /* width of plane */ __u32 height; /* height of plane */ __u32 stride; /* stride of plane */ @@ -829,6 +829,7 @@ struct vfio_device_gfx_plane_info { __u32 region_index; /* region index */ __u32 dmabuf_id; /* dma-buf id */ }; + __u32 reserved; }; #define VFIO_DEVICE_QUERY_GFX_PLANE _IO(VFIO_TYPE, VFIO_BASE + 14) @@ -863,9 +864,10 @@ struct vfio_device_ioeventfd { #define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */ #define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */ #define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf) - __u64 offset; /* device fd offset of write */ - __u64 data; /* data to be written */ + __aligned_u64 offset; /* device fd offset of write */ + __aligned_u64 data; /* data to be written */ __s32 fd; /* -1 for de-assignment */ + __u32 reserved; }; #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) @@ -1434,6 +1436,27 @@ struct vfio_device_feature_mig_data_size { #define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9 +/** + * Upon VFIO_DEVICE_FEATURE_SET, set or clear the BUS mastering for the device + * based on the operation specified in op flag. + * + * The functionality is incorporated for devices that needs bus master control, + * but the in-band device interface lacks the support. Consequently, it is not + * applicable to PCI devices, as bus master control for PCI devices is managed + * in-band through the configuration space. At present, this feature is supported + * only for CDX devices. + * When the device's BUS MASTER setting is configured as CLEAR, it will result in + * blocking all incoming DMA requests from the device. On the other hand, configuring + * the device's BUS MASTER setting as SET (enable) will grant the device the + * capability to perform DMA to the host memory. + */ +struct vfio_device_feature_bus_master { + __u32 op; +#define VFIO_DEVICE_FEATURE_CLEAR_MASTER 0 /* Clear Bus Master */ +#define VFIO_DEVICE_FEATURE_SET_MASTER 1 /* Set Bus Master */ +}; +#define VFIO_DEVICE_FEATURE_BUS_MASTER 10 + /* -------- API for Type1 VFIO IOMMU -------- */ /** @@ -1449,7 +1472,7 @@ struct vfio_iommu_type1_info { __u32 flags; #define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */ #define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */ - __u64 iova_pgsizes; /* Bitmap of supported page sizes */ + __aligned_u64 iova_pgsizes; /* Bitmap of supported page sizes */ __u32 cap_offset; /* Offset within info struct of first cap */ __u32 pad; }; diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 3764d1911b51..93405264ff23 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1262,7 +1262,7 @@ static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, case VFIO_DEVICE_QUERY_GFX_PLANE: { - struct vfio_device_gfx_plane_info plane; + struct vfio_device_gfx_plane_info plane = {}; minsz = offsetofend(struct vfio_device_gfx_plane_info, region_index); diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 064e1c0a7aa8..72ea5832c927 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -591,7 +591,7 @@ static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd, case VFIO_DEVICE_QUERY_GFX_PLANE: { - struct vfio_device_gfx_plane_info plane; + struct vfio_device_gfx_plane_info plane = {}; minsz = offsetofend(struct vfio_device_gfx_plane_info, region_index); diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 5af00387c519..69ba0281f9e0 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -29,6 +29,8 @@ #include <linux/serial.h> #include <uapi/linux/serial_reg.h> #include <linux/eventfd.h> +#include <linux/anon_inodes.h> + /* * #defines */ @@ -124,10 +126,32 @@ struct serial_port { u8 intr_trigger_level; /* interrupt trigger level */ }; +struct mtty_data { + u64 magic; +#define MTTY_MAGIC 0x7e9d09898c3e2c4e /* Nothing clever, just random */ + u32 major_ver; +#define MTTY_MAJOR_VER 1 + u32 minor_ver; +#define MTTY_MINOR_VER 0 + u32 nr_ports; + u32 flags; + struct serial_port ports[2]; +}; + +struct mdev_state; + +struct mtty_migration_file { + struct file *filp; + struct mutex lock; + struct mdev_state *mdev_state; + struct mtty_data data; + ssize_t filled_size; + u8 disabled:1; +}; + /* State of each mdev device */ struct mdev_state { struct vfio_device vdev; - int irq_fd; struct eventfd_ctx *intx_evtfd; struct eventfd_ctx *msi_evtfd; int irq_index; @@ -141,6 +165,13 @@ struct mdev_state { struct mutex rxtx_lock; struct vfio_device_info dev_info; int nr_ports; + enum vfio_device_mig_state state; + struct mutex state_mutex; + struct mutex reset_mutex; + struct mtty_migration_file *saving_migf; + struct mtty_migration_file *resuming_migf; + u8 deferred_reset:1; + u8 intx_mask:1; }; static struct mtty_type { @@ -166,10 +197,6 @@ static const struct file_operations vd_fops = { static const struct vfio_device_ops mtty_dev_ops; -/* function prototypes */ - -static int mtty_trigger_interrupt(struct mdev_state *mdev_state); - /* Helper functions */ static void dump_buffer(u8 *buf, uint32_t count) @@ -186,6 +213,36 @@ static void dump_buffer(u8 *buf, uint32_t count) #endif } +static bool is_intx(struct mdev_state *mdev_state) +{ + return mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX; +} + +static bool is_msi(struct mdev_state *mdev_state) +{ + return mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX; +} + +static bool is_noirq(struct mdev_state *mdev_state) +{ + return !is_intx(mdev_state) && !is_msi(mdev_state); +} + +static void mtty_trigger_interrupt(struct mdev_state *mdev_state) +{ + lockdep_assert_held(&mdev_state->ops_lock); + + if (is_msi(mdev_state)) { + if (mdev_state->msi_evtfd) + eventfd_signal(mdev_state->msi_evtfd, 1); + } else if (is_intx(mdev_state)) { + if (mdev_state->intx_evtfd && !mdev_state->intx_mask) { + eventfd_signal(mdev_state->intx_evtfd, 1); + mdev_state->intx_mask = true; + } + } +} + static void mtty_create_config_space(struct mdev_state *mdev_state) { /* PCI dev ID */ @@ -717,6 +774,543 @@ accessfailed: return ret; } +static size_t mtty_data_size(struct mdev_state *mdev_state) +{ + return offsetof(struct mtty_data, ports) + + (mdev_state->nr_ports * sizeof(struct serial_port)); +} + +static void mtty_disable_file(struct mtty_migration_file *migf) +{ + mutex_lock(&migf->lock); + migf->disabled = true; + migf->filled_size = 0; + migf->filp->f_pos = 0; + mutex_unlock(&migf->lock); +} + +static void mtty_disable_files(struct mdev_state *mdev_state) +{ + if (mdev_state->saving_migf) { + mtty_disable_file(mdev_state->saving_migf); + fput(mdev_state->saving_migf->filp); + mdev_state->saving_migf = NULL; + } + + if (mdev_state->resuming_migf) { + mtty_disable_file(mdev_state->resuming_migf); + fput(mdev_state->resuming_migf->filp); + mdev_state->resuming_migf = NULL; + } +} + +static void mtty_state_mutex_unlock(struct mdev_state *mdev_state) +{ +again: + mutex_lock(&mdev_state->reset_mutex); + if (mdev_state->deferred_reset) { + mdev_state->deferred_reset = false; + mutex_unlock(&mdev_state->reset_mutex); + mdev_state->state = VFIO_DEVICE_STATE_RUNNING; + mtty_disable_files(mdev_state); + goto again; + } + mutex_unlock(&mdev_state->state_mutex); + mutex_unlock(&mdev_state->reset_mutex); +} + +static int mtty_release_migf(struct inode *inode, struct file *filp) +{ + struct mtty_migration_file *migf = filp->private_data; + + mtty_disable_file(migf); + mutex_destroy(&migf->lock); + kfree(migf); + + return 0; +} + +static long mtty_precopy_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct mtty_migration_file *migf = filp->private_data; + struct mdev_state *mdev_state = migf->mdev_state; + loff_t *pos = &filp->f_pos; + struct vfio_precopy_info info = {}; + unsigned long minsz; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&mdev_state->state_mutex); + if (mdev_state->state != VFIO_DEVICE_STATE_PRE_COPY && + mdev_state->state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { + ret = -EINVAL; + goto unlock; + } + + mutex_lock(&migf->lock); + + if (migf->disabled) { + mutex_unlock(&migf->lock); + ret = -ENODEV; + goto unlock; + } + + if (*pos > migf->filled_size) { + mutex_unlock(&migf->lock); + ret = -EINVAL; + goto unlock; + } + + info.dirty_bytes = 0; + info.initial_bytes = migf->filled_size - *pos; + mutex_unlock(&migf->lock); + + ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +unlock: + mtty_state_mutex_unlock(mdev_state); + return ret; +} + +static ssize_t mtty_save_read(struct file *filp, char __user *buf, + size_t len, loff_t *pos) +{ + struct mtty_migration_file *migf = filp->private_data; + ssize_t ret = 0; + + if (pos) + return -ESPIPE; + + pos = &filp->f_pos; + + mutex_lock(&migf->lock); + + dev_dbg(migf->mdev_state->vdev.dev, "%s ask %zu\n", __func__, len); + + if (migf->disabled) { + ret = -ENODEV; + goto out_unlock; + } + + if (*pos > migf->filled_size) { + ret = -EINVAL; + goto out_unlock; + } + + len = min_t(size_t, migf->filled_size - *pos, len); + if (len) { + if (copy_to_user(buf, (void *)&migf->data + *pos, len)) { + ret = -EFAULT; + goto out_unlock; + } + *pos += len; + ret = len; + } +out_unlock: + dev_dbg(migf->mdev_state->vdev.dev, "%s read %zu\n", __func__, ret); + mutex_unlock(&migf->lock); + return ret; +} + +static const struct file_operations mtty_save_fops = { + .owner = THIS_MODULE, + .read = mtty_save_read, + .unlocked_ioctl = mtty_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .release = mtty_release_migf, + .llseek = no_llseek, +}; + +static void mtty_save_state(struct mdev_state *mdev_state) +{ + struct mtty_migration_file *migf = mdev_state->saving_migf; + int i; + + mutex_lock(&migf->lock); + for (i = 0; i < mdev_state->nr_ports; i++) { + memcpy(&migf->data.ports[i], + &mdev_state->s[i], sizeof(struct serial_port)); + migf->filled_size += sizeof(struct serial_port); + } + dev_dbg(mdev_state->vdev.dev, + "%s filled to %zu\n", __func__, migf->filled_size); + mutex_unlock(&migf->lock); +} + +static int mtty_load_state(struct mdev_state *mdev_state) +{ + struct mtty_migration_file *migf = mdev_state->resuming_migf; + int i; + + mutex_lock(&migf->lock); + /* magic and version already tested by resume write fn */ + if (migf->filled_size < mtty_data_size(mdev_state)) { + dev_dbg(mdev_state->vdev.dev, "%s expected %zu, got %zu\n", + __func__, mtty_data_size(mdev_state), + migf->filled_size); + mutex_unlock(&migf->lock); + return -EINVAL; + } + + for (i = 0; i < mdev_state->nr_ports; i++) + memcpy(&mdev_state->s[i], + &migf->data.ports[i], sizeof(struct serial_port)); + + mutex_unlock(&migf->lock); + return 0; +} + +static struct mtty_migration_file * +mtty_save_device_data(struct mdev_state *mdev_state, + enum vfio_device_mig_state state) +{ + struct mtty_migration_file *migf = mdev_state->saving_migf; + struct mtty_migration_file *ret = NULL; + + if (migf) { + if (state == VFIO_DEVICE_STATE_STOP_COPY) + goto fill_data; + return ret; + } + + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("mtty_mig", &mtty_save_fops, + migf, O_RDONLY); + if (IS_ERR(migf->filp)) { + int rc = PTR_ERR(migf->filp); + + kfree(migf); + return ERR_PTR(rc); + } + + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + migf->mdev_state = mdev_state; + + migf->data.magic = MTTY_MAGIC; + migf->data.major_ver = MTTY_MAJOR_VER; + migf->data.minor_ver = MTTY_MINOR_VER; + migf->data.nr_ports = mdev_state->nr_ports; + + migf->filled_size = offsetof(struct mtty_data, ports); + + dev_dbg(mdev_state->vdev.dev, "%s filled header to %zu\n", + __func__, migf->filled_size); + + ret = mdev_state->saving_migf = migf; + +fill_data: + if (state == VFIO_DEVICE_STATE_STOP_COPY) + mtty_save_state(mdev_state); + + return ret; +} + +static ssize_t mtty_resume_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct mtty_migration_file *migf = filp->private_data; + struct mdev_state *mdev_state = migf->mdev_state; + loff_t requested_length; + ssize_t ret = 0; + + if (pos) + return -ESPIPE; + + pos = &filp->f_pos; + + if (*pos < 0 || + check_add_overflow((loff_t)len, *pos, &requested_length)) + return -EINVAL; + + if (requested_length > mtty_data_size(mdev_state)) + return -ENOMEM; + + mutex_lock(&migf->lock); + + if (migf->disabled) { + ret = -ENODEV; + goto out_unlock; + } + + if (copy_from_user((void *)&migf->data + *pos, buf, len)) { + ret = -EFAULT; + goto out_unlock; + } + + *pos += len; + ret = len; + + dev_dbg(migf->mdev_state->vdev.dev, "%s received %zu, total %zu\n", + __func__, len, migf->filled_size + len); + + if (migf->filled_size < offsetof(struct mtty_data, ports) && + migf->filled_size + len >= offsetof(struct mtty_data, ports)) { + if (migf->data.magic != MTTY_MAGIC || migf->data.flags || + migf->data.major_ver != MTTY_MAJOR_VER || + migf->data.minor_ver != MTTY_MINOR_VER || + migf->data.nr_ports != mdev_state->nr_ports) { + dev_dbg(migf->mdev_state->vdev.dev, + "%s failed validation\n", __func__); + ret = -EFAULT; + } else { + dev_dbg(migf->mdev_state->vdev.dev, + "%s header validated\n", __func__); + } + } + + migf->filled_size += len; + +out_unlock: + mutex_unlock(&migf->lock); + return ret; +} + +static const struct file_operations mtty_resume_fops = { + .owner = THIS_MODULE, + .write = mtty_resume_write, + .release = mtty_release_migf, + .llseek = no_llseek, +}; + +static struct mtty_migration_file * +mtty_resume_device_data(struct mdev_state *mdev_state) +{ + struct mtty_migration_file *migf; + int ret; + + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("mtty_mig", &mtty_resume_fops, + migf, O_WRONLY); + if (IS_ERR(migf->filp)) { + ret = PTR_ERR(migf->filp); + kfree(migf); + return ERR_PTR(ret); + } + + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + migf->mdev_state = mdev_state; + + mdev_state->resuming_migf = migf; + + return migf; +} + +static struct file *mtty_step_state(struct mdev_state *mdev_state, + enum vfio_device_mig_state new) +{ + enum vfio_device_mig_state cur = mdev_state->state; + + dev_dbg(mdev_state->vdev.dev, "%s: %d -> %d\n", __func__, cur, new); + + /* + * The following state transitions are no-op considering + * mtty does not do DMA nor require any explicit start/stop. + * + * RUNNING -> RUNNING_P2P + * RUNNING_P2P -> RUNNING + * RUNNING_P2P -> STOP + * PRE_COPY -> PRE_COPY_P2P + * PRE_COPY_P2P -> PRE_COPY + * STOP -> RUNNING_P2P + */ + if ((cur == VFIO_DEVICE_STATE_RUNNING && + new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && + (new == VFIO_DEVICE_STATE_RUNNING || + new == VFIO_DEVICE_STATE_STOP)) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && + new == VFIO_DEVICE_STATE_PRE_COPY_P2P) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_STOP && + new == VFIO_DEVICE_STATE_RUNNING_P2P)) + return NULL; + + /* + * The following state transitions simply close migration files, + * with the exception of RESUMING -> STOP, which needs to load + * the state first. + * + * RESUMING -> STOP + * PRE_COPY -> RUNNING + * PRE_COPY_P2P -> RUNNING_P2P + * STOP_COPY -> STOP + */ + if (cur == VFIO_DEVICE_STATE_RESUMING && + new == VFIO_DEVICE_STATE_STOP) { + int ret; + + ret = mtty_load_state(mdev_state); + if (ret) + return ERR_PTR(ret); + mtty_disable_files(mdev_state); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_PRE_COPY && + new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_STOP_COPY && + new == VFIO_DEVICE_STATE_STOP)) { + mtty_disable_files(mdev_state); + return NULL; + } + + /* + * The following state transitions return migration files. + * + * RUNNING -> PRE_COPY + * RUNNING_P2P -> PRE_COPY_P2P + * STOP -> STOP_COPY + * STOP -> RESUMING + * PRE_COPY_P2P -> STOP_COPY + */ + if ((cur == VFIO_DEVICE_STATE_RUNNING && + new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && + new == VFIO_DEVICE_STATE_PRE_COPY_P2P) || + (cur == VFIO_DEVICE_STATE_STOP && + new == VFIO_DEVICE_STATE_STOP_COPY) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_STOP_COPY)) { + struct mtty_migration_file *migf; + + migf = mtty_save_device_data(mdev_state, new); + if (IS_ERR(migf)) + return ERR_CAST(migf); + + if (migf) { + get_file(migf->filp); + + return migf->filp; + } + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && + new == VFIO_DEVICE_STATE_RESUMING) { + struct mtty_migration_file *migf; + + migf = mtty_resume_device_data(mdev_state); + if (IS_ERR(migf)) + return ERR_CAST(migf); + + get_file(migf->filp); + + return migf->filp; + } + + /* vfio_mig_get_next_state() does not use arcs other than the above */ + WARN_ON(true); + return ERR_PTR(-EINVAL); +} + +static struct file *mtty_set_state(struct vfio_device *vdev, + enum vfio_device_mig_state new_state) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct file *ret = NULL; + + dev_dbg(vdev->dev, "%s -> %d\n", __func__, new_state); + + mutex_lock(&mdev_state->state_mutex); + while (mdev_state->state != new_state) { + enum vfio_device_mig_state next_state; + int rc = vfio_mig_get_next_state(vdev, mdev_state->state, + new_state, &next_state); + if (rc) { + ret = ERR_PTR(rc); + break; + } + + ret = mtty_step_state(mdev_state, next_state); + if (IS_ERR(ret)) + break; + + mdev_state->state = next_state; + + if (WARN_ON(ret && new_state != next_state)) { + fput(ret); + ret = ERR_PTR(-EINVAL); + break; + } + } + mtty_state_mutex_unlock(mdev_state); + return ret; +} + +static int mtty_get_state(struct vfio_device *vdev, + enum vfio_device_mig_state *current_state) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + mutex_lock(&mdev_state->state_mutex); + *current_state = mdev_state->state; + mtty_state_mutex_unlock(mdev_state); + return 0; +} + +static int mtty_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + *stop_copy_length = mtty_data_size(mdev_state); + return 0; +} + +static const struct vfio_migration_ops mtty_migration_ops = { + .migration_set_state = mtty_set_state, + .migration_get_state = mtty_get_state, + .migration_get_data_size = mtty_get_data_size, +}; + +static int mtty_log_start(struct vfio_device *vdev, + struct rb_root_cached *ranges, + u32 nnodes, u64 *page_size) +{ + return 0; +} + +static int mtty_log_stop(struct vfio_device *vdev) +{ + return 0; +} + +static int mtty_log_read_and_clear(struct vfio_device *vdev, + unsigned long iova, unsigned long length, + struct iova_bitmap *dirty) +{ + return 0; +} + +static const struct vfio_log_ops mtty_log_ops = { + .log_start = mtty_log_start, + .log_stop = mtty_log_stop, + .log_read_and_clear = mtty_log_read_and_clear, +}; + static int mtty_init_dev(struct vfio_device *vdev) { struct mdev_state *mdev_state = @@ -749,6 +1343,16 @@ static int mtty_init_dev(struct vfio_device *vdev) mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; mtty_create_config_space(mdev_state); + + mutex_init(&mdev_state->state_mutex); + mutex_init(&mdev_state->reset_mutex); + vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | + VFIO_MIGRATION_P2P | + VFIO_MIGRATION_PRE_COPY; + vdev->mig_ops = &mtty_migration_ops; + vdev->log_ops = &mtty_log_ops; + mdev_state->state = VFIO_DEVICE_STATE_RUNNING; + return 0; err_nr_ports: @@ -782,6 +1386,8 @@ static void mtty_release_dev(struct vfio_device *vdev) struct mdev_state *mdev_state = container_of(vdev, struct mdev_state, vdev); + mutex_destroy(&mdev_state->reset_mutex); + mutex_destroy(&mdev_state->state_mutex); atomic_add(mdev_state->nr_ports, &mdev_avail_ports); kfree(mdev_state->vconfig); } @@ -798,6 +1404,15 @@ static int mtty_reset(struct mdev_state *mdev_state) { pr_info("%s: called\n", __func__); + mutex_lock(&mdev_state->reset_mutex); + mdev_state->deferred_reset = true; + if (!mutex_trylock(&mdev_state->state_mutex)) { + mutex_unlock(&mdev_state->reset_mutex); + return 0; + } + mutex_unlock(&mdev_state->reset_mutex); + mtty_state_mutex_unlock(mdev_state); + return 0; } @@ -921,6 +1536,25 @@ write_err: return -EFAULT; } +static void mtty_disable_intx(struct mdev_state *mdev_state) +{ + if (mdev_state->intx_evtfd) { + eventfd_ctx_put(mdev_state->intx_evtfd); + mdev_state->intx_evtfd = NULL; + mdev_state->intx_mask = false; + mdev_state->irq_index = -1; + } +} + +static void mtty_disable_msi(struct mdev_state *mdev_state) +{ + if (mdev_state->msi_evtfd) { + eventfd_ctx_put(mdev_state->msi_evtfd); + mdev_state->msi_evtfd = NULL; + mdev_state->irq_index = -1; + } +} + static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags, unsigned int index, unsigned int start, unsigned int count, void *data) @@ -932,59 +1566,113 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags, case VFIO_PCI_INTX_IRQ_INDEX: switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { case VFIO_IRQ_SET_ACTION_MASK: + if (!is_intx(mdev_state) || start != 0 || count != 1) { + ret = -EINVAL; + break; + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + mdev_state->intx_mask = true; + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t mask = *(uint8_t *)data; + + if (mask) + mdev_state->intx_mask = true; + } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + ret = -ENOTTY; /* No support for mask fd */ + } + break; case VFIO_IRQ_SET_ACTION_UNMASK: + if (!is_intx(mdev_state) || start != 0 || count != 1) { + ret = -EINVAL; + break; + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + mdev_state->intx_mask = false; + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t mask = *(uint8_t *)data; + + if (mask) + mdev_state->intx_mask = false; + } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + ret = -ENOTTY; /* No support for unmask fd */ + } break; case VFIO_IRQ_SET_ACTION_TRIGGER: - { - if (flags & VFIO_IRQ_SET_DATA_NONE) { - pr_info("%s: disable INTx\n", __func__); - if (mdev_state->intx_evtfd) - eventfd_ctx_put(mdev_state->intx_evtfd); + if (is_intx(mdev_state) && !count && + (flags & VFIO_IRQ_SET_DATA_NONE)) { + mtty_disable_intx(mdev_state); + break; + } + + if (!(is_intx(mdev_state) || is_noirq(mdev_state)) || + start != 0 || count != 1) { + ret = -EINVAL; break; } if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { int fd = *(int *)data; + struct eventfd_ctx *evt; - if (fd > 0) { - struct eventfd_ctx *evt; - - evt = eventfd_ctx_fdget(fd); - if (IS_ERR(evt)) { - ret = PTR_ERR(evt); - break; - } - mdev_state->intx_evtfd = evt; - mdev_state->irq_fd = fd; - mdev_state->irq_index = index; + mtty_disable_intx(mdev_state); + + if (fd < 0) + break; + + evt = eventfd_ctx_fdget(fd); + if (IS_ERR(evt)) { + ret = PTR_ERR(evt); break; } + mdev_state->intx_evtfd = evt; + mdev_state->irq_index = index; + break; + } + + if (!is_intx(mdev_state)) { + ret = -EINVAL; + break; + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + mtty_trigger_interrupt(mdev_state); + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t trigger = *(uint8_t *)data; + + if (trigger) + mtty_trigger_interrupt(mdev_state); } break; } - } break; case VFIO_PCI_MSI_IRQ_INDEX: switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { case VFIO_IRQ_SET_ACTION_MASK: case VFIO_IRQ_SET_ACTION_UNMASK: + ret = -ENOTTY; break; case VFIO_IRQ_SET_ACTION_TRIGGER: - if (flags & VFIO_IRQ_SET_DATA_NONE) { - if (mdev_state->msi_evtfd) - eventfd_ctx_put(mdev_state->msi_evtfd); - pr_info("%s: disable MSI\n", __func__); - mdev_state->irq_index = VFIO_PCI_INTX_IRQ_INDEX; + if (is_msi(mdev_state) && !count && + (flags & VFIO_IRQ_SET_DATA_NONE)) { + mtty_disable_msi(mdev_state); + break; + } + + if (!(is_msi(mdev_state) || is_noirq(mdev_state)) || + start != 0 || count != 1) { + ret = -EINVAL; break; } + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { int fd = *(int *)data; struct eventfd_ctx *evt; - if (fd <= 0) - break; + mtty_disable_msi(mdev_state); - if (mdev_state->msi_evtfd) + if (fd < 0) break; evt = eventfd_ctx_fdget(fd); @@ -993,20 +1681,37 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags, break; } mdev_state->msi_evtfd = evt; - mdev_state->irq_fd = fd; mdev_state->irq_index = index; + break; + } + + if (!is_msi(mdev_state)) { + ret = -EINVAL; + break; + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + mtty_trigger_interrupt(mdev_state); + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t trigger = *(uint8_t *)data; + + if (trigger) + mtty_trigger_interrupt(mdev_state); } break; - } - break; + } + break; case VFIO_PCI_MSIX_IRQ_INDEX: - pr_info("%s: MSIX_IRQ\n", __func__); + dev_dbg(mdev_state->vdev.dev, "%s: MSIX_IRQ\n", __func__); + ret = -ENOTTY; break; case VFIO_PCI_ERR_IRQ_INDEX: - pr_info("%s: ERR_IRQ\n", __func__); + dev_dbg(mdev_state->vdev.dev, "%s: ERR_IRQ\n", __func__); + ret = -ENOTTY; break; case VFIO_PCI_REQ_IRQ_INDEX: - pr_info("%s: REQ_IRQ\n", __func__); + dev_dbg(mdev_state->vdev.dev, "%s: REQ_IRQ\n", __func__); + ret = -ENOTTY; break; } @@ -1014,33 +1719,6 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags, return ret; } -static int mtty_trigger_interrupt(struct mdev_state *mdev_state) -{ - int ret = -1; - - if ((mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) && - (!mdev_state->msi_evtfd)) - return -EINVAL; - else if ((mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX) && - (!mdev_state->intx_evtfd)) { - pr_info("%s: Intr eventfd not found\n", __func__); - return -EINVAL; - } - - if (mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) - ret = eventfd_signal(mdev_state->msi_evtfd, 1); - else - ret = eventfd_signal(mdev_state->intx_evtfd, 1); - -#if defined(DEBUG_INTR) - pr_info("Intx triggered\n"); -#endif - if (ret != 1) - pr_err("%s: eventfd signal failed (%d)\n", __func__, ret); - - return ret; -} - static int mtty_get_region_info(struct mdev_state *mdev_state, struct vfio_region_info *region_info, u16 *cap_type_id, void **cap_type) @@ -1084,22 +1762,16 @@ static int mtty_get_region_info(struct mdev_state *mdev_state, static int mtty_get_irq_info(struct vfio_irq_info *irq_info) { - switch (irq_info->index) { - case VFIO_PCI_INTX_IRQ_INDEX: - case VFIO_PCI_MSI_IRQ_INDEX: - case VFIO_PCI_REQ_IRQ_INDEX: - break; - - default: + if (irq_info->index != VFIO_PCI_INTX_IRQ_INDEX && + irq_info->index != VFIO_PCI_MSI_IRQ_INDEX) return -EINVAL; - } irq_info->flags = VFIO_IRQ_INFO_EVENTFD; irq_info->count = 1; if (irq_info->index == VFIO_PCI_INTX_IRQ_INDEX) - irq_info->flags |= (VFIO_IRQ_INFO_MASKABLE | - VFIO_IRQ_INFO_AUTOMASKED); + irq_info->flags |= VFIO_IRQ_INFO_MASKABLE | + VFIO_IRQ_INFO_AUTOMASKED; else irq_info->flags |= VFIO_IRQ_INFO_NORESIZE; @@ -1262,6 +1934,16 @@ static unsigned int mtty_get_available(struct mdev_type *mtype) return atomic_read(&mdev_avail_ports) / type->nr_ports; } +static void mtty_close(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + mtty_disable_files(mdev_state); + mtty_disable_intx(mdev_state); + mtty_disable_msi(mdev_state); +} + static const struct vfio_device_ops mtty_dev_ops = { .name = "vfio-mtty", .init = mtty_init_dev, @@ -1273,6 +1955,7 @@ static const struct vfio_device_ops mtty_dev_ops = { .unbind_iommufd = vfio_iommufd_emulated_unbind, .attach_ioas = vfio_iommufd_emulated_attach_ioas, .detach_ioas = vfio_iommufd_emulated_detach_ioas, + .close_device = mtty_close, }; static struct mdev_driver mtty_driver = { |