summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-11-01 13:55:40 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2023-11-01 13:55:40 -1000
commitdeefd5024f0772cf56052ace9a8c347dc70bcaf3 (patch)
tree3e0ff80b073c18c45103c5acbd318e62567cb989
parent009fbfc97b6367762efa257f1478ec86d37949f9 (diff)
parent2b88119e35b00d8cb418d86abbace3b90a993bd7 (diff)
Merge tag 'vfio-v6.7-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson: - Add support for "chunk mode" in the mlx5-vfio-pci variant driver, which allows both larger device image sizes for migration, beyond the previous 4GB limit, and also read-ahead support for improved migration performance (Yishai Hadas) - A new bus master control interface for the CDX bus driver where there is no in-band mechanism to toggle device DMA as there is through config space on PCI devices (Nipun Gupta) - Add explicit alignment directives to vfio data structures to reduce the chance of breaking 32-bit userspace. In most cases this is transparent and the remaining cases where data structures are padded work within the existing rules for extending data structures within vfio (Stefan Hajnoczi) - Resolve a bug in the cdx bus driver noted when compiled with clang where missing parenthesis result in the wrong operation (Nathan Chancellor) - Resolve errors reported by smatch for a function when dealing with invalid inputs (Alex Williamson) - Add migration support to the mtty vfio/mdev sample driver for testing and integration purposes, allowing CI of migration without specific hardware requirements. Also resolve many of the short- comings of this driver relative to implementation of the vfio interrupt ioctl along the way (Alex Williamson) * tag 'vfio-v6.7-rc1' of https://github.com/awilliam/linux-vfio: vfio/mtty: Enable migration support vfio/mtty: Overhaul mtty interrupt handling vfio: Fix smatch errors in vfio_combine_iova_ranges() vfio/cdx: Add parentheses between bitwise AND expression and logical NOT vfio/mlx5: Activate the chunk mode functionality vfio/mlx5: Add support for READING in chunk mode vfio/mlx5: Add support for SAVING in chunk mode vfio/mlx5: Pre-allocate chunks for the STOP_COPY phase vfio/mlx5: Rename some stuff to match chunk mode vfio/mlx5: Enable querying state size which is > 4GB vfio/mlx5: Refactor the SAVE callback to activate a work only upon an error vfio/mlx5: Wake up the reader post of disabling the SAVING migration file vfio: use __aligned_u64 in struct vfio_device_ioeventfd vfio: use __aligned_u64 in struct vfio_device_gfx_plane_info vfio: trivially use __aligned_u64 for ioctl structs vfio-cdx: add bus mastering device feature support vfio: add bus master feature to device feature ioctl cdx: add support for bus mastering
-rw-r--r--drivers/cdx/cdx.c32
-rw-r--r--drivers/cdx/controller/cdx_controller.c4
-rw-r--r--drivers/cdx/controller/mcdi_functions.c58
-rw-r--r--drivers/cdx/controller/mcdi_functions.h13
-rw-r--r--drivers/gpu/drm/i915/gvt/kvmgt.c2
-rw-r--r--drivers/vfio/cdx/main.c57
-rw-r--r--drivers/vfio/cdx/private.h2
-rw-r--r--drivers/vfio/pci/mlx5/cmd.c103
-rw-r--r--drivers/vfio/pci/mlx5/cmd.h28
-rw-r--r--drivers/vfio/pci/mlx5/main.c283
-rw-r--r--drivers/vfio/vfio_main.c10
-rw-r--r--include/linux/cdx/cdx_bus.h18
-rw-r--r--include/uapi/linux/vfio.h47
-rw-r--r--samples/vfio-mdev/mbochs.c2
-rw-r--r--samples/vfio-mdev/mdpy.c2
-rw-r--r--samples/vfio-mdev/mtty.c829
16 files changed, 1298 insertions, 192 deletions
diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c
index d2cad4c670a0..9efb7584f952 100644
--- a/drivers/cdx/cdx.c
+++ b/drivers/cdx/cdx.c
@@ -182,6 +182,38 @@ cdx_match_id(const struct cdx_device_id *ids, struct cdx_device *dev)
return NULL;
}
+int cdx_set_master(struct cdx_device *cdx_dev)
+{
+ struct cdx_controller *cdx = cdx_dev->cdx;
+ struct cdx_device_config dev_config;
+ int ret = -EOPNOTSUPP;
+
+ dev_config.type = CDX_DEV_BUS_MASTER_CONF;
+ dev_config.bus_master_enable = true;
+ if (cdx->ops->dev_configure)
+ ret = cdx->ops->dev_configure(cdx, cdx_dev->bus_num,
+ cdx_dev->dev_num, &dev_config);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cdx_set_master);
+
+int cdx_clear_master(struct cdx_device *cdx_dev)
+{
+ struct cdx_controller *cdx = cdx_dev->cdx;
+ struct cdx_device_config dev_config;
+ int ret = -EOPNOTSUPP;
+
+ dev_config.type = CDX_DEV_BUS_MASTER_CONF;
+ dev_config.bus_master_enable = false;
+ if (cdx->ops->dev_configure)
+ ret = cdx->ops->dev_configure(cdx, cdx_dev->bus_num,
+ cdx_dev->dev_num, &dev_config);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cdx_clear_master);
+
/**
* cdx_bus_match - device to driver matching callback
* @dev: the cdx device to match against
diff --git a/drivers/cdx/controller/cdx_controller.c b/drivers/cdx/controller/cdx_controller.c
index bb4ae7970e21..7828dac8edb1 100644
--- a/drivers/cdx/controller/cdx_controller.c
+++ b/drivers/cdx/controller/cdx_controller.c
@@ -56,6 +56,10 @@ static int cdx_configure_device(struct cdx_controller *cdx,
case CDX_DEV_RESET_CONF:
ret = cdx_mcdi_reset_device(cdx->priv, bus_num, dev_num);
break;
+ case CDX_DEV_BUS_MASTER_CONF:
+ ret = cdx_mcdi_bus_master_enable(cdx->priv, bus_num, dev_num,
+ dev_config->bus_master_enable);
+ break;
default:
ret = -EINVAL;
}
diff --git a/drivers/cdx/controller/mcdi_functions.c b/drivers/cdx/controller/mcdi_functions.c
index 0158f26533dd..fc82435d5dea 100644
--- a/drivers/cdx/controller/mcdi_functions.c
+++ b/drivers/cdx/controller/mcdi_functions.c
@@ -137,3 +137,61 @@ int cdx_mcdi_reset_device(struct cdx_mcdi *cdx, u8 bus_num, u8 dev_num)
return ret;
}
+
+static int cdx_mcdi_ctrl_flag_get(struct cdx_mcdi *cdx, u8 bus_num,
+ u8 dev_num, u32 *flags)
+{
+ MCDI_DECLARE_BUF(inbuf, MC_CMD_CDX_DEVICE_CONTROL_GET_IN_LEN);
+ MCDI_DECLARE_BUF(outbuf, MC_CMD_CDX_DEVICE_CONTROL_GET_OUT_LEN);
+ size_t outlen;
+ int ret;
+
+ MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_GET_IN_BUS, bus_num);
+ MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_GET_IN_DEVICE, dev_num);
+ ret = cdx_mcdi_rpc(cdx, MC_CMD_CDX_DEVICE_CONTROL_GET, inbuf,
+ sizeof(inbuf), outbuf, sizeof(outbuf), &outlen);
+ if (ret)
+ return ret;
+
+ if (outlen != MC_CMD_CDX_DEVICE_CONTROL_GET_OUT_LEN)
+ return -EIO;
+
+ *flags = MCDI_DWORD(outbuf, CDX_DEVICE_CONTROL_GET_OUT_FLAGS);
+
+ return 0;
+}
+
+static int cdx_mcdi_ctrl_flag_set(struct cdx_mcdi *cdx, u8 bus_num,
+ u8 dev_num, bool enable, int bit_pos)
+{
+ MCDI_DECLARE_BUF(inbuf, MC_CMD_CDX_DEVICE_CONTROL_SET_IN_LEN);
+ u32 flags;
+ int ret;
+
+ /*
+ * Get flags and then set/reset bit at bit_pos according to
+ * the input params.
+ */
+ ret = cdx_mcdi_ctrl_flag_get(cdx, bus_num, dev_num, &flags);
+ if (ret)
+ return ret;
+
+ flags = flags & (u32)(~(BIT(bit_pos)));
+ if (enable)
+ flags |= (1 << bit_pos);
+
+ MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_BUS, bus_num);
+ MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_DEVICE, dev_num);
+ MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_FLAGS, flags);
+ ret = cdx_mcdi_rpc(cdx, MC_CMD_CDX_DEVICE_CONTROL_SET, inbuf,
+ sizeof(inbuf), NULL, 0, NULL);
+
+ return ret;
+}
+
+int cdx_mcdi_bus_master_enable(struct cdx_mcdi *cdx, u8 bus_num,
+ u8 dev_num, bool enable)
+{
+ return cdx_mcdi_ctrl_flag_set(cdx, bus_num, dev_num, enable,
+ MC_CMD_CDX_DEVICE_CONTROL_SET_IN_BUS_MASTER_ENABLE_LBN);
+}
diff --git a/drivers/cdx/controller/mcdi_functions.h b/drivers/cdx/controller/mcdi_functions.h
index 7440ace5539a..a448d6581eb4 100644
--- a/drivers/cdx/controller/mcdi_functions.h
+++ b/drivers/cdx/controller/mcdi_functions.h
@@ -58,4 +58,17 @@ int cdx_mcdi_get_dev_config(struct cdx_mcdi *cdx,
int cdx_mcdi_reset_device(struct cdx_mcdi *cdx,
u8 bus_num, u8 dev_num);
+/**
+ * cdx_mcdi_bus_master_enable - Set/Reset bus mastering for cdx device
+ * represented by bus_num:dev_num
+ * @cdx: pointer to MCDI interface.
+ * @bus_num: Bus number.
+ * @dev_num: Device number.
+ * @enable: Enable bus mastering if set, disable otherwise.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+int cdx_mcdi_bus_master_enable(struct cdx_mcdi *cdx, u8 bus_num,
+ u8 dev_num, bool enable);
+
#endif /* CDX_MCDI_FUNCTIONS_H */
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index 42ce20e72db7..faf21be724c3 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1379,7 +1379,7 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
intel_gvt_reset_vgpu(vgpu);
return 0;
} else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
- struct vfio_device_gfx_plane_info dmabuf;
+ struct vfio_device_gfx_plane_info dmabuf = {};
int ret = 0;
minsz = offsetofend(struct vfio_device_gfx_plane_info,
diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c
index de56686581ae..a63744302b5e 100644
--- a/drivers/vfio/cdx/main.c
+++ b/drivers/vfio/cdx/main.c
@@ -14,7 +14,7 @@ static int vfio_cdx_open_device(struct vfio_device *core_vdev)
container_of(core_vdev, struct vfio_cdx_device, vdev);
struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev);
int count = cdx_dev->res_count;
- int i;
+ int i, ret;
vdev->regions = kcalloc(count, sizeof(struct vfio_cdx_region),
GFP_KERNEL_ACCOUNT);
@@ -39,6 +39,17 @@ static int vfio_cdx_open_device(struct vfio_device *core_vdev)
if (!(cdx_dev->res[i].flags & IORESOURCE_READONLY))
vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_WRITE;
}
+ ret = cdx_dev_reset(core_vdev->dev);
+ if (ret) {
+ kfree(vdev->regions);
+ vdev->regions = NULL;
+ return ret;
+ }
+ ret = cdx_clear_master(cdx_dev);
+ if (ret)
+ vdev->flags &= ~BME_SUPPORT;
+ else
+ vdev->flags |= BME_SUPPORT;
return 0;
}
@@ -52,6 +63,49 @@ static void vfio_cdx_close_device(struct vfio_device *core_vdev)
cdx_dev_reset(core_vdev->dev);
}
+static int vfio_cdx_bm_ctrl(struct vfio_device *core_vdev, u32 flags,
+ void __user *arg, size_t argsz)
+{
+ size_t minsz =
+ offsetofend(struct vfio_device_feature_bus_master, op);
+ struct vfio_cdx_device *vdev =
+ container_of(core_vdev, struct vfio_cdx_device, vdev);
+ struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev);
+ struct vfio_device_feature_bus_master ops;
+ int ret;
+
+ if (!(vdev->flags & BME_SUPPORT))
+ return -ENOTTY;
+
+ ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
+ sizeof(ops));
+ if (ret != 1)
+ return ret;
+
+ if (copy_from_user(&ops, arg, minsz))
+ return -EFAULT;
+
+ switch (ops.op) {
+ case VFIO_DEVICE_FEATURE_CLEAR_MASTER:
+ return cdx_clear_master(cdx_dev);
+ case VFIO_DEVICE_FEATURE_SET_MASTER:
+ return cdx_set_master(cdx_dev);
+ default:
+ return -EINVAL;
+ }
+}
+
+static int vfio_cdx_ioctl_feature(struct vfio_device *device, u32 flags,
+ void __user *arg, size_t argsz)
+{
+ switch (flags & VFIO_DEVICE_FEATURE_MASK) {
+ case VFIO_DEVICE_FEATURE_BUS_MASTER:
+ return vfio_cdx_bm_ctrl(device, flags, arg, argsz);
+ default:
+ return -ENOTTY;
+ }
+}
+
static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev,
struct vfio_device_info __user *arg)
{
@@ -169,6 +223,7 @@ static const struct vfio_device_ops vfio_cdx_ops = {
.open_device = vfio_cdx_open_device,
.close_device = vfio_cdx_close_device,
.ioctl = vfio_cdx_ioctl,
+ .device_feature = vfio_cdx_ioctl_feature,
.mmap = vfio_cdx_mmap,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h
index 8bdc117ea88e..8e9d25913728 100644
--- a/drivers/vfio/cdx/private.h
+++ b/drivers/vfio/cdx/private.h
@@ -23,6 +23,8 @@ struct vfio_cdx_region {
struct vfio_cdx_device {
struct vfio_device vdev;
struct vfio_cdx_region *regions;
+ u32 flags;
+#define BME_SUPPORT BIT(0)
};
#endif /* VFIO_CDX_PRIVATE_H */
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 33574b04477d..efd1d252cdc9 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -86,7 +86,8 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
}
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
- size_t *state_size, u8 query_flags)
+ size_t *state_size, u64 *total_size,
+ u8 query_flags)
{
u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
@@ -128,6 +129,7 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
MLX5_SET(query_vhca_migration_state_in, in, incremental,
query_flags & MLX5VF_QUERY_INC);
+ MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);
ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
out);
@@ -139,6 +141,11 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
*state_size = MLX5_GET(query_vhca_migration_state_out, out,
required_umem_size);
+ if (total_size)
+ *total_size = mvdev->chunk_mode ?
+ MLX5_GET64(query_vhca_migration_state_out, out,
+ remaining_total_size) : *state_size;
+
return 0;
}
@@ -254,6 +261,9 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
mvdev->core_device.vdev.migration_flags |=
VFIO_MIGRATION_PRE_COPY;
+ if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
+ mvdev->chunk_mode = 1;
+
end:
mlx5_vf_put_core_dev(mvdev->mdev);
}
@@ -428,6 +438,7 @@ end:
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
spin_lock_irq(&buf->migf->list_lock);
+ buf->stop_copy_chunk_num = 0;
list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
spin_unlock_irq(&buf->migf->list_lock);
}
@@ -475,6 +486,15 @@ found:
return buf;
}
+static void
+mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
+ struct mlx5vf_async_data *async_data)
+{
+ kvfree(async_data->out);
+ complete(&migf->save_comp);
+ fput(migf->filp);
+}
+
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
struct mlx5vf_async_data *async_data = container_of(_work,
@@ -487,16 +507,15 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
mlx5vf_put_data_buffer(async_data->buf);
if (async_data->header_buf)
mlx5vf_put_data_buffer(async_data->header_buf);
- if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
+ if (!async_data->stop_copy_chunk &&
+ async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
else
migf->state = MLX5_MIGF_STATE_ERROR;
wake_up_interruptible(&migf->poll_wait);
}
mutex_unlock(&migf->lock);
- kvfree(async_data->out);
- complete(&migf->save_comp);
- fput(migf->filp);
+ mlx5vf_save_callback_complete(migf, async_data);
}
static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
@@ -536,13 +555,20 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
struct mlx5_vf_migration_file, async_data);
if (!status) {
+ size_t next_required_umem_size = 0;
+ bool stop_copy_last_chunk;
size_t image_size;
unsigned long flags;
bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
- !async_data->last_chunk;
+ !async_data->stop_copy_chunk;
image_size = MLX5_GET(save_vhca_state_out, async_data->out,
actual_image_size);
+ if (async_data->buf->stop_copy_chunk_num)
+ next_required_umem_size = MLX5_GET(save_vhca_state_out,
+ async_data->out, next_required_umem_size);
+ stop_copy_last_chunk = async_data->stop_copy_chunk &&
+ !next_required_umem_size;
if (async_data->header_buf) {
status = add_buf_header(async_data->header_buf, image_size,
initial_pre_copy);
@@ -554,19 +580,34 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
migf->max_pos += async_data->buf->length;
spin_lock_irqsave(&migf->list_lock, flags);
list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
+ if (async_data->buf->stop_copy_chunk_num) {
+ migf->num_ready_chunks++;
+ if (next_required_umem_size &&
+ migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
+ /* Delay the next SAVE till one chunk be consumed */
+ migf->next_required_umem_size = next_required_umem_size;
+ next_required_umem_size = 0;
+ }
+ }
spin_unlock_irqrestore(&migf->list_lock, flags);
- if (initial_pre_copy)
+ if (initial_pre_copy) {
migf->pre_copy_initial_bytes += image_size;
- migf->state = async_data->last_chunk ?
- MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
+ migf->state = MLX5_MIGF_STATE_PRE_COPY;
+ }
+ if (stop_copy_last_chunk)
+ migf->state = MLX5_MIGF_STATE_COMPLETE;
wake_up_interruptible(&migf->poll_wait);
+ if (next_required_umem_size)
+ mlx5vf_mig_file_set_save_work(migf,
+ /* Picking up the next chunk num */
+ (async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1,
+ next_required_umem_size);
+ mlx5vf_save_callback_complete(migf, async_data);
+ return;
}
err:
- /*
- * The error and the cleanup flows can't run from an
- * interrupt context
- */
+ /* The error flow can't run from an interrupt context */
if (status == -EREMOTEIO)
status = MLX5_GET(save_vhca_state_out, async_data->out, status);
async_data->status = status;
@@ -610,7 +651,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
async_data = &migf->async_data;
async_data->buf = buf;
- async_data->last_chunk = !track;
+ async_data->stop_copy_chunk = !track;
async_data->out = kvzalloc(out_size, GFP_KERNEL);
if (!async_data->out) {
err = -ENOMEM;
@@ -618,10 +659,15 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
}
if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
- if (async_data->last_chunk && migf->buf_header) {
- header_buf = migf->buf_header;
- migf->buf_header = NULL;
- } else {
+ if (async_data->stop_copy_chunk) {
+ u8 header_idx = buf->stop_copy_chunk_num ?
+ buf->stop_copy_chunk_num - 1 : 0;
+
+ header_buf = migf->buf_header[header_idx];
+ migf->buf_header[header_idx] = NULL;
+ }
+
+ if (!header_buf) {
header_buf = mlx5vf_get_data_buffer(migf,
sizeof(struct mlx5_vf_migration_header), DMA_NONE);
if (IS_ERR(header_buf)) {
@@ -631,8 +677,8 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
}
}
- if (async_data->last_chunk)
- migf->state = MLX5_MIGF_STATE_SAVE_LAST;
+ if (async_data->stop_copy_chunk)
+ migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;
async_data->header_buf = header_buf;
get_file(migf->filp);
@@ -707,18 +753,21 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
{
struct mlx5_vhca_data_buffer *entry;
+ int i;
lockdep_assert_held(&migf->mvdev->state_mutex);
WARN_ON(migf->mvdev->mdev_detach);
- if (migf->buf) {
- mlx5vf_free_data_buffer(migf->buf);
- migf->buf = NULL;
- }
+ for (i = 0; i < MAX_NUM_CHUNKS; i++) {
+ if (migf->buf[i]) {
+ mlx5vf_free_data_buffer(migf->buf[i]);
+ migf->buf[i] = NULL;
+ }
- if (migf->buf_header) {
- mlx5vf_free_data_buffer(migf->buf_header);
- migf->buf_header = NULL;
+ if (migf->buf_header[i]) {
+ mlx5vf_free_data_buffer(migf->buf_header[i]);
+ migf->buf_header[i] = NULL;
+ }
}
list_splice(&migf->avail_list, &migf->buf_list);
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index aec4c69dd6c1..f2c7227fa683 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -20,7 +20,7 @@ enum mlx5_vf_migf_state {
MLX5_MIGF_STATE_ERROR = 1,
MLX5_MIGF_STATE_PRE_COPY_ERROR,
MLX5_MIGF_STATE_PRE_COPY,
- MLX5_MIGF_STATE_SAVE_LAST,
+ MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK,
MLX5_MIGF_STATE_COMPLETE,
};
@@ -64,6 +64,7 @@ struct mlx5_vhca_data_buffer {
u32 mkey;
enum dma_data_direction dma_dir;
u8 dmaed:1;
+ u8 stop_copy_chunk_num;
struct list_head buf_elm;
struct mlx5_vf_migration_file *migf;
/* Optimize mlx5vf_get_migration_page() for sequential access */
@@ -78,10 +79,19 @@ struct mlx5vf_async_data {
struct mlx5_vhca_data_buffer *buf;
struct mlx5_vhca_data_buffer *header_buf;
int status;
- u8 last_chunk:1;
+ u8 stop_copy_chunk:1;
void *out;
};
+struct mlx5vf_save_work_data {
+ struct mlx5_vf_migration_file *migf;
+ size_t next_required_umem_size;
+ struct work_struct work;
+ u8 chunk_num;
+};
+
+#define MAX_NUM_CHUNKS 2
+
struct mlx5_vf_migration_file {
struct file *filp;
struct mutex lock;
@@ -94,8 +104,12 @@ struct mlx5_vf_migration_file {
u32 record_tag;
u64 stop_copy_prep_size;
u64 pre_copy_initial_bytes;
- struct mlx5_vhca_data_buffer *buf;
- struct mlx5_vhca_data_buffer *buf_header;
+ size_t next_required_umem_size;
+ u8 num_ready_chunks;
+ /* Upon chunk mode preserve another set of buffers for stop_copy phase */
+ struct mlx5_vhca_data_buffer *buf[MAX_NUM_CHUNKS];
+ struct mlx5_vhca_data_buffer *buf_header[MAX_NUM_CHUNKS];
+ struct mlx5vf_save_work_data save_data[MAX_NUM_CHUNKS];
spinlock_t list_lock;
struct list_head buf_list;
struct list_head avail_list;
@@ -164,6 +178,7 @@ struct mlx5vf_pci_core_device {
u8 deferred_reset:1;
u8 mdev_detach:1;
u8 log_active:1;
+ u8 chunk_mode:1;
struct completion tracker_comp;
/* protect migration state */
struct mutex state_mutex;
@@ -186,7 +201,8 @@ enum {
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
- size_t *state_size, u8 query_flags);
+ size_t *state_size, u64 *total_size,
+ u8 query_flags);
void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
const struct vfio_migration_ops *mig_ops,
const struct vfio_log_ops *log_ops);
@@ -217,6 +233,8 @@ struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work);
+void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
+ u8 chunk_num, size_t next_required_umem_size);
int mlx5vf_start_page_tracker(struct vfio_device *vdev,
struct rb_root_cached *ranges, u32 nnodes, u64 *page_size);
int mlx5vf_stop_page_tracker(struct vfio_device *vdev);
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 42ec574a8622..b6ac66c5008d 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -24,6 +24,8 @@
/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
+#define MAX_CHUNK_SIZE SZ_8M
+
static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
@@ -158,6 +160,41 @@ end:
return found ? buf : NULL;
}
+static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
+{
+ struct mlx5_vf_migration_file *migf = vhca_buf->migf;
+
+ if (vhca_buf->stop_copy_chunk_num) {
+ bool is_header = vhca_buf->dma_dir == DMA_NONE;
+ u8 chunk_num = vhca_buf->stop_copy_chunk_num;
+ size_t next_required_umem_size = 0;
+
+ if (is_header)
+ migf->buf_header[chunk_num - 1] = vhca_buf;
+ else
+ migf->buf[chunk_num - 1] = vhca_buf;
+
+ spin_lock_irq(&migf->list_lock);
+ list_del_init(&vhca_buf->buf_elm);
+ if (!is_header) {
+ next_required_umem_size =
+ migf->next_required_umem_size;
+ migf->next_required_umem_size = 0;
+ migf->num_ready_chunks--;
+ }
+ spin_unlock_irq(&migf->list_lock);
+ if (next_required_umem_size)
+ mlx5vf_mig_file_set_save_work(migf, chunk_num,
+ next_required_umem_size);
+ return;
+ }
+
+ spin_lock_irq(&migf->list_lock);
+ list_del_init(&vhca_buf->buf_elm);
+ list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
+ spin_unlock_irq(&migf->list_lock);
+}
+
static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
char __user **buf, size_t *len, loff_t *pos)
{
@@ -193,12 +230,8 @@ static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
copy_len -= page_len;
}
- if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
- spin_lock_irq(&vhca_buf->migf->list_lock);
- list_del_init(&vhca_buf->buf_elm);
- list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
- spin_unlock_irq(&vhca_buf->migf->list_lock);
- }
+ if (*pos >= vhca_buf->start_pos + vhca_buf->length)
+ mlx5vf_buf_read_done(vhca_buf);
return done;
}
@@ -304,7 +337,75 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
wake_up_interruptible(&migf->poll_wait);
}
-static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
+void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
+ u8 chunk_num, size_t next_required_umem_size)
+{
+ migf->save_data[chunk_num - 1].next_required_umem_size =
+ next_required_umem_size;
+ migf->save_data[chunk_num - 1].migf = migf;
+ get_file(migf->filp);
+ queue_work(migf->mvdev->cb_wq,
+ &migf->save_data[chunk_num - 1].work);
+}
+
+static struct mlx5_vhca_data_buffer *
+mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
+ u8 index, size_t required_length)
+{
+ struct mlx5_vhca_data_buffer *buf = migf->buf[index];
+ u8 chunk_num;
+
+ WARN_ON(!buf);
+ chunk_num = buf->stop_copy_chunk_num;
+ buf->migf->buf[index] = NULL;
+ /* Checking whether the pre-allocated buffer can fit */
+ if (buf->allocated_length >= required_length)
+ return buf;
+
+ mlx5vf_put_data_buffer(buf);
+ buf = mlx5vf_get_data_buffer(buf->migf, required_length,
+ DMA_FROM_DEVICE);
+ if (IS_ERR(buf))
+ return buf;
+
+ buf->stop_copy_chunk_num = chunk_num;
+ return buf;
+}
+
+static void mlx5vf_mig_file_save_work(struct work_struct *_work)
+{
+ struct mlx5vf_save_work_data *save_data = container_of(_work,
+ struct mlx5vf_save_work_data, work);
+ struct mlx5_vf_migration_file *migf = save_data->migf;
+ struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
+ struct mlx5_vhca_data_buffer *buf;
+
+ mutex_lock(&mvdev->state_mutex);
+ if (migf->state == MLX5_MIGF_STATE_ERROR)
+ goto end;
+
+ buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
+ save_data->chunk_num - 1,
+ save_data->next_required_umem_size);
+ if (IS_ERR(buf))
+ goto err;
+
+ if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
+ goto err_save;
+
+ goto end;
+
+err_save:
+ mlx5vf_put_data_buffer(buf);
+err:
+ mlx5vf_mark_err(migf);
+end:
+ mlx5vf_state_mutex_unlock(mvdev);
+ fput(migf->filp);
+}
+
+static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
+ bool track)
{
size_t size = sizeof(struct mlx5_vf_migration_header) +
sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
@@ -331,7 +432,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
to_buff = kmap_local_page(page);
memcpy(to_buff, &header, sizeof(header));
header_buf->length = sizeof(header);
- data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
+ data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
memcpy(to_buff + sizeof(header), &data, sizeof(data));
header_buf->length += sizeof(data);
kunmap_local(to_buff);
@@ -340,48 +441,86 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
spin_lock_irqsave(&migf->list_lock, flags);
list_add_tail(&header_buf->buf_elm, &migf->buf_list);
spin_unlock_irqrestore(&migf->list_lock, flags);
- migf->pre_copy_initial_bytes = size;
+ if (track)
+ migf->pre_copy_initial_bytes = size;
return 0;
err:
mlx5vf_put_data_buffer(header_buf);
return ret;
}
-static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
- size_t state_size)
+static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
+ struct mlx5_vf_migration_file *migf,
+ size_t state_size, u64 full_size,
+ bool track)
{
struct mlx5_vhca_data_buffer *buf;
size_t inc_state_size;
+ int num_chunks;
int ret;
+ int i;
- /* let's be ready for stop_copy size that might grow by 10 percents */
- if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
- inc_state_size = state_size;
+ if (mvdev->chunk_mode) {
+ size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
- buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
- if (IS_ERR(buf))
- return PTR_ERR(buf);
+ /* from firmware perspective at least 'state_size' buffer should be set */
+ inc_state_size = max(state_size, chunk_size);
+ } else {
+ if (track) {
+ /* let's be ready for stop_copy size that might grow by 10 percents */
+ if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
+ inc_state_size = state_size;
+ } else {
+ inc_state_size = state_size;
+ }
+ }
- migf->buf = buf;
- buf = mlx5vf_get_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
- if (IS_ERR(buf)) {
- ret = PTR_ERR(buf);
- goto err;
+ /* let's not overflow the device specification max SAVE size */
+ inc_state_size = min_t(size_t, inc_state_size,
+ (BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
+
+ num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
+ for (i = 0; i < num_chunks; i++) {
+ buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto err;
+ }
+
+ migf->buf[i] = buf;
+ buf = mlx5vf_get_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto err;
+ }
+ migf->buf_header[i] = buf;
+ if (mvdev->chunk_mode) {
+ migf->buf[i]->stop_copy_chunk_num = i + 1;
+ migf->buf_header[i]->stop_copy_chunk_num = i + 1;
+ INIT_WORK(&migf->save_data[i].work,
+ mlx5vf_mig_file_save_work);
+ migf->save_data[i].chunk_num = i + 1;
+ }
}
- migf->buf_header = buf;
- ret = mlx5vf_add_stop_copy_header(migf);
+ ret = mlx5vf_add_stop_copy_header(migf, track);
if (ret)
- goto err_header;
+ goto err;
return 0;
-err_header:
- mlx5vf_put_data_buffer(migf->buf_header);
- migf->buf_header = NULL;
err:
- mlx5vf_put_data_buffer(migf->buf);
- migf->buf = NULL;
+ for (i = 0; i < num_chunks; i++) {
+ if (migf->buf[i]) {
+ mlx5vf_put_data_buffer(migf->buf[i]);
+ migf->buf[i] = NULL;
+ }
+ if (migf->buf_header[i]) {
+ mlx5vf_put_data_buffer(migf->buf_header[i]);
+ migf->buf_header[i] = NULL;
+ }
+ }
+
return ret;
}
@@ -428,7 +567,7 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
* As so, the other code below is safe with the proper locks.
*/
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
- MLX5VF_QUERY_INC);
+ NULL, MLX5VF_QUERY_INC);
if (ret)
goto err_state_unlock;
}
@@ -505,21 +644,15 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
if (migf->state == MLX5_MIGF_STATE_ERROR)
return -ENODEV;
- ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
if (ret)
goto err;
- /* Checking whether we have a matching pre-allocated buffer that can fit */
- if (migf->buf && migf->buf->allocated_length >= length) {
- buf = migf->buf;
- migf->buf = NULL;
- } else {
- buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
- if (IS_ERR(buf)) {
- ret = PTR_ERR(buf);
- goto err;
- }
+ buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto err;
}
ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
@@ -541,6 +674,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
struct mlx5_vf_migration_file *migf;
struct mlx5_vhca_data_buffer *buf;
size_t length;
+ u64 full_size;
int ret;
migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
@@ -574,20 +708,25 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
INIT_LIST_HEAD(&migf->buf_list);
INIT_LIST_HEAD(&migf->avail_list);
spin_lock_init(&migf->list_lock);
- ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
+ if (ret)
+ goto out_pd;
+
+ ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
if (ret)
goto out_pd;
if (track) {
- ret = mlx5vf_prep_stop_copy(migf, length);
- if (ret)
+ /* leave the allocated buffer ready for the stop-copy phase */
+ buf = mlx5vf_alloc_data_buffer(migf,
+ migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
goto out_pd;
- }
-
- buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
- if (IS_ERR(buf)) {
- ret = PTR_ERR(buf);
- goto out_pd;
+ }
+ } else {
+ buf = migf->buf[0];
+ migf->buf[0] = NULL;
}
ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
@@ -820,8 +959,8 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
size_t len, loff_t *pos)
{
struct mlx5_vf_migration_file *migf = filp->private_data;
- struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
- struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
+ struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
+ struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
loff_t requested_length;
bool has_work = false;
ssize_t done = 0;
@@ -856,15 +995,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
if (vhca_buf_header->allocated_length < migf->record_size) {
mlx5vf_free_data_buffer(vhca_buf_header);
- migf->buf_header = mlx5vf_alloc_data_buffer(migf,
+ migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
migf->record_size, DMA_NONE);
- if (IS_ERR(migf->buf_header)) {
- ret = PTR_ERR(migf->buf_header);
- migf->buf_header = NULL;
+ if (IS_ERR(migf->buf_header[0])) {
+ ret = PTR_ERR(migf->buf_header[0]);
+ migf->buf_header[0] = NULL;
goto out_unlock;
}
- vhca_buf_header = migf->buf_header;
+ vhca_buf_header = migf->buf_header[0];
}
vhca_buf_header->start_pos = migf->max_pos;
@@ -884,15 +1023,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
if (vhca_buf->allocated_length < size) {
mlx5vf_free_data_buffer(vhca_buf);
- migf->buf = mlx5vf_alloc_data_buffer(migf,
+ migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
size, DMA_TO_DEVICE);
- if (IS_ERR(migf->buf)) {
- ret = PTR_ERR(migf->buf);
- migf->buf = NULL;
+ if (IS_ERR(migf->buf[0])) {
+ ret = PTR_ERR(migf->buf[0]);
+ migf->buf[0] = NULL;
goto out_unlock;
}
- vhca_buf = migf->buf;
+ vhca_buf = migf->buf[0];
}
vhca_buf->start_pos = migf->max_pos;
@@ -974,7 +1113,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
goto out_pd;
}
- migf->buf = buf;
+ migf->buf[0] = buf;
if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
buf = mlx5vf_alloc_data_buffer(migf,
sizeof(struct mlx5_vf_migration_header), DMA_NONE);
@@ -983,7 +1122,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
goto out_buf;
}
- migf->buf_header = buf;
+ migf->buf_header[0] = buf;
migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
} else {
/* Initial state will be to read the image */
@@ -997,7 +1136,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
spin_lock_init(&migf->list_lock);
return migf;
out_buf:
- mlx5vf_free_data_buffer(migf->buf);
+ mlx5vf_free_data_buffer(migf->buf[0]);
out_pd:
mlx5vf_cmd_dealloc_pd(migf);
out_free:
@@ -1019,6 +1158,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
cancel_work_sync(&mvdev->saving_migf->async_data.work);
mlx5vf_disable_fd(mvdev->saving_migf);
+ wake_up_interruptible(&mvdev->saving_migf->poll_wait);
mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
fput(mvdev->saving_migf->filp);
mvdev->saving_migf = NULL;
@@ -1100,7 +1240,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
ret = mlx5vf_cmd_load_vhca_state(mvdev,
mvdev->resuming_migf,
- mvdev->resuming_migf->buf);
+ mvdev->resuming_migf->buf[0]);
if (ret)
return ERR_PTR(ret);
}
@@ -1194,13 +1334,14 @@ static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
struct mlx5vf_pci_core_device *mvdev = container_of(
vdev, struct mlx5vf_pci_core_device, core_device.vdev);
size_t state_size;
+ u64 total_size;
int ret;
mutex_lock(&mvdev->state_mutex);
- ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
- &state_size, 0);
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
+ &total_size, 0);
if (!ret)
- *stop_copy_length = state_size;
+ *stop_copy_length = total_size;
mlx5vf_state_mutex_unlock(mvdev);
return ret;
}
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 40732e8ed4c6..e31e1952d7b8 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -946,6 +946,11 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
unsigned long last;
comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
+
+ /* Empty list */
+ if (WARN_ON_ONCE(!comb_start))
+ return;
+
curr = comb_start;
while (curr) {
last = curr->last;
@@ -975,6 +980,11 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
prev = curr;
curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
}
+
+ /* Empty list or no nodes to combine */
+ if (WARN_ON_ONCE(min_gap == ULONG_MAX))
+ break;
+
comb_start->last = comb_end->last;
interval_tree_remove(comb_end, root);
cur_nodes--;
diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index bead71b7bc73..8320ec3b9e37 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -21,11 +21,13 @@
struct cdx_controller;
enum {
+ CDX_DEV_BUS_MASTER_CONF,
CDX_DEV_RESET_CONF,
};
struct cdx_device_config {
u8 type;
+ bool bus_master_enable;
};
typedef int (*cdx_scan_cb)(struct cdx_controller *cdx);
@@ -170,4 +172,20 @@ extern struct bus_type cdx_bus_type;
*/
int cdx_dev_reset(struct device *dev);
+/**
+ * cdx_set_master - enables bus-mastering for CDX device
+ * @cdx_dev: the CDX device to enable
+ *
+ * Return: 0 for success, -errno on failure
+ */
+int cdx_set_master(struct cdx_device *cdx_dev);
+
+/**
+ * cdx_clear_master - disables bus-mastering for CDX device
+ * @cdx_dev: the CDX device to disable
+ *
+ * Return: 0 for success, -errno on failure
+ */
+int cdx_clear_master(struct cdx_device *cdx_dev);
+
#endif /* _CDX_BUS_H_ */
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index afc1369216d9..7f5fb010226d 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -277,8 +277,8 @@ struct vfio_region_info {
#define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */
__u32 index; /* Region index */
__u32 cap_offset; /* Offset within info struct of first cap */
- __u64 size; /* Region size (bytes) */
- __u64 offset; /* Region offset from start of device fd */
+ __aligned_u64 size; /* Region size (bytes) */
+ __aligned_u64 offset; /* Region offset from start of device fd */
};
#define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8)
@@ -294,8 +294,8 @@ struct vfio_region_info {
#define VFIO_REGION_INFO_CAP_SPARSE_MMAP 1
struct vfio_region_sparse_mmap_area {
- __u64 offset; /* Offset of mmap'able area within region */
- __u64 size; /* Size of mmap'able area */
+ __aligned_u64 offset; /* Offset of mmap'able area within region */
+ __aligned_u64 size; /* Size of mmap'able area */
};
struct vfio_region_info_cap_sparse_mmap {
@@ -450,9 +450,9 @@ struct vfio_device_migration_info {
VFIO_DEVICE_STATE_V1_RESUMING)
__u32 reserved;
- __u64 pending_bytes;
- __u64 data_offset;
- __u64 data_size;
+ __aligned_u64 pending_bytes;
+ __aligned_u64 data_offset;
+ __aligned_u64 data_size;
};
/*
@@ -476,7 +476,7 @@ struct vfio_device_migration_info {
struct vfio_region_info_cap_nvlink2_ssatgt {
struct vfio_info_cap_header header;
- __u64 tgt;
+ __aligned_u64 tgt;
};
/*
@@ -816,7 +816,7 @@ struct vfio_device_gfx_plane_info {
__u32 drm_plane_type; /* type of plane: DRM_PLANE_TYPE_* */
/* out */
__u32 drm_format; /* drm format of plane */
- __u64 drm_format_mod; /* tiled mode */
+ __aligned_u64 drm_format_mod; /* tiled mode */
__u32 width; /* width of plane */
__u32 height; /* height of plane */
__u32 stride; /* stride of plane */
@@ -829,6 +829,7 @@ struct vfio_device_gfx_plane_info {
__u32 region_index; /* region index */
__u32 dmabuf_id; /* dma-buf id */
};
+ __u32 reserved;
};
#define VFIO_DEVICE_QUERY_GFX_PLANE _IO(VFIO_TYPE, VFIO_BASE + 14)
@@ -863,9 +864,10 @@ struct vfio_device_ioeventfd {
#define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */
#define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */
#define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf)
- __u64 offset; /* device fd offset of write */
- __u64 data; /* data to be written */
+ __aligned_u64 offset; /* device fd offset of write */
+ __aligned_u64 data; /* data to be written */
__s32 fd; /* -1 for de-assignment */
+ __u32 reserved;
};
#define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16)
@@ -1434,6 +1436,27 @@ struct vfio_device_feature_mig_data_size {
#define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9
+/**
+ * Upon VFIO_DEVICE_FEATURE_SET, set or clear the BUS mastering for the device
+ * based on the operation specified in op flag.
+ *
+ * The functionality is incorporated for devices that needs bus master control,
+ * but the in-band device interface lacks the support. Consequently, it is not
+ * applicable to PCI devices, as bus master control for PCI devices is managed
+ * in-band through the configuration space. At present, this feature is supported
+ * only for CDX devices.
+ * When the device's BUS MASTER setting is configured as CLEAR, it will result in
+ * blocking all incoming DMA requests from the device. On the other hand, configuring
+ * the device's BUS MASTER setting as SET (enable) will grant the device the
+ * capability to perform DMA to the host memory.
+ */
+struct vfio_device_feature_bus_master {
+ __u32 op;
+#define VFIO_DEVICE_FEATURE_CLEAR_MASTER 0 /* Clear Bus Master */
+#define VFIO_DEVICE_FEATURE_SET_MASTER 1 /* Set Bus Master */
+};
+#define VFIO_DEVICE_FEATURE_BUS_MASTER 10
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**
@@ -1449,7 +1472,7 @@ struct vfio_iommu_type1_info {
__u32 flags;
#define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */
#define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */
- __u64 iova_pgsizes; /* Bitmap of supported page sizes */
+ __aligned_u64 iova_pgsizes; /* Bitmap of supported page sizes */
__u32 cap_offset; /* Offset within info struct of first cap */
__u32 pad;
};
diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
index 3764d1911b51..93405264ff23 100644
--- a/samples/vfio-mdev/mbochs.c
+++ b/samples/vfio-mdev/mbochs.c
@@ -1262,7 +1262,7 @@ static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd,
case VFIO_DEVICE_QUERY_GFX_PLANE:
{
- struct vfio_device_gfx_plane_info plane;
+ struct vfio_device_gfx_plane_info plane = {};
minsz = offsetofend(struct vfio_device_gfx_plane_info,
region_index);
diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
index 064e1c0a7aa8..72ea5832c927 100644
--- a/samples/vfio-mdev/mdpy.c
+++ b/samples/vfio-mdev/mdpy.c
@@ -591,7 +591,7 @@ static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd,
case VFIO_DEVICE_QUERY_GFX_PLANE:
{
- struct vfio_device_gfx_plane_info plane;
+ struct vfio_device_gfx_plane_info plane = {};
minsz = offsetofend(struct vfio_device_gfx_plane_info,
region_index);
diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
index 5af00387c519..69ba0281f9e0 100644
--- a/samples/vfio-mdev/mtty.c
+++ b/samples/vfio-mdev/mtty.c
@@ -29,6 +29,8 @@
#include <linux/serial.h>
#include <uapi/linux/serial_reg.h>
#include <linux/eventfd.h>
+#include <linux/anon_inodes.h>
+
/*
* #defines
*/
@@ -124,10 +126,32 @@ struct serial_port {
u8 intr_trigger_level; /* interrupt trigger level */
};
+struct mtty_data {
+ u64 magic;
+#define MTTY_MAGIC 0x7e9d09898c3e2c4e /* Nothing clever, just random */
+ u32 major_ver;
+#define MTTY_MAJOR_VER 1
+ u32 minor_ver;
+#define MTTY_MINOR_VER 0
+ u32 nr_ports;
+ u32 flags;
+ struct serial_port ports[2];
+};
+
+struct mdev_state;
+
+struct mtty_migration_file {
+ struct file *filp;
+ struct mutex lock;
+ struct mdev_state *mdev_state;
+ struct mtty_data data;
+ ssize_t filled_size;
+ u8 disabled:1;
+};
+
/* State of each mdev device */
struct mdev_state {
struct vfio_device vdev;
- int irq_fd;
struct eventfd_ctx *intx_evtfd;
struct eventfd_ctx *msi_evtfd;
int irq_index;
@@ -141,6 +165,13 @@ struct mdev_state {
struct mutex rxtx_lock;
struct vfio_device_info dev_info;
int nr_ports;
+ enum vfio_device_mig_state state;
+ struct mutex state_mutex;
+ struct mutex reset_mutex;
+ struct mtty_migration_file *saving_migf;
+ struct mtty_migration_file *resuming_migf;
+ u8 deferred_reset:1;
+ u8 intx_mask:1;
};
static struct mtty_type {
@@ -166,10 +197,6 @@ static const struct file_operations vd_fops = {
static const struct vfio_device_ops mtty_dev_ops;
-/* function prototypes */
-
-static int mtty_trigger_interrupt(struct mdev_state *mdev_state);
-
/* Helper functions */
static void dump_buffer(u8 *buf, uint32_t count)
@@ -186,6 +213,36 @@ static void dump_buffer(u8 *buf, uint32_t count)
#endif
}
+static bool is_intx(struct mdev_state *mdev_state)
+{
+ return mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX;
+}
+
+static bool is_msi(struct mdev_state *mdev_state)
+{
+ return mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX;
+}
+
+static bool is_noirq(struct mdev_state *mdev_state)
+{
+ return !is_intx(mdev_state) && !is_msi(mdev_state);
+}
+
+static void mtty_trigger_interrupt(struct mdev_state *mdev_state)
+{
+ lockdep_assert_held(&mdev_state->ops_lock);
+
+ if (is_msi(mdev_state)) {
+ if (mdev_state->msi_evtfd)
+ eventfd_signal(mdev_state->msi_evtfd, 1);
+ } else if (is_intx(mdev_state)) {
+ if (mdev_state->intx_evtfd && !mdev_state->intx_mask) {
+ eventfd_signal(mdev_state->intx_evtfd, 1);
+ mdev_state->intx_mask = true;
+ }
+ }
+}
+
static void mtty_create_config_space(struct mdev_state *mdev_state)
{
/* PCI dev ID */
@@ -717,6 +774,543 @@ accessfailed:
return ret;
}
+static size_t mtty_data_size(struct mdev_state *mdev_state)
+{
+ return offsetof(struct mtty_data, ports) +
+ (mdev_state->nr_ports * sizeof(struct serial_port));
+}
+
+static void mtty_disable_file(struct mtty_migration_file *migf)
+{
+ mutex_lock(&migf->lock);
+ migf->disabled = true;
+ migf->filled_size = 0;
+ migf->filp->f_pos = 0;
+ mutex_unlock(&migf->lock);
+}
+
+static void mtty_disable_files(struct mdev_state *mdev_state)
+{
+ if (mdev_state->saving_migf) {
+ mtty_disable_file(mdev_state->saving_migf);
+ fput(mdev_state->saving_migf->filp);
+ mdev_state->saving_migf = NULL;
+ }
+
+ if (mdev_state->resuming_migf) {
+ mtty_disable_file(mdev_state->resuming_migf);
+ fput(mdev_state->resuming_migf->filp);
+ mdev_state->resuming_migf = NULL;
+ }
+}
+
+static void mtty_state_mutex_unlock(struct mdev_state *mdev_state)
+{
+again:
+ mutex_lock(&mdev_state->reset_mutex);
+ if (mdev_state->deferred_reset) {
+ mdev_state->deferred_reset = false;
+ mutex_unlock(&mdev_state->reset_mutex);
+ mdev_state->state = VFIO_DEVICE_STATE_RUNNING;
+ mtty_disable_files(mdev_state);
+ goto again;
+ }
+ mutex_unlock(&mdev_state->state_mutex);
+ mutex_unlock(&mdev_state->reset_mutex);
+}
+
+static int mtty_release_migf(struct inode *inode, struct file *filp)
+{
+ struct mtty_migration_file *migf = filp->private_data;
+
+ mtty_disable_file(migf);
+ mutex_destroy(&migf->lock);
+ kfree(migf);
+
+ return 0;
+}
+
+static long mtty_precopy_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mtty_migration_file *migf = filp->private_data;
+ struct mdev_state *mdev_state = migf->mdev_state;
+ loff_t *pos = &filp->f_pos;
+ struct vfio_precopy_info info = {};
+ unsigned long minsz;
+ int ret;
+
+ if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+ return -ENOTTY;
+
+ minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ mutex_lock(&mdev_state->state_mutex);
+ if (mdev_state->state != VFIO_DEVICE_STATE_PRE_COPY &&
+ mdev_state->state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ mutex_lock(&migf->lock);
+
+ if (migf->disabled) {
+ mutex_unlock(&migf->lock);
+ ret = -ENODEV;
+ goto unlock;
+ }
+
+ if (*pos > migf->filled_size) {
+ mutex_unlock(&migf->lock);
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ info.dirty_bytes = 0;
+ info.initial_bytes = migf->filled_size - *pos;
+ mutex_unlock(&migf->lock);
+
+ ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
+unlock:
+ mtty_state_mutex_unlock(mdev_state);
+ return ret;
+}
+
+static ssize_t mtty_save_read(struct file *filp, char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct mtty_migration_file *migf = filp->private_data;
+ ssize_t ret = 0;
+
+ if (pos)
+ return -ESPIPE;
+
+ pos = &filp->f_pos;
+
+ mutex_lock(&migf->lock);
+
+ dev_dbg(migf->mdev_state->vdev.dev, "%s ask %zu\n", __func__, len);
+
+ if (migf->disabled) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ if (*pos > migf->filled_size) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ len = min_t(size_t, migf->filled_size - *pos, len);
+ if (len) {
+ if (copy_to_user(buf, (void *)&migf->data + *pos, len)) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+ *pos += len;
+ ret = len;
+ }
+out_unlock:
+ dev_dbg(migf->mdev_state->vdev.dev, "%s read %zu\n", __func__, ret);
+ mutex_unlock(&migf->lock);
+ return ret;
+}
+
+static const struct file_operations mtty_save_fops = {
+ .owner = THIS_MODULE,
+ .read = mtty_save_read,
+ .unlocked_ioctl = mtty_precopy_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .release = mtty_release_migf,
+ .llseek = no_llseek,
+};
+
+static void mtty_save_state(struct mdev_state *mdev_state)
+{
+ struct mtty_migration_file *migf = mdev_state->saving_migf;
+ int i;
+
+ mutex_lock(&migf->lock);
+ for (i = 0; i < mdev_state->nr_ports; i++) {
+ memcpy(&migf->data.ports[i],
+ &mdev_state->s[i], sizeof(struct serial_port));
+ migf->filled_size += sizeof(struct serial_port);
+ }
+ dev_dbg(mdev_state->vdev.dev,
+ "%s filled to %zu\n", __func__, migf->filled_size);
+ mutex_unlock(&migf->lock);
+}
+
+static int mtty_load_state(struct mdev_state *mdev_state)
+{
+ struct mtty_migration_file *migf = mdev_state->resuming_migf;
+ int i;
+
+ mutex_lock(&migf->lock);
+ /* magic and version already tested by resume write fn */
+ if (migf->filled_size < mtty_data_size(mdev_state)) {
+ dev_dbg(mdev_state->vdev.dev, "%s expected %zu, got %zu\n",
+ __func__, mtty_data_size(mdev_state),
+ migf->filled_size);
+ mutex_unlock(&migf->lock);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < mdev_state->nr_ports; i++)
+ memcpy(&mdev_state->s[i],
+ &migf->data.ports[i], sizeof(struct serial_port));
+
+ mutex_unlock(&migf->lock);
+ return 0;
+}
+
+static struct mtty_migration_file *
+mtty_save_device_data(struct mdev_state *mdev_state,
+ enum vfio_device_mig_state state)
+{
+ struct mtty_migration_file *migf = mdev_state->saving_migf;
+ struct mtty_migration_file *ret = NULL;
+
+ if (migf) {
+ if (state == VFIO_DEVICE_STATE_STOP_COPY)
+ goto fill_data;
+ return ret;
+ }
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("mtty_mig", &mtty_save_fops,
+ migf, O_RDONLY);
+ if (IS_ERR(migf->filp)) {
+ int rc = PTR_ERR(migf->filp);
+
+ kfree(migf);
+ return ERR_PTR(rc);
+ }
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ migf->mdev_state = mdev_state;
+
+ migf->data.magic = MTTY_MAGIC;
+ migf->data.major_ver = MTTY_MAJOR_VER;
+ migf->data.minor_ver = MTTY_MINOR_VER;
+ migf->data.nr_ports = mdev_state->nr_ports;
+
+ migf->filled_size = offsetof(struct mtty_data, ports);
+
+ dev_dbg(mdev_state->vdev.dev, "%s filled header to %zu\n",
+ __func__, migf->filled_size);
+
+ ret = mdev_state->saving_migf = migf;
+
+fill_data:
+ if (state == VFIO_DEVICE_STATE_STOP_COPY)
+ mtty_save_state(mdev_state);
+
+ return ret;
+}
+
+static ssize_t mtty_resume_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct mtty_migration_file *migf = filp->private_data;
+ struct mdev_state *mdev_state = migf->mdev_state;
+ loff_t requested_length;
+ ssize_t ret = 0;
+
+ if (pos)
+ return -ESPIPE;
+
+ pos = &filp->f_pos;
+
+ if (*pos < 0 ||
+ check_add_overflow((loff_t)len, *pos, &requested_length))
+ return -EINVAL;
+
+ if (requested_length > mtty_data_size(mdev_state))
+ return -ENOMEM;
+
+ mutex_lock(&migf->lock);
+
+ if (migf->disabled) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ if (copy_from_user((void *)&migf->data + *pos, buf, len)) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ *pos += len;
+ ret = len;
+
+ dev_dbg(migf->mdev_state->vdev.dev, "%s received %zu, total %zu\n",
+ __func__, len, migf->filled_size + len);
+
+ if (migf->filled_size < offsetof(struct mtty_data, ports) &&
+ migf->filled_size + len >= offsetof(struct mtty_data, ports)) {
+ if (migf->data.magic != MTTY_MAGIC || migf->data.flags ||
+ migf->data.major_ver != MTTY_MAJOR_VER ||
+ migf->data.minor_ver != MTTY_MINOR_VER ||
+ migf->data.nr_ports != mdev_state->nr_ports) {
+ dev_dbg(migf->mdev_state->vdev.dev,
+ "%s failed validation\n", __func__);
+ ret = -EFAULT;
+ } else {
+ dev_dbg(migf->mdev_state->vdev.dev,
+ "%s header validated\n", __func__);
+ }
+ }
+
+ migf->filled_size += len;
+
+out_unlock:
+ mutex_unlock(&migf->lock);
+ return ret;
+}
+
+static const struct file_operations mtty_resume_fops = {
+ .owner = THIS_MODULE,
+ .write = mtty_resume_write,
+ .release = mtty_release_migf,
+ .llseek = no_llseek,
+};
+
+static struct mtty_migration_file *
+mtty_resume_device_data(struct mdev_state *mdev_state)
+{
+ struct mtty_migration_file *migf;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("mtty_mig", &mtty_resume_fops,
+ migf, O_WRONLY);
+ if (IS_ERR(migf->filp)) {
+ ret = PTR_ERR(migf->filp);
+ kfree(migf);
+ return ERR_PTR(ret);
+ }
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ migf->mdev_state = mdev_state;
+
+ mdev_state->resuming_migf = migf;
+
+ return migf;
+}
+
+static struct file *mtty_step_state(struct mdev_state *mdev_state,
+ enum vfio_device_mig_state new)
+{
+ enum vfio_device_mig_state cur = mdev_state->state;
+
+ dev_dbg(mdev_state->vdev.dev, "%s: %d -> %d\n", __func__, cur, new);
+
+ /*
+ * The following state transitions are no-op considering
+ * mtty does not do DMA nor require any explicit start/stop.
+ *
+ * RUNNING -> RUNNING_P2P
+ * RUNNING_P2P -> RUNNING
+ * RUNNING_P2P -> STOP
+ * PRE_COPY -> PRE_COPY_P2P
+ * PRE_COPY_P2P -> PRE_COPY
+ * STOP -> RUNNING_P2P
+ */
+ if ((cur == VFIO_DEVICE_STATE_RUNNING &&
+ new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+ (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+ (new == VFIO_DEVICE_STATE_RUNNING ||
+ new == VFIO_DEVICE_STATE_STOP)) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY &&
+ new == VFIO_DEVICE_STATE_PRE_COPY_P2P) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
+ new == VFIO_DEVICE_STATE_PRE_COPY) ||
+ (cur == VFIO_DEVICE_STATE_STOP &&
+ new == VFIO_DEVICE_STATE_RUNNING_P2P))
+ return NULL;
+
+ /*
+ * The following state transitions simply close migration files,
+ * with the exception of RESUMING -> STOP, which needs to load
+ * the state first.
+ *
+ * RESUMING -> STOP
+ * PRE_COPY -> RUNNING
+ * PRE_COPY_P2P -> RUNNING_P2P
+ * STOP_COPY -> STOP
+ */
+ if (cur == VFIO_DEVICE_STATE_RESUMING &&
+ new == VFIO_DEVICE_STATE_STOP) {
+ int ret;
+
+ ret = mtty_load_state(mdev_state);
+ if (ret)
+ return ERR_PTR(ret);
+ mtty_disable_files(mdev_state);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_PRE_COPY &&
+ new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
+ new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+ (cur == VFIO_DEVICE_STATE_STOP_COPY &&
+ new == VFIO_DEVICE_STATE_STOP)) {
+ mtty_disable_files(mdev_state);
+ return NULL;
+ }
+
+ /*
+ * The following state transitions return migration files.
+ *
+ * RUNNING -> PRE_COPY
+ * RUNNING_P2P -> PRE_COPY_P2P
+ * STOP -> STOP_COPY
+ * STOP -> RESUMING
+ * PRE_COPY_P2P -> STOP_COPY
+ */
+ if ((cur == VFIO_DEVICE_STATE_RUNNING &&
+ new == VFIO_DEVICE_STATE_PRE_COPY) ||
+ (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+ new == VFIO_DEVICE_STATE_PRE_COPY_P2P) ||
+ (cur == VFIO_DEVICE_STATE_STOP &&
+ new == VFIO_DEVICE_STATE_STOP_COPY) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
+ new == VFIO_DEVICE_STATE_STOP_COPY)) {
+ struct mtty_migration_file *migf;
+
+ migf = mtty_save_device_data(mdev_state, new);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+
+ if (migf) {
+ get_file(migf->filp);
+
+ return migf->filp;
+ }
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP &&
+ new == VFIO_DEVICE_STATE_RESUMING) {
+ struct mtty_migration_file *migf;
+
+ migf = mtty_resume_device_data(mdev_state);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+
+ get_file(migf->filp);
+
+ return migf->filp;
+ }
+
+ /* vfio_mig_get_next_state() does not use arcs other than the above */
+ WARN_ON(true);
+ return ERR_PTR(-EINVAL);
+}
+
+static struct file *mtty_set_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state new_state)
+{
+ struct mdev_state *mdev_state =
+ container_of(vdev, struct mdev_state, vdev);
+ struct file *ret = NULL;
+
+ dev_dbg(vdev->dev, "%s -> %d\n", __func__, new_state);
+
+ mutex_lock(&mdev_state->state_mutex);
+ while (mdev_state->state != new_state) {
+ enum vfio_device_mig_state next_state;
+ int rc = vfio_mig_get_next_state(vdev, mdev_state->state,
+ new_state, &next_state);
+ if (rc) {
+ ret = ERR_PTR(rc);
+ break;
+ }
+
+ ret = mtty_step_state(mdev_state, next_state);
+ if (IS_ERR(ret))
+ break;
+
+ mdev_state->state = next_state;
+
+ if (WARN_ON(ret && new_state != next_state)) {
+ fput(ret);
+ ret = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ mtty_state_mutex_unlock(mdev_state);
+ return ret;
+}
+
+static int mtty_get_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state *current_state)
+{
+ struct mdev_state *mdev_state =
+ container_of(vdev, struct mdev_state, vdev);
+
+ mutex_lock(&mdev_state->state_mutex);
+ *current_state = mdev_state->state;
+ mtty_state_mutex_unlock(mdev_state);
+ return 0;
+}
+
+static int mtty_get_data_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ struct mdev_state *mdev_state =
+ container_of(vdev, struct mdev_state, vdev);
+
+ *stop_copy_length = mtty_data_size(mdev_state);
+ return 0;
+}
+
+static const struct vfio_migration_ops mtty_migration_ops = {
+ .migration_set_state = mtty_set_state,
+ .migration_get_state = mtty_get_state,
+ .migration_get_data_size = mtty_get_data_size,
+};
+
+static int mtty_log_start(struct vfio_device *vdev,
+ struct rb_root_cached *ranges,
+ u32 nnodes, u64 *page_size)
+{
+ return 0;
+}
+
+static int mtty_log_stop(struct vfio_device *vdev)
+{
+ return 0;
+}
+
+static int mtty_log_read_and_clear(struct vfio_device *vdev,
+ unsigned long iova, unsigned long length,
+ struct iova_bitmap *dirty)
+{
+ return 0;
+}
+
+static const struct vfio_log_ops mtty_log_ops = {
+ .log_start = mtty_log_start,
+ .log_stop = mtty_log_stop,
+ .log_read_and_clear = mtty_log_read_and_clear,
+};
+
static int mtty_init_dev(struct vfio_device *vdev)
{
struct mdev_state *mdev_state =
@@ -749,6 +1343,16 @@ static int mtty_init_dev(struct vfio_device *vdev)
mutex_init(&mdev_state->ops_lock);
mdev_state->mdev = mdev;
mtty_create_config_space(mdev_state);
+
+ mutex_init(&mdev_state->state_mutex);
+ mutex_init(&mdev_state->reset_mutex);
+ vdev->migration_flags = VFIO_MIGRATION_STOP_COPY |
+ VFIO_MIGRATION_P2P |
+ VFIO_MIGRATION_PRE_COPY;
+ vdev->mig_ops = &mtty_migration_ops;
+ vdev->log_ops = &mtty_log_ops;
+ mdev_state->state = VFIO_DEVICE_STATE_RUNNING;
+
return 0;
err_nr_ports:
@@ -782,6 +1386,8 @@ static void mtty_release_dev(struct vfio_device *vdev)
struct mdev_state *mdev_state =
container_of(vdev, struct mdev_state, vdev);
+ mutex_destroy(&mdev_state->reset_mutex);
+ mutex_destroy(&mdev_state->state_mutex);
atomic_add(mdev_state->nr_ports, &mdev_avail_ports);
kfree(mdev_state->vconfig);
}
@@ -798,6 +1404,15 @@ static int mtty_reset(struct mdev_state *mdev_state)
{
pr_info("%s: called\n", __func__);
+ mutex_lock(&mdev_state->reset_mutex);
+ mdev_state->deferred_reset = true;
+ if (!mutex_trylock(&mdev_state->state_mutex)) {
+ mutex_unlock(&mdev_state->reset_mutex);
+ return 0;
+ }
+ mutex_unlock(&mdev_state->reset_mutex);
+ mtty_state_mutex_unlock(mdev_state);
+
return 0;
}
@@ -921,6 +1536,25 @@ write_err:
return -EFAULT;
}
+static void mtty_disable_intx(struct mdev_state *mdev_state)
+{
+ if (mdev_state->intx_evtfd) {
+ eventfd_ctx_put(mdev_state->intx_evtfd);
+ mdev_state->intx_evtfd = NULL;
+ mdev_state->intx_mask = false;
+ mdev_state->irq_index = -1;
+ }
+}
+
+static void mtty_disable_msi(struct mdev_state *mdev_state)
+{
+ if (mdev_state->msi_evtfd) {
+ eventfd_ctx_put(mdev_state->msi_evtfd);
+ mdev_state->msi_evtfd = NULL;
+ mdev_state->irq_index = -1;
+ }
+}
+
static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags,
unsigned int index, unsigned int start,
unsigned int count, void *data)
@@ -932,59 +1566,113 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags,
case VFIO_PCI_INTX_IRQ_INDEX:
switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
case VFIO_IRQ_SET_ACTION_MASK:
+ if (!is_intx(mdev_state) || start != 0 || count != 1) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ mdev_state->intx_mask = true;
+ } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+ uint8_t mask = *(uint8_t *)data;
+
+ if (mask)
+ mdev_state->intx_mask = true;
+ } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ ret = -ENOTTY; /* No support for mask fd */
+ }
+ break;
case VFIO_IRQ_SET_ACTION_UNMASK:
+ if (!is_intx(mdev_state) || start != 0 || count != 1) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ mdev_state->intx_mask = false;
+ } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+ uint8_t mask = *(uint8_t *)data;
+
+ if (mask)
+ mdev_state->intx_mask = false;
+ } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ ret = -ENOTTY; /* No support for unmask fd */
+ }
break;
case VFIO_IRQ_SET_ACTION_TRIGGER:
- {
- if (flags & VFIO_IRQ_SET_DATA_NONE) {
- pr_info("%s: disable INTx\n", __func__);
- if (mdev_state->intx_evtfd)
- eventfd_ctx_put(mdev_state->intx_evtfd);
+ if (is_intx(mdev_state) && !count &&
+ (flags & VFIO_IRQ_SET_DATA_NONE)) {
+ mtty_disable_intx(mdev_state);
+ break;
+ }
+
+ if (!(is_intx(mdev_state) || is_noirq(mdev_state)) ||
+ start != 0 || count != 1) {
+ ret = -EINVAL;
break;
}
if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
int fd = *(int *)data;
+ struct eventfd_ctx *evt;
- if (fd > 0) {
- struct eventfd_ctx *evt;
-
- evt = eventfd_ctx_fdget(fd);
- if (IS_ERR(evt)) {
- ret = PTR_ERR(evt);
- break;
- }
- mdev_state->intx_evtfd = evt;
- mdev_state->irq_fd = fd;
- mdev_state->irq_index = index;
+ mtty_disable_intx(mdev_state);
+
+ if (fd < 0)
+ break;
+
+ evt = eventfd_ctx_fdget(fd);
+ if (IS_ERR(evt)) {
+ ret = PTR_ERR(evt);
break;
}
+ mdev_state->intx_evtfd = evt;
+ mdev_state->irq_index = index;
+ break;
+ }
+
+ if (!is_intx(mdev_state)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ mtty_trigger_interrupt(mdev_state);
+ } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+ uint8_t trigger = *(uint8_t *)data;
+
+ if (trigger)
+ mtty_trigger_interrupt(mdev_state);
}
break;
}
- }
break;
case VFIO_PCI_MSI_IRQ_INDEX:
switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
case VFIO_IRQ_SET_ACTION_MASK:
case VFIO_IRQ_SET_ACTION_UNMASK:
+ ret = -ENOTTY;
break;
case VFIO_IRQ_SET_ACTION_TRIGGER:
- if (flags & VFIO_IRQ_SET_DATA_NONE) {
- if (mdev_state->msi_evtfd)
- eventfd_ctx_put(mdev_state->msi_evtfd);
- pr_info("%s: disable MSI\n", __func__);
- mdev_state->irq_index = VFIO_PCI_INTX_IRQ_INDEX;
+ if (is_msi(mdev_state) && !count &&
+ (flags & VFIO_IRQ_SET_DATA_NONE)) {
+ mtty_disable_msi(mdev_state);
+ break;
+ }
+
+ if (!(is_msi(mdev_state) || is_noirq(mdev_state)) ||
+ start != 0 || count != 1) {
+ ret = -EINVAL;
break;
}
+
if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
int fd = *(int *)data;
struct eventfd_ctx *evt;
- if (fd <= 0)
- break;
+ mtty_disable_msi(mdev_state);
- if (mdev_state->msi_evtfd)
+ if (fd < 0)
break;
evt = eventfd_ctx_fdget(fd);
@@ -993,20 +1681,37 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags,
break;
}
mdev_state->msi_evtfd = evt;
- mdev_state->irq_fd = fd;
mdev_state->irq_index = index;
+ break;
+ }
+
+ if (!is_msi(mdev_state)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ mtty_trigger_interrupt(mdev_state);
+ } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+ uint8_t trigger = *(uint8_t *)data;
+
+ if (trigger)
+ mtty_trigger_interrupt(mdev_state);
}
break;
- }
- break;
+ }
+ break;
case VFIO_PCI_MSIX_IRQ_INDEX:
- pr_info("%s: MSIX_IRQ\n", __func__);
+ dev_dbg(mdev_state->vdev.dev, "%s: MSIX_IRQ\n", __func__);
+ ret = -ENOTTY;
break;
case VFIO_PCI_ERR_IRQ_INDEX:
- pr_info("%s: ERR_IRQ\n", __func__);
+ dev_dbg(mdev_state->vdev.dev, "%s: ERR_IRQ\n", __func__);
+ ret = -ENOTTY;
break;
case VFIO_PCI_REQ_IRQ_INDEX:
- pr_info("%s: REQ_IRQ\n", __func__);
+ dev_dbg(mdev_state->vdev.dev, "%s: REQ_IRQ\n", __func__);
+ ret = -ENOTTY;
break;
}
@@ -1014,33 +1719,6 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags,
return ret;
}
-static int mtty_trigger_interrupt(struct mdev_state *mdev_state)
-{
- int ret = -1;
-
- if ((mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) &&
- (!mdev_state->msi_evtfd))
- return -EINVAL;
- else if ((mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX) &&
- (!mdev_state->intx_evtfd)) {
- pr_info("%s: Intr eventfd not found\n", __func__);
- return -EINVAL;
- }
-
- if (mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX)
- ret = eventfd_signal(mdev_state->msi_evtfd, 1);
- else
- ret = eventfd_signal(mdev_state->intx_evtfd, 1);
-
-#if defined(DEBUG_INTR)
- pr_info("Intx triggered\n");
-#endif
- if (ret != 1)
- pr_err("%s: eventfd signal failed (%d)\n", __func__, ret);
-
- return ret;
-}
-
static int mtty_get_region_info(struct mdev_state *mdev_state,
struct vfio_region_info *region_info,
u16 *cap_type_id, void **cap_type)
@@ -1084,22 +1762,16 @@ static int mtty_get_region_info(struct mdev_state *mdev_state,
static int mtty_get_irq_info(struct vfio_irq_info *irq_info)
{
- switch (irq_info->index) {
- case VFIO_PCI_INTX_IRQ_INDEX:
- case VFIO_PCI_MSI_IRQ_INDEX:
- case VFIO_PCI_REQ_IRQ_INDEX:
- break;
-
- default:
+ if (irq_info->index != VFIO_PCI_INTX_IRQ_INDEX &&
+ irq_info->index != VFIO_PCI_MSI_IRQ_INDEX)
return -EINVAL;
- }
irq_info->flags = VFIO_IRQ_INFO_EVENTFD;
irq_info->count = 1;
if (irq_info->index == VFIO_PCI_INTX_IRQ_INDEX)
- irq_info->flags |= (VFIO_IRQ_INFO_MASKABLE |
- VFIO_IRQ_INFO_AUTOMASKED);
+ irq_info->flags |= VFIO_IRQ_INFO_MASKABLE |
+ VFIO_IRQ_INFO_AUTOMASKED;
else
irq_info->flags |= VFIO_IRQ_INFO_NORESIZE;
@@ -1262,6 +1934,16 @@ static unsigned int mtty_get_available(struct mdev_type *mtype)
return atomic_read(&mdev_avail_ports) / type->nr_ports;
}
+static void mtty_close(struct vfio_device *vdev)
+{
+ struct mdev_state *mdev_state =
+ container_of(vdev, struct mdev_state, vdev);
+
+ mtty_disable_files(mdev_state);
+ mtty_disable_intx(mdev_state);
+ mtty_disable_msi(mdev_state);
+}
+
static const struct vfio_device_ops mtty_dev_ops = {
.name = "vfio-mtty",
.init = mtty_init_dev,
@@ -1273,6 +1955,7 @@ static const struct vfio_device_ops mtty_dev_ops = {
.unbind_iommufd = vfio_iommufd_emulated_unbind,
.attach_ioas = vfio_iommufd_emulated_attach_ioas,
.detach_ioas = vfio_iommufd_emulated_detach_ioas,
+ .close_device = mtty_close,
};
static struct mdev_driver mtty_driver = {