diff options
author | David Yat Sin <david.yatsin@amd.com> | 2021-01-25 12:50:14 -0500 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2022-02-07 17:59:52 -0500 |
commit | 42c6c48214b726c30918e8dc80e2168607d13ae4 (patch) | |
tree | cdb245637820e0d2fc82e58a5b8a032c6799c021 /drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | |
parent | 5bb6a8fa75fcfda93592bee0a4910420ceab15f3 (diff) |
drm/amdkfd: CRIU checkpoint and restore queue mqds
Checkpoint contents of queue MQDs on CRIU dump and restore them during
CRIU restore.
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: David Yat Sin <david.yatsin@amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 157 |
1 files changed, 140 insertions, 17 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 8afe6879b861..844917c1c346 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -208,6 +208,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, struct queue_properties *properties, unsigned int *qid, const struct kfd_criu_queue_priv_data *q_data, + const void *restore_mqd, uint32_t *p_doorbell_offset_in_process) { int retval; @@ -272,7 +273,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, goto err_create_queue; pqn->q = q; pqn->kq = NULL; - retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, q_data); + retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, q_data, restore_mqd); print_queue(q); break; @@ -292,7 +293,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, goto err_create_queue; pqn->q = q; pqn->kq = NULL; - retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, q_data); + retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, q_data, restore_mqd); print_queue(q); break; case KFD_QUEUE_TYPE_DIQ: @@ -517,12 +518,25 @@ int pqm_get_wave_state(struct process_queue_manager *pqm, save_area_used_size); } +static int get_queue_data_sizes(struct kfd_process_device *pdd, struct queue *q, uint32_t *mqd_size) +{ + int ret; + + ret = pqm_get_queue_checkpoint_info(&pdd->process->pqm, q->properties.queue_id, mqd_size); + if (ret) + pr_err("Failed to get queue dump info (%d)\n", ret); + + return ret; +} + int kfd_process_get_queue_info(struct kfd_process *p, uint32_t *num_queues, uint64_t *priv_data_sizes) { + uint32_t extra_data_sizes = 0; struct queue *q; int i; + int ret; *num_queues = 0; @@ -534,23 +548,53 @@ int kfd_process_get_queue_info(struct kfd_process *p, if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || q->properties.type == KFD_QUEUE_TYPE_SDMA || q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { - 
+ uint32_t mqd_size; *num_queues = *num_queues + 1; + + ret = get_queue_data_sizes(pdd, q, &mqd_size); + if (ret) + return ret; + + extra_data_sizes += mqd_size; } else { pr_err("Unsupported queue type (%d)\n", q->properties.type); return -EOPNOTSUPP; } } } - *priv_data_sizes = *num_queues * sizeof(struct kfd_criu_queue_priv_data); + *priv_data_sizes = extra_data_sizes + + (*num_queues * sizeof(struct kfd_criu_queue_priv_data)); return 0; } -static void criu_checkpoint_queue(struct kfd_process_device *pdd, +static int pqm_checkpoint_mqd(struct process_queue_manager *pqm, unsigned int qid, void *mqd) +{ + struct process_queue_node *pqn; + + pqn = get_queue_by_qid(pqm, qid); + if (!pqn) { + pr_debug("amdkfd: No queue %d exists for operation\n", qid); + return -EFAULT; + } + + if (!pqn->q->device->dqm->ops.checkpoint_mqd) { + pr_err("amdkfd: queue dumping not supported on this device\n"); + return -EOPNOTSUPP; + } + + return pqn->q->device->dqm->ops.checkpoint_mqd(pqn->q->device->dqm, pqn->q, mqd); +} + +static int criu_checkpoint_queue(struct kfd_process_device *pdd, struct queue *q, struct kfd_criu_queue_priv_data *q_data) { + uint8_t *mqd; + int ret; + + mqd = (void *)(q_data + 1); + q_data->gpu_id = pdd->dev->id; q_data->type = q->properties.type; q_data->format = q->properties.format; @@ -576,7 +620,14 @@ static void criu_checkpoint_queue(struct kfd_process_device *pdd, q_data->ctx_save_restore_area_size = q->properties.ctx_save_restore_area_size; + ret = pqm_checkpoint_mqd(&pdd->process->pqm, q->properties.queue_id, mqd); + if (ret) { + pr_err("Failed checkpoint queue_mqd (%d)\n", ret); + return ret; + } + pr_debug("Dumping Queue: gpu_id:%x queue_id:%u\n", q_data->gpu_id, q_data->q_id); + return ret; } static int criu_checkpoint_queues_device(struct kfd_process_device *pdd, @@ -584,15 +635,16 @@ static int criu_checkpoint_queues_device(struct kfd_process_device *pdd, unsigned int *q_index, uint64_t *queues_priv_data_offset) { - struct kfd_criu_queue_priv_data 
*q_data; + unsigned int q_private_data_size = 0; + uint8_t *q_private_data = NULL; /* Local buffer to store individual queue private data */ struct queue *q; int ret = 0; - q_data = kzalloc(sizeof(*q_data), GFP_KERNEL); - if (!q_data) - return -ENOMEM; - list_for_each_entry(q, &pdd->qpd.queues_list, list) { + struct kfd_criu_queue_priv_data *q_data; + uint64_t q_data_size; + uint32_t mqd_size; + if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE && q->properties.type != KFD_QUEUE_TYPE_SDMA && q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI) { @@ -602,19 +654,46 @@ static int criu_checkpoint_queues_device(struct kfd_process_device *pdd, break; } - criu_checkpoint_queue(pdd, q, q_data); + ret = get_queue_data_sizes(pdd, q, &mqd_size); + if (ret) + break; + + q_data_size = sizeof(*q_data) + mqd_size; + + /* Increase local buffer space if needed */ + if (q_private_data_size < q_data_size) { + kfree(q_private_data); + + q_private_data = kzalloc(q_data_size, GFP_KERNEL); + if (!q_private_data) { + ret = -ENOMEM; + break; + } + q_private_data_size = q_data_size; + } + + q_data = (struct kfd_criu_queue_priv_data *)q_private_data; + + /* data stored in this order: priv_data, mqd */ + q_data->mqd_size = mqd_size; + + ret = criu_checkpoint_queue(pdd, q, q_data); + if (ret) + break; + q_data->object_type = KFD_CRIU_OBJECT_TYPE_QUEUE; - ret = copy_to_user(user_priv + *queues_priv_data_offset, q_data, sizeof(*q_data)); + ret = copy_to_user(user_priv + *queues_priv_data_offset, + q_data, q_data_size); if (ret) { ret = -EFAULT; break; } - *queues_priv_data_offset += sizeof(*q_data); + *queues_priv_data_offset += q_data_size; *q_index = *q_index + 1; } - kfree(q_data); + kfree(q_private_data); return ret; } @@ -668,11 +747,12 @@ int kfd_criu_restore_queue(struct kfd_process *p, uint64_t max_priv_data_size) { struct kfd_criu_queue_priv_data *q_data; + uint8_t *mqd, *q_extra_data = NULL; struct kfd_process_device *pdd; - struct kfd_dev *dev; + uint64_t q_extra_data_size; struct 
queue_properties qp; unsigned int queue_id; - + struct kfd_dev *dev; int ret = 0; if (*priv_data_offset + sizeof(*q_data) > max_priv_data_size) @@ -689,6 +769,26 @@ int kfd_criu_restore_queue(struct kfd_process *p, } *priv_data_offset += sizeof(*q_data); + q_extra_data_size = q_data->mqd_size; + + if (*priv_data_offset + q_extra_data_size > max_priv_data_size) { + ret = -EINVAL; + goto exit; + } + + q_extra_data = kmalloc(q_extra_data_size, GFP_KERNEL); + if (!q_extra_data) { + ret = -ENOMEM; + goto exit; + } + + ret = copy_from_user(q_extra_data, user_priv_ptr + *priv_data_offset, q_extra_data_size); + if (ret) { + ret = -EFAULT; + goto exit; + } + + *priv_data_offset += q_extra_data_size; dev = kfd_device_by_id(q_data->gpu_id); if (!dev) { @@ -705,13 +805,15 @@ int kfd_criu_restore_queue(struct kfd_process *p, ret = -EFAULT; return ret; } + /* data stored in this order: mqd */ + mqd = q_extra_data; memset(&qp, 0, sizeof(qp)); set_queue_properties_from_criu(&qp, q_data); print_queue_properties(&qp); - ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp, &queue_id, q_data, NULL); + ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp, &queue_id, q_data, mqd, NULL); if (ret) { pr_err("Failed to create new queue err:%d\n", ret); ret = -EINVAL; @@ -728,6 +830,27 @@ exit: return ret; } +int pqm_get_queue_checkpoint_info(struct process_queue_manager *pqm, + unsigned int qid, + uint32_t *mqd_size) +{ + struct process_queue_node *pqn; + + pqn = get_queue_by_qid(pqm, qid); + if (!pqn) { + pr_debug("amdkfd: No queue %d exists for operation\n", qid); + return -EFAULT; + } + + if (!pqn->q->device->dqm->ops.get_queue_checkpoint_info) { + pr_err("amdkfd: queue dumping not supported on this device\n"); + return -EOPNOTSUPP; + } + + pqn->q->device->dqm->ops.get_queue_checkpoint_info(pqn->q->device->dqm, pqn->q, mqd_size); + return 0; +} + #if defined(CONFIG_DEBUG_FS) int pqm_debugfs_mqds(struct seq_file *m, void *data) |