Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 283
1 files changed, 177 insertions, 106 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 668ad07ebe1f..ec0d62a16e53 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
 				unsigned int sdma_queue_id);
 
+static void kfd_process_hw_exception(struct work_struct *work);
+
 static inline
 enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
 {
@@ -99,6 +101,17 @@ unsigned int get_pipes_per_mec(struct device_queue_manager *dqm)
 	return dqm->dev->shared_resources.num_pipe_per_mec;
 }
 
+static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm)
+{
+	return dqm->dev->device_info->num_sdma_engines;
+}
+
+unsigned int get_num_sdma_queues(struct device_queue_manager *dqm)
+{
+	return dqm->dev->device_info->num_sdma_engines
+			* KFD_SDMA_QUEUES_PER_ENGINE;
+}
+
 void program_sh_mem_settings(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
@@ -240,7 +253,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 
 	print_queue(q);
 
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 
 	if (dqm->total_queue_count >= max_num_of_queues_per_device) {
 		pr_warn("Can't create new usermode queue because %d queues were already created\n",
@@ -297,7 +310,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 			dqm->total_queue_count);
 
 out_unlock:
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 	return retval;
 }
 
@@ -346,10 +359,10 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd)
 {
 	int retval;
-	struct mqd_manager *mqd;
+	struct mqd_manager *mqd_mgr;
 
-	mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
-	if (!mqd)
+	mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
+	if (!mqd_mgr)
 		return -ENOMEM;
 
 	retval = allocate_hqd(dqm, q);
@@ -360,7 +373,7 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
 	if (retval)
 		goto out_deallocate_hqd;
 
-	retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
+	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
 				&q->gart_mqd_addr, &q->properties);
 	if (retval)
 		goto out_deallocate_doorbell;
@@ -374,15 +387,15 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
 	if (!q->properties.is_active)
 		return 0;
 
-	retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, &q->properties,
-			q->process->mm);
+	retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
+			&q->properties, q->process->mm);
 	if (retval)
 		goto out_uninit_mqd;
 
 	return 0;
 
 out_uninit_mqd:
-	mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
 out_deallocate_doorbell:
 	deallocate_doorbell(qpd, q);
 out_deallocate_hqd:
@@ -399,11 +412,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
 				struct queue *q)
 {
 	int retval;
-	struct mqd_manager *mqd;
+	struct mqd_manager *mqd_mgr;
 
-	mqd = dqm->ops.get_mqd_manager(dqm,
+	mqd_mgr = dqm->ops.get_mqd_manager(dqm,
 		get_mqd_type_from_queue_type(q->properties.type));
-	if (!mqd)
+	if (!mqd_mgr)
 		return -ENOMEM;
 
 	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
@@ -420,14 +433,14 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
 
 	deallocate_doorbell(qpd, q);
 
-	retval = mqd->destroy_mqd(mqd, q->mqd,
+	retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
 			KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
 			KFD_UNMAP_LATENCY_MS,
 			q->pipe, q->queue);
 	if (retval == -ETIME)
 		qpd->reset_wavefronts = true;
 
-	mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
 
 	list_del(&q->list);
 	if (list_empty(&qpd->queues_list)) {
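
The hunks above start routing all locking through dqm_lock()/dqm_unlock() wrappers, and later hunks rename the mutex itself to lock_hidden so nothing can take it directly. The wrappers are not part of this file; a minimal sketch of what they presumably look like in kfd_device_queue_manager.h, inferred only from the call sites in this patch:

	static inline void dqm_lock(struct device_queue_manager *dqm)
	{
		/* field renamed to lock_hidden to discourage direct mutex_lock() */
		mutex_lock(&dqm->lock_hidden);
	}

	static inline void dqm_unlock(struct device_queue_manager *dqm)
	{
		mutex_unlock(&dqm->lock_hidden);
	}

Funneling every lock and unlock through one helper pair gives the driver a single choke point for future instrumentation or extra work that must accompany the lock.
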
@@ -457,9 +470,9 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
 {
 	int retval;
 
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 	retval = destroy_queue_nocpsch_locked(dqm, qpd, q);
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 
 	return retval;
 }
 
@@ -467,19 +480,19 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
 static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 {
 	int retval;
-	struct mqd_manager *mqd;
+	struct mqd_manager *mqd_mgr;
 	struct kfd_process_device *pdd;
 	bool prev_active = false;
 
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 	pdd = kfd_get_process_device_data(q->device, q->process);
 	if (!pdd) {
 		retval = -ENODEV;
 		goto out_unlock;
 	}
-	mqd = dqm->ops.get_mqd_manager(dqm,
+	mqd_mgr = dqm->ops.get_mqd_manager(dqm,
 			get_mqd_type_from_queue_type(q->properties.type));
-	if (!mqd) {
+	if (!mqd_mgr) {
 		retval = -ENOMEM;
 		goto out_unlock;
 	}
@@ -506,7 +519,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 	} else if (prev_active &&
 		   (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
 		    q->properties.type == KFD_QUEUE_TYPE_SDMA)) {
-		retval = mqd->destroy_mqd(mqd, q->mqd,
+		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
 				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
 				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
 		if (retval) {
@@ -515,7 +528,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 		}
 	}
 
-	retval = mqd->update_mqd(mqd, q->mqd, &q->properties);
+	retval = mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties);
 
 	/*
 	 * check active state vs. the previous state and modify
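
The mqd to mqd_mgr rename running through these hunks is mechanical but meaningful: the local variable is the per-queue-type MQD manager, a table of function pointers, while q->mqd is the memory queue descriptor buffer it manipulates, and the old name conflated the two. The interface shape implied by the call sites, abbreviated as a sketch (the authoritative definition lives in mqd_manager.h):

	struct mqd_manager {
		int (*init_mqd)(struct mqd_manager *mm, void **mqd,
				struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
				struct queue_properties *q);
		int (*load_mqd)(struct mqd_manager *mm, void *mqd, uint32_t pipe_id,
				uint32_t queue_id, struct queue_properties *p,
				struct mm_struct *mms);
		int (*update_mqd)(struct mqd_manager *mm, void *mqd,
				struct queue_properties *q);
		int (*destroy_mqd)(struct mqd_manager *mm, void *mqd,
				enum kfd_preempt_type type, unsigned int timeout,
				uint32_t pipe_id, uint32_t queue_id);
		void (*uninit_mqd)(struct mqd_manager *mm, void *mqd,
				struct kfd_mem_obj *mqd_mem_obj);
	};
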
@@ -533,44 +546,44 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 	else if (q->properties.is_active &&
 		 (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
 		  q->properties.type == KFD_QUEUE_TYPE_SDMA))
-		retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue,
+		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
 				&q->properties, q->process->mm);
 
 out_unlock:
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 	return retval;
 }
 
 static struct mqd_manager *get_mqd_manager(
 		struct device_queue_manager *dqm, enum KFD_MQD_TYPE type)
 {
-	struct mqd_manager *mqd;
+	struct mqd_manager *mqd_mgr;
 
 	if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
 		return NULL;
 
 	pr_debug("mqd type %d\n", type);
 
-	mqd = dqm->mqds[type];
-	if (!mqd) {
-		mqd = mqd_manager_init(type, dqm->dev);
-		if (!mqd)
+	mqd_mgr = dqm->mqd_mgrs[type];
+	if (!mqd_mgr) {
+		mqd_mgr = mqd_manager_init(type, dqm->dev);
+		if (!mqd_mgr)
 			pr_err("mqd manager is NULL");
-		dqm->mqds[type] = mqd;
+		dqm->mqd_mgrs[type] = mqd_mgr;
 	}
 
-	return mqd;
+	return mqd_mgr;
 }
 
 static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
 	struct queue *q;
-	struct mqd_manager *mqd;
+	struct mqd_manager *mqd_mgr;
 	struct kfd_process_device *pdd;
 	int retval = 0;
 
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 	if (qpd->evicted++ > 0) /* already evicted, do nothing */
 		goto out;
@@ -582,16 +595,16 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
 	list_for_each_entry(q, &qpd->queues_list, list) {
 		if (!q->properties.is_active)
 			continue;
-		mqd = dqm->ops.get_mqd_manager(dqm,
+		mqd_mgr = dqm->ops.get_mqd_manager(dqm,
 			get_mqd_type_from_queue_type(q->properties.type));
-		if (!mqd) { /* should not be here */
+		if (!mqd_mgr) { /* should not be here */
 			pr_err("Cannot evict queue, mqd mgr is NULL\n");
 			retval = -ENOMEM;
 			goto out;
 		}
 		q->properties.is_evicted = true;
 		q->properties.is_active = false;
-		retval = mqd->destroy_mqd(mqd, q->mqd,
+		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
 				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
 				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
 		if (retval)
@@ -600,7 +613,7 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
 	}
 
 out:
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 	return retval;
 }
 
@@ -611,7 +624,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 	struct kfd_process_device *pdd;
 	int retval = 0;
 
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 	if (qpd->evicted++ > 0) /* already evicted, do nothing */
 		goto out;
@@ -633,7 +646,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 			KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 
 out:
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 	return retval;
 }
 
@@ -641,7 +654,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 					  struct qcm_process_device *qpd)
 {
 	struct queue *q;
-	struct mqd_manager *mqd;
+	struct mqd_manager *mqd_mgr;
 	struct kfd_process_device *pdd;
 	uint32_t pd_base;
 	int retval = 0;
@@ -650,7 +663,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 	/* Retrieve PD base */
 	pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);
 
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 	if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
 		goto out;
 	if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
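
qpd->evicted, incremented above, behaves as a nesting count rather than a boolean, so eviction requests from independent callers compose; only the outermost transition actually touches the hardware. An illustrative sequence (restore_process_queues_nocpsch() appears in the next hunk):

	dqm->ops.evict_process_queues(dqm, qpd);	/* evicted 0 -> 1: queues unmapped */
	dqm->ops.evict_process_queues(dqm, qpd);	/* evicted 1 -> 2: count only */
	dqm->ops.restore_process_queues(dqm, qpd);	/* evicted 2 -> 1: still evicted */
	dqm->ops.restore_process_queues(dqm, qpd);	/* evicted 1 -> 0: queues reloaded */
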
@@ -677,16 +690,16 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 	list_for_each_entry(q, &qpd->queues_list, list) {
 		if (!q->properties.is_evicted)
 			continue;
-		mqd = dqm->ops.get_mqd_manager(dqm,
+		mqd_mgr = dqm->ops.get_mqd_manager(dqm,
 			get_mqd_type_from_queue_type(q->properties.type));
-		if (!mqd) { /* should not be here */
+		if (!mqd_mgr) { /* should not be here */
 			pr_err("Cannot restore queue, mqd mgr is NULL\n");
 			retval = -ENOMEM;
 			goto out;
 		}
 		q->properties.is_evicted = false;
 		q->properties.is_active = true;
-		retval = mqd->load_mqd(mqd, q->mqd, q->pipe,
+		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
 				q->queue, &q->properties,
 				q->process->mm);
 		if (retval)
@@ -695,7 +708,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 	}
 	qpd->evicted = 0;
 out:
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 	return retval;
 }
 
@@ -711,7 +724,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 	/* Retrieve PD base */
 	pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);
 
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 	if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
 		goto out;
 	if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
@@ -739,7 +752,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 	if (!retval)
 		qpd->evicted = 0;
 out:
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 	return retval;
 }
 
@@ -761,7 +774,7 @@ static int register_process(struct device_queue_manager *dqm,
 	/* Retrieve PD base */
 	pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);
 
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 	list_add(&n->list, &dqm->queues);
 
 	/* Update PD Base in QPD */
@@ -769,9 +782,10 @@ static int register_process(struct device_queue_manager *dqm,
 
 	retval = dqm->asic_ops.update_qpd(dqm, qpd);
 
-	dqm->processes_count++;
+	if (dqm->processes_count++ == 0)
+		dqm->dev->kfd2kgd->set_compute_idle(dqm->dev->kgd, false);
 
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 
 	return retval;
 }
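
register_process() now reports the GPU as non-idle when the first KFD process arrives; the mirror transition lives in unregister_process() in the next hunk. Because both run under the dqm lock, the pre- and post-increment forms fire exactly once per 0 to 1 and 1 to 0 transition, presumably letting the amdgpu side adjust power management while compute work exists:

	if (dqm->processes_count++ == 0)	/* 0 -> 1: first process */
		dqm->dev->kfd2kgd->set_compute_idle(dqm->dev->kgd, false);

	if (--dqm->processes_count == 0)	/* 1 -> 0: last process gone */
		dqm->dev->kfd2kgd->set_compute_idle(dqm->dev->kgd, true);
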
"empty" : "not empty"); retval = 0; - mutex_lock(&dqm->lock); + dqm_lock(dqm); list_for_each_entry_safe(cur, next, &dqm->queues, list) { if (qpd == cur->qpd) { list_del(&cur->list); kfree(cur); - dqm->processes_count--; + if (--dqm->processes_count == 0) + dqm->dev->kfd2kgd->set_compute_idle( + dqm->dev->kgd, true); goto out; } } /* qpd not found in dqm list */ retval = 1; out: - mutex_unlock(&dqm->lock); + dqm_unlock(dqm); return retval; } @@ -838,7 +854,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) if (!dqm->allocated_queues) return -ENOMEM; - mutex_init(&dqm->lock); + mutex_init(&dqm->lock_hidden); INIT_LIST_HEAD(&dqm->queues); dqm->queue_count = dqm->next_pipe_to_allocate = 0; dqm->sdma_queue_count = 0; @@ -853,7 +869,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) } dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1; - dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; + dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1; return 0; } @@ -866,8 +882,8 @@ static void uninitialize(struct device_queue_manager *dqm) kfree(dqm->allocated_queues); for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++) - kfree(dqm->mqds[i]); - mutex_destroy(&dqm->lock); + kfree(dqm->mqd_mgrs[i]); + mutex_destroy(&dqm->lock_hidden); kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem); } @@ -901,7 +917,7 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, static void deallocate_sdma_queue(struct device_queue_manager *dqm, unsigned int sdma_queue_id) { - if (sdma_queue_id >= CIK_SDMA_QUEUES) + if (sdma_queue_id >= get_num_sdma_queues(dqm)) return; dqm->sdma_bitmap |= (1 << sdma_queue_id); } @@ -910,19 +926,19 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd) { - struct mqd_manager *mqd; + struct mqd_manager *mqd_mgr; int retval; - mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA); - if (!mqd) + mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA); + if (!mqd_mgr) return -ENOMEM; retval = allocate_sdma_queue(dqm, &q->sdma_id); if (retval) return retval; - q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; - q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; + q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm); + q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm); retval = allocate_doorbell(qpd, q); if (retval) @@ -933,19 +949,20 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); dqm->asic_ops.init_sdma_vm(dqm, q, qpd); - retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); if (retval) goto out_deallocate_doorbell; - retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL); + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties, + NULL); if (retval) goto out_uninit_mqd; return 0; out_uninit_mqd: - mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); out_deallocate_doorbell: deallocate_doorbell(qpd, q); out_deallocate_sdma_queue: @@ -1003,12 +1020,14 @@ static int initialize_cpsch(struct device_queue_manager *dqm) { pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm)); - mutex_init(&dqm->lock); + mutex_init(&dqm->lock_hidden); INIT_LIST_HEAD(&dqm->queues); dqm->queue_count = dqm->processes_count = 0; dqm->sdma_queue_count = 0; dqm->active_runlist = false; 
@@ -1003,12 +1020,14 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 {
 	pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm));
 
-	mutex_init(&dqm->lock);
+	mutex_init(&dqm->lock_hidden);
 	INIT_LIST_HEAD(&dqm->queues);
 	dqm->queue_count = dqm->processes_count = 0;
 	dqm->sdma_queue_count = 0;
 	dqm->active_runlist = false;
-	dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1;
+	dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1;
+
+	INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
 
 	return 0;
 }
 
@@ -1041,9 +1060,11 @@ static int start_cpsch(struct device_queue_manager *dqm)
 
 	init_interrupts(dqm);
 
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
+	/* clear hang status when driver try to start the hw scheduler */
+	dqm->is_hws_hang = false;
 	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 
 	return 0;
 fail_allocate_vidmem:
@@ -1055,9 +1076,9 @@ fail_packet_manager_init:
 
 static int stop_cpsch(struct device_queue_manager *dqm)
 {
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 
 	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
 	pm_uninit(&dqm->packets);
@@ -1069,11 +1090,11 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
 					struct kernel_queue *kq,
 					struct qcm_process_device *qpd)
 {
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 	if (dqm->total_queue_count >= max_num_of_queues_per_device) {
 		pr_warn("Can't create new kernel queue because %d queues were already created\n",
 				dqm->total_queue_count);
-		mutex_unlock(&dqm->lock);
+		dqm_unlock(dqm);
 		return -EPERM;
 	}
 
@@ -1089,7 +1110,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
 	dqm->queue_count++;
 	qpd->is_debug = true;
 	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 
 	return 0;
 }
 
@@ -1098,7 +1119,7 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
 					struct kernel_queue *kq,
 					struct qcm_process_device *qpd)
 {
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 	list_del(&kq->list);
 	dqm->queue_count--;
 	qpd->is_debug = false;
@@ -1110,18 +1131,18 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
 	dqm->total_queue_count--;
 	pr_debug("Total of %d queues are accountable so far\n",
 			dqm->total_queue_count);
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 }
 
 static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 			struct qcm_process_device *qpd)
 {
 	int retval;
-	struct mqd_manager *mqd;
+	struct mqd_manager *mqd_mgr;
 
 	retval = 0;
 
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 
 	if (dqm->total_queue_count >= max_num_of_queues_per_device) {
 		pr_warn("Can't create new usermode queue because %d queues were already created\n",
@@ -1135,19 +1156,19 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 		if (retval)
 			goto out_unlock;
 		q->properties.sdma_queue_id =
-			q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE;
+			q->sdma_id / get_num_sdma_engines(dqm);
 		q->properties.sdma_engine_id =
-			q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE;
+			q->sdma_id % get_num_sdma_engines(dqm);
 	}
 
 	retval = allocate_doorbell(qpd, q);
 	if (retval)
 		goto out_deallocate_sdma_queue;
 
-	mqd = dqm->ops.get_mqd_manager(dqm,
+	mqd_mgr = dqm->ops.get_mqd_manager(dqm,
 			get_mqd_type_from_queue_type(q->properties.type));
-	if (!mqd) {
+	if (!mqd_mgr) {
 		retval = -ENOMEM;
 		goto out_deallocate_doorbell;
 	}
@@ -1164,7 +1185,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 
 	q->properties.tba_addr = qpd->tba_addr;
 	q->properties.tma_addr = qpd->tma_addr;
-	retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
+	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
 				&q->gart_mqd_addr, &q->properties);
 	if (retval)
 		goto out_deallocate_doorbell;
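
initialize_cpsch() above also arms the hang-recovery machinery: INIT_WORK() binds hw_exception_work to kfd_process_hw_exception(), which is defined at the end of this patch. The overall flow, pieced together from the following hunks:

	/* in execute_queues_cpsch(), when preemption fails: */
	dqm->is_hws_hang = true;		/* later map/unmap calls fail fast with -EIO */
	schedule_work(&dqm->hw_exception_work);	/* defer the reset; the dqm lock is held here */

	/* later, in process context, kfd_process_hw_exception() runs: */
	dqm->dev->kfd2kgd->gpu_recover(dqm->dev->kgd);

	/* start_cpsch() clears is_hws_hang when the scheduler comes back up */

Deferring gpu_recover() to a work item avoids attempting a full GPU reset while still inside the locked scheduler path that detected the hang.
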
@@ -1188,7 +1209,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 	pr_debug("Total of %d queues are accountable so far\n",
 			dqm->total_queue_count);
 
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 
 	return retval;
 
 out_deallocate_doorbell:
@@ -1197,7 +1218,8 @@ out_deallocate_sdma_queue:
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
 		deallocate_sdma_queue(dqm, q->sdma_id);
 out_unlock:
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
+
 	return retval;
 }
 
@@ -1210,6 +1232,13 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
 	while (*fence_addr != fence_value) {
 		if (time_after(jiffies, end_jiffies)) {
 			pr_err("qcm fence wait loop timeout expired\n");
+			/* In HWS case, this is used to halt the driver thread
+			 * in order not to mess up CP states before doing
+			 * scandumps for FW debugging.
+			 */
+			while (halt_if_hws_hang)
+				schedule();
+
 			return -ETIME;
 		}
 		schedule();
@@ -1254,6 +1283,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 {
 	int retval = 0;
 
+	if (dqm->is_hws_hang)
+		return -EIO;
 	if (!dqm->active_runlist)
 		return retval;
 
@@ -1292,9 +1323,13 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
 {
 	int retval;
 
+	if (dqm->is_hws_hang)
+		return -EIO;
 	retval = unmap_queues_cpsch(dqm, filter, filter_param);
 	if (retval) {
 		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
+		dqm->is_hws_hang = true;
+		schedule_work(&dqm->hw_exception_work);
 		return retval;
 	}
 
@@ -1306,7 +1341,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 				struct queue *q)
 {
 	int retval;
-	struct mqd_manager *mqd;
+	struct mqd_manager *mqd_mgr;
 	bool preempt_all_queues;
 
 	preempt_all_queues = false;
@@ -1314,7 +1349,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 	retval = 0;
 
 	/* remove queue from list to prevent rescheduling after preemption */
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 
 	if (qpd->is_debug) {
 		/*
@@ -1326,9 +1361,9 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 
 	}
 
-	mqd = dqm->ops.get_mqd_manager(dqm,
+	mqd_mgr = dqm->ops.get_mqd_manager(dqm,
 			get_mqd_type_from_queue_type(q->properties.type));
-	if (!mqd) {
+	if (!mqd_mgr) {
 		retval = -ENOMEM;
 		goto failed;
 	}
@@ -1350,7 +1385,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 			qpd->reset_wavefronts = true;
 	}
 
-	mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
 
 	/*
 	 * Unconditionally decrement this counter, regardless of the queue's
@@ -1360,14 +1395,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 	pr_debug("Total of %d queues are accountable so far\n",
 			dqm->total_queue_count);
 
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 
 	return retval;
 
 failed:
 failed_try_destroy_debugged_queue:
 
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 	return retval;
 }
 
@@ -1391,7 +1426,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
 	if (!dqm->asic_ops.set_cache_memory_policy)
 		return retval;
 
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 
 	if (alternate_aperture_size == 0) {
 		/* base > limit disables APE1 */
@@ -1437,7 +1472,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
 		qpd->sh_mem_ape1_limit);
 
 out:
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 	return retval;
 }
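
For context, the amdkfd_fence_wait_timeout() hunk above patches the middle of a small CPU-side polling loop; a reconstructed skeleton follows (simplified). halt_if_hws_hang is a module parameter, so when the hardware scheduler hangs the driver thread can be parked indefinitely, preserving CP state for firmware scandumps instead of proceeding to the -ETIME recovery path:

	int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
					unsigned int fence_value,
					unsigned long timeout_ms)
	{
		unsigned long end_jiffies = msecs_to_jiffies(timeout_ms) + jiffies;

		/* poll the fence location that the GPU scheduler writes back */
		while (*fence_addr != fence_value) {
			if (time_after(jiffies, end_jiffies)) {
				pr_err("qcm fence wait loop timeout expired\n");
				while (halt_if_hws_hang)
					schedule();	/* park here for FW debugging */
				return -ETIME;
			}
			schedule();
		}

		return 0;
	}
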
@@ -1468,7 +1503,7 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
 	struct device_process_node *cur, *next_dpn;
 	int retval = 0;
 
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 
 	/* Clear all user mode queues */
 	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
@@ -1489,7 +1524,7 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
 		}
 	}
 
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 	return retval;
 }
 
@@ -1500,14 +1535,14 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	int retval;
 	struct queue *q, *next;
 	struct kernel_queue *kq, *kq_next;
-	struct mqd_manager *mqd;
+	struct mqd_manager *mqd_mgr;
 	struct device_process_node *cur, *next_dpn;
 	enum kfd_unmap_queues_filter filter =
 		KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES;
 
 	retval = 0;
 
-	mutex_lock(&dqm->lock);
+	dqm_lock(dqm);
 
 	/* Clean all kernel queues */
 	list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) {
@@ -1542,7 +1577,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	}
 
 	retval = execute_queues_cpsch(dqm, filter, 0);
-	if (retval || qpd->reset_wavefronts) {
+	if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
 		pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
 		dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process);
 		qpd->reset_wavefronts = false;
 	}
@@ -1550,19 +1585,19 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 
 	/* lastly, free mqd resources */
 	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
-		mqd = dqm->ops.get_mqd_manager(dqm,
+		mqd_mgr = dqm->ops.get_mqd_manager(dqm,
 			get_mqd_type_from_queue_type(q->properties.type));
-		if (!mqd) {
+		if (!mqd_mgr) {
 			retval = -ENOMEM;
 			goto out;
 		}
 		list_del(&q->list);
 		qpd->queue_count--;
-		mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+		mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
 	}
 
 out:
-	mutex_unlock(&dqm->lock);
+	dqm_unlock(dqm);
 	return retval;
 }
 
@@ -1683,6 +1718,30 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
 	kfree(dqm);
 }
 
+int kfd_process_vm_fault(struct device_queue_manager *dqm,
+				unsigned int pasid)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+	int ret = 0;
+
+	if (!p)
+		return -EINVAL;
+	pdd = kfd_get_process_device_data(dqm->dev, p);
+	if (pdd)
+		ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
+	kfd_unref_process(p);
+
+	return ret;
+}
+
+static void kfd_process_hw_exception(struct work_struct *work)
+{
+	struct device_queue_manager *dqm = container_of(work,
+			struct device_queue_manager, hw_exception_work);
+	dqm->dev->kfd2kgd->gpu_recover(dqm->dev->kgd);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static void seq_reg_dump(struct seq_file *m,
@@ -1746,8 +1805,8 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
 		}
 	}
 
-	for (pipe = 0; pipe < CIK_SDMA_ENGINE_NUM; pipe++) {
-		for (queue = 0; queue < CIK_SDMA_QUEUES_PER_ENGINE; queue++) {
+	for (pipe = 0; pipe < get_num_sdma_engines(dqm); pipe++) {
+		for (queue = 0; queue < KFD_SDMA_QUEUES_PER_ENGINE; queue++) {
 			r = dqm->dev->kfd2kgd->hqd_sdma_dump(
 				dqm->dev->kgd, pipe, queue, &dump, &n_regs);
 			if (r)
@@ -1764,4 +1823,16 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
 	return r;
 }
 
+int dqm_debugfs_execute_queues(struct device_queue_manager *dqm)
+{
+	int r = 0;
+
+	dqm_lock(dqm);
+	dqm->active_runlist = true;
+	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	dqm_unlock(dqm);
+
+	return r;
+}
+
 #endif
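
The new kfd_process_vm_fault() entry point lets interrupt handling code translate a faulting PASID back into a process and evict that process's queues so it cannot keep faulting. A hypothetical call site, with is_vm_fault() and pasid_from_entry() invented purely for illustration (the real caller lives in the KFD interrupt code, not in this file):

	/* hypothetical ISR fragment; helper names are illustrative only */
	if (is_vm_fault(ih_ring_entry))
		kfd_process_vm_fault(dev->dqm, pasid_from_entry(ih_ring_entry));
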