Diffstat (limited to 'drivers/gpu/drm/amd/scheduler/gpu_scheduler.c')
-rw-r--r--	drivers/gpu/drm/amd/scheduler/gpu_scheduler.c	134
1 file changed, 82 insertions(+), 52 deletions(-)
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 92ec663fdada..dcb987e6d94a 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -28,9 +28,14 @@
 #include <drm/drmP.h>
 #include "gpu_scheduler.h"
 
+#include "spsc_queue.h"
+
 #define CREATE_TRACE_POINTS
 #include "gpu_sched_trace.h"
 
+#define to_amd_sched_job(sched_job)		\
+		container_of((sched_job), struct amd_sched_job, queue_node)
+
 static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity);
 static void amd_sched_wakeup(struct amd_gpu_scheduler *sched);
 static void amd_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
@@ -121,10 +126,8 @@ amd_sched_rq_select_entity(struct amd_sched_rq *rq)
 int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
 			  struct amd_sched_entity *entity,
 			  struct amd_sched_rq *rq,
-			  uint32_t jobs)
+			  uint32_t jobs, atomic_t *guilty)
 {
-	int r;
-
 	if (!(sched && entity && rq))
 		return -EINVAL;
 
@@ -132,12 +135,11 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
 	INIT_LIST_HEAD(&entity->list);
 	entity->rq = rq;
 	entity->sched = sched;
+	entity->guilty = guilty;
 
 	spin_lock_init(&entity->rq_lock);
 	spin_lock_init(&entity->queue_lock);
-	r = kfifo_alloc(&entity->job_queue, jobs * sizeof(void *), GFP_KERNEL);
-	if (r)
-		return r;
+	spsc_queue_init(&entity->job_queue);
 
 	atomic_set(&entity->fence_seq, 0);
 	entity->fence_context = dma_fence_context_alloc(2);
@@ -170,7 +172,7 @@ static bool amd_sched_entity_is_initialized(struct amd_gpu_scheduler *sched,
 static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
 {
 	rmb();
-	if (kfifo_is_empty(&entity->job_queue))
+	if (spsc_queue_peek(&entity->job_queue) == NULL)
 		return true;
 
 	return false;
@@ -185,7 +187,7 @@ static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
  */
 static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity)
 {
-	if (kfifo_is_empty(&entity->job_queue))
+	if (spsc_queue_peek(&entity->job_queue) == NULL)
 		return false;
 
 	if (READ_ONCE(entity->dependency))
@@ -227,17 +229,23 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
 		/* Park the kernel thread for a moment to make sure it isn't
 		 * processing our entity.
 		 */
 		kthread_park(sched->thread);
 		kthread_unpark(sched->thread);
-		while (kfifo_out(&entity->job_queue, &job, sizeof(job))) {
+		if (entity->dependency) {
+			dma_fence_remove_callback(entity->dependency,
+						  &entity->cb);
+			dma_fence_put(entity->dependency);
+			entity->dependency = NULL;
+		}
+
+		while ((job = to_amd_sched_job(spsc_queue_pop(&entity->job_queue)))) {
 			struct amd_sched_fence *s_fence = job->s_fence;
 			amd_sched_fence_scheduled(s_fence);
 			dma_fence_set_error(&s_fence->finished, -ESRCH);
 			amd_sched_fence_finished(s_fence);
+			WARN_ON(s_fence->parent);
 			dma_fence_put(&s_fence->finished);
 			sched->ops->free_job(job);
 		}
 	}
-
-	kfifo_free(&entity->job_queue);
 }
 
 static void amd_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
@@ -332,40 +340,44 @@ static bool amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity)
 }
 
 static struct amd_sched_job *
-amd_sched_entity_peek_job(struct amd_sched_entity *entity)
+amd_sched_entity_pop_job(struct amd_sched_entity *entity)
 {
 	struct amd_gpu_scheduler *sched = entity->sched;
-	struct amd_sched_job *sched_job;
+	struct amd_sched_job *sched_job = to_amd_sched_job(
+						spsc_queue_peek(&entity->job_queue));
 
-	if (!kfifo_out_peek(&entity->job_queue, &sched_job, sizeof(sched_job)))
+	if (!sched_job)
 		return NULL;
 
-	while ((entity->dependency = sched->ops->dependency(sched_job)))
+	while ((entity->dependency = sched->ops->dependency(sched_job, entity)))
 		if (amd_sched_entity_add_dependency_cb(entity))
 			return NULL;
 
+	/* skip jobs from an entity that was marked guilty */
+	if (entity->guilty && atomic_read(entity->guilty))
+		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
+
+	spsc_queue_pop(&entity->job_queue);
 	return sched_job;
 }
 
 /**
- * Helper to submit a job to the job queue
+ * Submit a job to the job queue
  *
  * @sched_job		The pointer to job required to submit
  *
- * Returns true if we could submit the job.
+ * Returns nothing; the job is queued for execution on the entity.
  */
-static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
+void amd_sched_entity_push_job(struct amd_sched_job *sched_job,
+			       struct amd_sched_entity *entity)
 {
 	struct amd_gpu_scheduler *sched = sched_job->sched;
-	struct amd_sched_entity *entity = sched_job->s_entity;
-	bool added, first = false;
+	bool first = false;
 
-	spin_lock(&entity->queue_lock);
-	added = kfifo_in(&entity->job_queue, &sched_job,
-			 sizeof(sched_job)) == sizeof(sched_job);
+	trace_amd_sched_job(sched_job, entity);
 
-	if (added && kfifo_len(&entity->job_queue) == sizeof(sched_job))
-		first = true;
+	spin_lock(&entity->queue_lock);
+	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
 
 	spin_unlock(&entity->queue_lock);
@@ -377,7 +389,6 @@ static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
 		spin_unlock(&entity->rq_lock);
 		amd_sched_wakeup(sched);
 	}
-	return added;
 }
 
 /* job_finish is called after hw fence signaled
@@ -442,9 +453,11 @@ static void amd_sched_job_timedout(struct work_struct *work)
 	job->sched->ops->timedout_job(job);
 }
 
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
 {
 	struct amd_sched_job *s_job;
+	struct amd_sched_entity *entity, *tmp;
+	int i;
 
 	spin_lock(&sched->job_list_lock);
 	list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
@@ -457,6 +470,30 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
 		}
 	}
 	spin_unlock(&sched->job_list_lock);
+
+	if (bad && bad->s_priority != AMD_SCHED_PRIORITY_KERNEL) {
+		atomic_inc(&bad->karma);
+		/* don't increase @bad's karma if it's from the KERNEL RQ,
+		 * because a GPU hang can corrupt kernel jobs (like VM updates),
+		 * but kernel jobs are always considered good.
+		 */
+		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++) {
+			struct amd_sched_rq *rq = &sched->sched_rq[i];
+
+			spin_lock(&rq->lock);
+			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
+				if (bad->s_fence->scheduled.context == entity->fence_context) {
+					if (atomic_read(&bad->karma) > bad->sched->hang_limit)
+						if (entity->guilty)
+							atomic_set(entity->guilty, 1);
+					break;
+				}
+			}
+			spin_unlock(&rq->lock);
+			if (&entity->list != &rq->entities)
+				break;
+		}
+	}
 }
 
 void amd_sched_job_kickout(struct amd_sched_job *s_job)
@@ -471,6 +508,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job)
 void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 {
 	struct amd_sched_job *s_job, *tmp;
+	bool found_guilty = false;
 	int r;
 
 	spin_lock(&sched->job_list_lock);
@@ -482,6 +520,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
 		struct amd_sched_fence *s_fence = s_job->s_fence;
 		struct dma_fence *fence;
+		uint64_t guilty_context;
+
+		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
+			found_guilty = true;
+			guilty_context = s_job->s_fence->scheduled.context;
+		}
+
+		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
+			dma_fence_set_error(&s_fence->finished, -ECANCELED);
 
 		spin_unlock(&sched->job_list_lock);
 		fence = sched->ops->run_job(s_job);
@@ -497,7 +544,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 				  r);
 			dma_fence_put(fence);
 		} else {
-			DRM_ERROR("Failed to run job!\n");
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}
 		spin_lock(&sched->job_list_lock);
@@ -505,22 +551,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 	spin_unlock(&sched->job_list_lock);
 }
 
-/**
- * Submit a job to the job queue
- *
- * @sched_job		The pointer to job required to submit
- *
- * Returns 0 for success, negative error code otherwise.
- */
-void amd_sched_entity_push_job(struct amd_sched_job *sched_job)
-{
-	struct amd_sched_entity *entity = sched_job->s_entity;
-
-	trace_amd_sched_job(sched_job);
-	wait_event(entity->sched->job_scheduled,
-		   amd_sched_entity_in(sched_job));
-}
-
 /* init a sched_job with basic field */
 int amd_sched_job_init(struct amd_sched_job *job,
 		       struct amd_gpu_scheduler *sched,
@@ -528,7 +558,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
 		       void *owner)
 {
 	job->sched = sched;
-	job->s_entity = entity;
+	job->s_priority = entity->rq - sched->sched_rq;
 	job->s_fence = amd_sched_fence_create(entity, owner);
 	if (!job->s_fence)
 		return -ENOMEM;
@@ -610,7 +640,7 @@ static int amd_sched_main(void *param)
 {
 	struct sched_param sparam = {.sched_priority = 1};
 	struct amd_gpu_scheduler *sched = (struct amd_gpu_scheduler *)param;
-	int r, count;
+	int r;
 
 	sched_setscheduler(current, SCHED_FIFO, &sparam);
 
@@ -628,7 +658,7 @@ static int amd_sched_main(void *param)
 		if (!entity)
 			continue;
 
-		sched_job = amd_sched_entity_peek_job(entity);
+		sched_job = amd_sched_entity_pop_job(entity);
 		if (!sched_job)
 			continue;
 
@@ -651,13 +681,9 @@ static int amd_sched_main(void *param)
 				  r);
 			dma_fence_put(fence);
 		} else {
-			DRM_ERROR("Failed to run job!\n");
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}
 
-		count = kfifo_out(&entity->job_queue, &sched_job,
-				  sizeof(sched_job));
-		WARN_ON(count != sizeof(sched_job));
 		wake_up(&sched->job_scheduled);
 	}
 	return 0;
@@ -675,13 +701,17 @@ static int amd_sched_main(void *param)
  */
 int amd_sched_init(struct amd_gpu_scheduler *sched,
 		   const struct amd_sched_backend_ops *ops,
-		   unsigned hw_submission, long timeout, const char *name)
+		   unsigned hw_submission,
+		   unsigned hang_limit,
+		   long timeout,
+		   const char *name)
 {
 	int i;
 
 	sched->ops = ops;
 	sched->hw_submission_limit = hw_submission;
 	sched->name = name;
 	sched->timeout = timeout;
+	sched->hang_limit = hang_limit;
 	for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++)
 		amd_sched_rq_init(&sched->sched_rq[i]);
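
The core of this change is the switch from a fixed-size kfifo of job pointers per entity to a queue of nodes embedded in each job, recovered with container_of() via the new to_amd_sched_job() macro. Note that the drain loop in amd_sched_entity_fini() applies to_amd_sched_job() to a possibly NULL pop result, which is only NULL-safe if queue_node sits at offset zero in struct amd_sched_job. Below is a minimal userspace sketch of that embedded-node pattern; the queue here is a simplified single-threaded stand-in, not the kernel's spsc_queue.h implementation:

	/* Simplified stand-in for the embedded-node queue pattern. */
	#include <stddef.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct spsc_node { struct spsc_node *next; };

	struct spsc_queue {
		struct spsc_node *head;
		struct spsc_node **tail;
	};

	struct job {
		struct spsc_node queue_node;	/* embedded, like amd_sched_job */
		int id;
	};

	#define to_job(n) container_of((n), struct job, queue_node)

	static void queue_init(struct spsc_queue *q)
	{
		q->head = NULL;
		q->tail = &q->head;
	}

	/* returns nonzero if the queue was empty, mirroring spsc_queue_push() */
	static int queue_push(struct spsc_queue *q, struct spsc_node *n)
	{
		int first = (q->head == NULL);

		n->next = NULL;
		*q->tail = n;
		q->tail = &n->next;
		return first;
	}

	static struct spsc_node *queue_pop(struct spsc_queue *q)
	{
		struct spsc_node *n = q->head;

		if (n) {
			q->head = n->next;
			if (!q->head)
				q->tail = &q->head;	/* queue became empty */
		}
		return n;
	}

	int main(void)
	{
		struct spsc_queue q;
		struct spsc_node *node;

		queue_init(&q);
		for (int i = 0; i < 3; i++) {
			struct job *j = malloc(sizeof(*j));
			if (!j)
				return 1;
			j->id = i;
			queue_push(&q, &j->queue_node);
		}

		/* drain loop shaped like the new amd_sched_entity_fini() */
		while ((node = queue_pop(&q)) != NULL) {
			struct job *j = to_job(node);
			printf("freeing job %d\n", j->id);
			free(j);
		}
		return 0;
	}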
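The second theme is hang accounting: amd_sched_hw_job_reset() now increments the karma of the job blamed for a hang, and amd_sched_job_recovery() cancels, rather than re-runs, every job whose fence context matches the first job found over hang_limit. Note that the recovery hunk declares guilty_context inside the list walk but reads it on later iterations; the sketch below hoists the declaration out of the loop, which is what the logic needs. Types and values are illustrative, not the kernel's:

	/* Sketch of the guilty-context scan added to amd_sched_job_recovery(). */
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define ECANCELED_ERR (-125)	/* stand-in for -ECANCELED */

	struct sim_job {
		uint64_t context;	/* fence context, identifies the entity */
		int karma;		/* bumped each time the job is blamed for a hang */
		int error;		/* 0, or a negative errno set instead of re-running */
	};

	static void recover(struct sim_job *jobs, int n, int hang_limit)
	{
		bool found_guilty = false;
		uint64_t guilty_context = 0;

		for (int i = 0; i < n; i++) {
			struct sim_job *j = &jobs[i];

			/* first job over the limit marks its context guilty */
			if (!found_guilty && j->karma > hang_limit) {
				found_guilty = true;
				guilty_context = j->context;
			}

			/* cancel this and all later jobs from the guilty context */
			if (found_guilty && j->context == guilty_context)
				j->error = ECANCELED_ERR;
		}
	}

	int main(void)
	{
		struct sim_job jobs[] = {
			{ .context = 1, .karma = 0 },
			{ .context = 2, .karma = 2 },	/* exceeds hang_limit of 1 */
			{ .context = 1, .karma = 0 },
			{ .context = 2, .karma = 0 },	/* same context: cancelled too */
		};

		recover(jobs, 4, 1);
		for (int i = 0; i < 4; i++)
			printf("job %d (ctx %llu): error %d\n", i,
			       (unsigned long long)jobs[i].context, jobs[i].error);
		return 0;
	}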
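The public entry points change shape as well: amd_sched_entity_init() now takes an atomic_t *guilty owned by the caller, amd_sched_entity_push_job() names the entity explicitly instead of reading the removed sched_job->s_entity, and amd_sched_init() gains a hang_limit parameter. A kernel-style sketch of an adapted caller follows; my_ctx, my_ctx_init and my_submit are hypothetical driver names, not part of this patch, and the run-queue choice is only an assumption:

	#include <linux/atomic.h>
	#include "gpu_scheduler.h"

	/* hypothetical per-context state in a driver using this scheduler */
	struct my_ctx {
		atomic_t guilty;		/* set by amd_sched_hw_job_reset() */
		struct amd_sched_entity entity;
	};

	static int my_ctx_init(struct amd_gpu_scheduler *sched, struct my_ctx *ctx)
	{
		struct amd_sched_rq *rq = &sched->sched_rq[AMD_SCHED_PRIORITY_NORMAL];

		atomic_set(&ctx->guilty, 0);
		/* 'jobs' no longer sizes a kfifo; the spsc queue grows on demand */
		return amd_sched_entity_init(sched, &ctx->entity, rq, 32, &ctx->guilty);
	}

	static void my_submit(struct my_ctx *ctx, struct amd_sched_job *job)
	{
		/* no more wait_event(): with the unbounded spsc queue,
		 * push cannot fail and never blocks */
		amd_sched_entity_push_job(job, &ctx->entity);
	}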