diff options
Diffstat (limited to 'drivers/gpu/drm/amd/scheduler/gpu_scheduler.c')
-rw-r--r-- | drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 39 |
1 files changed, 23 insertions, 16 deletions
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index f116de798204..941b5920b97b 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -345,6 +345,10 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity) if (amd_sched_entity_add_dependency_cb(entity)) return NULL; + /* skip jobs from entity that marked guilty */ + if (entity->guilty && atomic_read(entity->guilty)) + dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED); + spsc_queue_pop(&entity->job_queue); return sched_job; } @@ -441,14 +445,6 @@ static void amd_sched_job_timedout(struct work_struct *work) job->sched->ops->timedout_job(job); } -static void amd_sched_set_guilty(struct amd_sched_job *s_job, - struct amd_sched_entity *s_entity) -{ - if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit) - if (s_entity->guilty) - atomic_set(s_entity->guilty, 1); -} - void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad) { struct amd_sched_job *s_job; @@ -468,21 +464,24 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo spin_unlock(&sched->job_list_lock); if (bad) { - bool found = false; - - for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) { + /* don't increase @bad's karma if it's from KERNEL RQ, + * becuase sometimes GPU hang would cause kernel jobs (like VM updating jobs) + * corrupt but keep in mind that kernel jobs always considered good. + */ + for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++ ) { struct amd_sched_rq *rq = &sched->sched_rq[i]; spin_lock(&rq->lock); list_for_each_entry_safe(entity, tmp, &rq->entities, list) { if (bad->s_fence->scheduled.context == entity->fence_context) { - found = true; - amd_sched_set_guilty(bad, entity); + if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit) + if (entity->guilty) + atomic_set(entity->guilty, 1); break; } } spin_unlock(&rq->lock); - if (found) + if (&entity->list != &rq->entities) break; } } @@ -500,6 +499,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job) void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) { struct amd_sched_job *s_job, *tmp; + bool found_guilty = false; int r; spin_lock(&sched->job_list_lock); @@ -511,6 +511,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { struct amd_sched_fence *s_fence = s_job->s_fence; struct dma_fence *fence; + uint64_t guilty_context; + + if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) { + found_guilty = true; + guilty_context = s_job->s_fence->scheduled.context; + } + + if (found_guilty && s_job->s_fence->scheduled.context == guilty_context) + dma_fence_set_error(&s_fence->finished, -ECANCELED); spin_unlock(&sched->job_list_lock); fence = sched->ops->run_job(s_job); @@ -526,7 +535,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) r); dma_fence_put(fence); } else { - DRM_ERROR("Failed to run job!\n"); amd_sched_process_job(NULL, &s_fence->cb); } spin_lock(&sched->job_list_lock); @@ -664,7 +672,6 @@ static int amd_sched_main(void *param) r); dma_fence_put(fence); } else { - DRM_ERROR("Failed to run job!\n"); amd_sched_process_job(NULL, &s_fence->cb); } |