summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/scheduler/gpu_scheduler.c')
-rw-r--r--drivers/gpu/drm/amd/scheduler/gpu_scheduler.c134
1 files changed, 82 insertions, 52 deletions
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 92ec663fdada..dcb987e6d94a 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -28,9 +28,14 @@
#include <drm/drmP.h>
#include "gpu_scheduler.h"
+#include "spsc_queue.h"
+
#define CREATE_TRACE_POINTS
#include "gpu_sched_trace.h"
+#define to_amd_sched_job(sched_job) \
+ container_of((sched_job), struct amd_sched_job, queue_node)
+
static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity);
static void amd_sched_wakeup(struct amd_gpu_scheduler *sched);
static void amd_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
@@ -121,10 +126,8 @@ amd_sched_rq_select_entity(struct amd_sched_rq *rq)
int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
struct amd_sched_entity *entity,
struct amd_sched_rq *rq,
- uint32_t jobs)
+ uint32_t jobs, atomic_t *guilty)
{
- int r;
-
if (!(sched && entity && rq))
return -EINVAL;
@@ -132,12 +135,11 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
INIT_LIST_HEAD(&entity->list);
entity->rq = rq;
entity->sched = sched;
+ entity->guilty = guilty;
spin_lock_init(&entity->rq_lock);
spin_lock_init(&entity->queue_lock);
- r = kfifo_alloc(&entity->job_queue, jobs * sizeof(void *), GFP_KERNEL);
- if (r)
- return r;
+ spsc_queue_init(&entity->job_queue);
atomic_set(&entity->fence_seq, 0);
entity->fence_context = dma_fence_context_alloc(2);
@@ -170,7 +172,7 @@ static bool amd_sched_entity_is_initialized(struct amd_gpu_scheduler *sched,
static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
{
rmb();
- if (kfifo_is_empty(&entity->job_queue))
+ if (spsc_queue_peek(&entity->job_queue) == NULL)
return true;
return false;
@@ -185,7 +187,7 @@ static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
*/
static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity)
{
- if (kfifo_is_empty(&entity->job_queue))
+ if (spsc_queue_peek(&entity->job_queue) == NULL)
return false;
if (READ_ONCE(entity->dependency))
@@ -227,17 +229,23 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
*/
kthread_park(sched->thread);
kthread_unpark(sched->thread);
- while (kfifo_out(&entity->job_queue, &job, sizeof(job))) {
+ if (entity->dependency) {
+ dma_fence_remove_callback(entity->dependency,
+ &entity->cb);
+ dma_fence_put(entity->dependency);
+ entity->dependency = NULL;
+ }
+
+ while ((job = to_amd_sched_job(spsc_queue_pop(&entity->job_queue)))) {
struct amd_sched_fence *s_fence = job->s_fence;
amd_sched_fence_scheduled(s_fence);
dma_fence_set_error(&s_fence->finished, -ESRCH);
amd_sched_fence_finished(s_fence);
+ WARN_ON(s_fence->parent);
dma_fence_put(&s_fence->finished);
sched->ops->free_job(job);
}
-
}
- kfifo_free(&entity->job_queue);
}
static void amd_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
@@ -332,40 +340,44 @@ static bool amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity)
}
static struct amd_sched_job *
-amd_sched_entity_peek_job(struct amd_sched_entity *entity)
+amd_sched_entity_pop_job(struct amd_sched_entity *entity)
{
struct amd_gpu_scheduler *sched = entity->sched;
- struct amd_sched_job *sched_job;
+ struct amd_sched_job *sched_job = to_amd_sched_job(
+ spsc_queue_peek(&entity->job_queue));
- if (!kfifo_out_peek(&entity->job_queue, &sched_job, sizeof(sched_job)))
+ if (!sched_job)
return NULL;
- while ((entity->dependency = sched->ops->dependency(sched_job)))
+ while ((entity->dependency = sched->ops->dependency(sched_job, entity)))
if (amd_sched_entity_add_dependency_cb(entity))
return NULL;
+ /* skip jobs from an entity that has been marked guilty */
+ if (entity->guilty && atomic_read(entity->guilty))
+ dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
+
+ spsc_queue_pop(&entity->job_queue);
return sched_job;
}
/**
- * Helper to submit a job to the job queue
+ * Submit a job to the job queue
*
* @sched_job The pointer to job required to submit
*
- * Returns true if we could submit the job.
+ * No return value (the function is declared void).
*/
-static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
+void amd_sched_entity_push_job(struct amd_sched_job *sched_job,
+ struct amd_sched_entity *entity)
{
struct amd_gpu_scheduler *sched = sched_job->sched;
- struct amd_sched_entity *entity = sched_job->s_entity;
- bool added, first = false;
+ bool first = false;
- spin_lock(&entity->queue_lock);
- added = kfifo_in(&entity->job_queue, &sched_job,
- sizeof(sched_job)) == sizeof(sched_job);
+ trace_amd_sched_job(sched_job, entity);
- if (added && kfifo_len(&entity->job_queue) == sizeof(sched_job))
- first = true;
+ spin_lock(&entity->queue_lock);
+ first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
spin_unlock(&entity->queue_lock);
@@ -377,7 +389,6 @@ static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
spin_unlock(&entity->rq_lock);
amd_sched_wakeup(sched);
}
- return added;
}
/* job_finish is called after hw fence signaled
@@ -442,9 +453,11 @@ static void amd_sched_job_timedout(struct work_struct *work)
job->sched->ops->timedout_job(job);
}
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
{
struct amd_sched_job *s_job;
+ struct amd_sched_entity *entity, *tmp;
+ int i;;
spin_lock(&sched->job_list_lock);
list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
@@ -457,6 +470,30 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
}
}
spin_unlock(&sched->job_list_lock);
+
+ if (bad && bad->s_priority != AMD_SCHED_PRIORITY_KERNEL) {
+ atomic_inc(&bad->karma);
+ /* don't increase @bad's karma if it's from KERNEL RQ,
+ * because sometimes a GPU hang would cause kernel jobs (like VM updating jobs)
+ * to be corrupted, but keep in mind that kernel jobs are always considered good.
+ */
+ for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++ ) {
+ struct amd_sched_rq *rq = &sched->sched_rq[i];
+
+ spin_lock(&rq->lock);
+ list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
+ if (bad->s_fence->scheduled.context == entity->fence_context) {
+ if (atomic_read(&bad->karma) > bad->sched->hang_limit)
+ if (entity->guilty)
+ atomic_set(entity->guilty, 1);
+ break;
+ }
+ }
+ spin_unlock(&rq->lock);
+ if (&entity->list != &rq->entities)
+ break;
+ }
+ }
}
void amd_sched_job_kickout(struct amd_sched_job *s_job)
@@ -471,6 +508,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job)
void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
{
struct amd_sched_job *s_job, *tmp;
+ bool found_guilty = false;
int r;
spin_lock(&sched->job_list_lock);
@@ -482,6 +520,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
struct amd_sched_fence *s_fence = s_job->s_fence;
struct dma_fence *fence;
+ uint64_t guilty_context;
+
+ if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
+ found_guilty = true;
+ guilty_context = s_job->s_fence->scheduled.context;
+ }
+
+ if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
+ dma_fence_set_error(&s_fence->finished, -ECANCELED);
spin_unlock(&sched->job_list_lock);
fence = sched->ops->run_job(s_job);
@@ -497,7 +544,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
r);
dma_fence_put(fence);
} else {
- DRM_ERROR("Failed to run job!\n");
amd_sched_process_job(NULL, &s_fence->cb);
}
spin_lock(&sched->job_list_lock);
@@ -505,22 +551,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
spin_unlock(&sched->job_list_lock);
}
-/**
- * Submit a job to the job queue
- *
- * @sched_job The pointer to job required to submit
- *
- * Returns 0 for success, negative error code otherwise.
- */
-void amd_sched_entity_push_job(struct amd_sched_job *sched_job)
-{
- struct amd_sched_entity *entity = sched_job->s_entity;
-
- trace_amd_sched_job(sched_job);
- wait_event(entity->sched->job_scheduled,
- amd_sched_entity_in(sched_job));
-}
-
/* init a sched_job with basic field */
int amd_sched_job_init(struct amd_sched_job *job,
struct amd_gpu_scheduler *sched,
@@ -528,7 +558,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
void *owner)
{
job->sched = sched;
- job->s_entity = entity;
+ job->s_priority = entity->rq - sched->sched_rq;
job->s_fence = amd_sched_fence_create(entity, owner);
if (!job->s_fence)
return -ENOMEM;
@@ -610,7 +640,7 @@ static int amd_sched_main(void *param)
{
struct sched_param sparam = {.sched_priority = 1};
struct amd_gpu_scheduler *sched = (struct amd_gpu_scheduler *)param;
- int r, count;
+ int r;
sched_setscheduler(current, SCHED_FIFO, &sparam);
@@ -628,7 +658,7 @@ static int amd_sched_main(void *param)
if (!entity)
continue;
- sched_job = amd_sched_entity_peek_job(entity);
+ sched_job = amd_sched_entity_pop_job(entity);
if (!sched_job)
continue;
@@ -651,13 +681,9 @@ static int amd_sched_main(void *param)
r);
dma_fence_put(fence);
} else {
- DRM_ERROR("Failed to run job!\n");
amd_sched_process_job(NULL, &s_fence->cb);
}
- count = kfifo_out(&entity->job_queue, &sched_job,
- sizeof(sched_job));
- WARN_ON(count != sizeof(sched_job));
wake_up(&sched->job_scheduled);
}
return 0;
@@ -675,13 +701,17 @@ static int amd_sched_main(void *param)
*/
int amd_sched_init(struct amd_gpu_scheduler *sched,
const struct amd_sched_backend_ops *ops,
- unsigned hw_submission, long timeout, const char *name)
+ unsigned hw_submission,
+ unsigned hang_limit,
+ long timeout,
+ const char *name)
{
int i;
sched->ops = ops;
sched->hw_submission_limit = hw_submission;
sched->name = name;
sched->timeout = timeout;
+ sched->hang_limit = hang_limit;
for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++)
amd_sched_rq_init(&sched->sched_rq[i]);