diff options
author | Chris Wilson <chris@chris-wilson.co.uk> | 2019-02-19 12:21:52 +0000 |
---|---|---|
committer | Chris Wilson <chris@chris-wilson.co.uk> | 2019-02-19 14:46:21 +0000 |
commit | 7f4127c4839b1801087e08b1797e830a766391c1 (patch) | |
tree | 9c96e1995f79b02ca35d2e45954727e48b47d612 /drivers/gpu/drm/i915/i915_gpu_error.c | |
parent | 8f54b3c6c921275d10e33746553c40294ffa0d58 (diff) |
drm/i915: Use time based guilty context banning
Currently, we accumulate each time a context hangs the GPU, offset
against the number of requests it submits, and if that score exceeds a
certain threshold, we ban that context from submitting any more requests
(cancelling any work in flight). In contrast, we use a simple timer on
the file, that if we see more than a 9 hangs faster than 60s apart in
total across all of its contexts, we will ban the client from creating
any more contexts. This leads to a confusing situation where the file
may be banned before the context, so lets use a simple timer scheme for
each.
If the context submits 3 hanging requests within a 120s period, declare
it forbidden to ever send more requests.
This has the advantage of not being easy to repair by simply sending
empty requests, but has the disadvantage that if the context is idle
then it is forgiven. However, if the context is idle, it is not
disrupting the system, but a hog can evade the request counting and
cause much more severe disruption to the system.
Updating ban_score from request retirement is dubious as the retirement
is purposely not in sync with request submission (i.e. we try and batch
retirement to reduce overhead and avoid latency on submission), which
leads to surprising situations where we can forgive a hang immediately
due to a backlog of requests from before the hang being retired
afterwards.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190219122215.8941-2-chris@chris-wilson.co.uk
Diffstat (limited to 'drivers/gpu/drm/i915/i915_gpu_error.c')
-rw-r--r-- | drivers/gpu/drm/i915/i915_gpu_error.c | 31 |
1 files changed, 8 insertions, 23 deletions
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 9a65341fec09..3f6eddb6f6de 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -434,11 +434,6 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m, ee->instdone.row[slice][subslice]); } -static const char *bannable(const struct drm_i915_error_context *ctx) -{ - return ctx->bannable ? "" : " (unbannable)"; -} - static void error_print_request(struct drm_i915_error_state_buf *m, const char *prefix, const struct drm_i915_error_request *erq, @@ -447,9 +442,8 @@ static void error_print_request(struct drm_i915_error_state_buf *m, if (!erq->seqno) return; - err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", - prefix, erq->pid, erq->ban_score, - erq->context, erq->seqno, + err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", + prefix, erq->pid, erq->context, erq->seqno, test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &erq->flags) ? "!" : "", test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, @@ -463,10 +457,9 @@ static void error_print_context(struct drm_i915_error_state_buf *m, const char *header, const struct drm_i915_error_context *ctx) { - err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n", + err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, guilty %d active %d\n", header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id, - ctx->sched_attr.priority, ctx->ban_score, bannable(ctx), - ctx->guilty, ctx->active); + ctx->sched_attr.priority, ctx->guilty, ctx->active); } static void error_print_engine(struct drm_i915_error_state_buf *m, @@ -688,12 +681,10 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, if (!error->engine[i].context.pid) continue; - err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n", + err_printf(m, "Active process (on ring %s): %s [%d]\n", engine_name(m->i915, i), error->engine[i].context.comm, - error->engine[i].context.pid, - error->engine[i].context.ban_score, - bannable(&error->engine[i].context)); + error->engine[i].context.pid); } err_printf(m, "Reset count: %u\n", error->reset_count); err_printf(m, "Suspend count: %u\n", error->suspend_count); @@ -779,13 +770,11 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, if (obj) { err_puts(m, m->i915->engine[i]->name); if (ee->context.pid) - err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)", + err_printf(m, " (submitted by %s [%d], ctx %d [%d])", ee->context.comm, ee->context.pid, ee->context.handle, - ee->context.hw_id, - ee->context.ban_score, - bannable(&ee->context)); + ee->context.hw_id); err_printf(m, " --- gtt_offset = 0x%08x %08x\n", upper_32_bits(obj->gtt_offset), lower_32_bits(obj->gtt_offset)); @@ -1301,8 +1290,6 @@ static void record_request(struct i915_request *request, erq->flags = request->fence.flags; erq->context = ctx->hw_id; erq->sched_attr = request->sched.attr; - erq->ban_score = atomic_read(&ctx->ban_score); - erq->seqno = request->global_seqno; erq->jiffies = request->emitted_jiffies; erq->start = i915_ggtt_offset(request->ring->vma); erq->head = request->head; @@ -1396,8 +1383,6 @@ static void record_context(struct drm_i915_error_context *e, e->handle = ctx->user_handle; e->hw_id = ctx->hw_id; e->sched_attr = ctx->sched; - e->ban_score = atomic_read(&ctx->ban_score); - e->bannable = i915_gem_context_is_bannable(ctx); e->guilty = atomic_read(&ctx->guilty_count); e->active = atomic_read(&ctx->active_count); } |