From c98171ccf6580407d07a3b5dc8188ce9e1f4f7ca Mon Sep 17 00:00:00 2001 From: Felix Kuehling <Felix.Kuehling@amd.com> Date: Thu, 21 Sep 2017 16:26:41 -0400 Subject: drm/amdgpu: Handle GPUVM fault storms When many wavefronts cause VM faults at the same time, it can overwhelm the interrupt handler and cause IH ring overflows before the driver can notify or kill the faulting application. As a workaround I'm introducing limited per-VM fault credit. After that number of VM faults have occurred, further VM faults are filtered out at the prescreen stage of processing. This depends on the PASID in the interrupt packet, so it currently only works for KFD contexts. Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/cik_ih.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/cik_ih.c') diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c index 07d3d895da10..a870b354e3f7 100644 --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c @@ -237,8 +237,23 @@ static u32 cik_ih_get_wptr(struct amdgpu_device *adev) */ static bool cik_ih_prescreen_iv(struct amdgpu_device *adev) { - /* Process all interrupts */ - return true; + u32 ring_index = adev->irq.ih.rptr >> 2; + u16 pasid; + + switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) { + case 146: + case 147: + pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16; + if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid)) + return true; + break; + default: + /* Not a VM fault */ + return true; + } + + adev->irq.ih.rptr += 16; + return false; } /** -- cgit v1.2.3-70-g09d2