From 4f860edecdafeb2e5fb29fecc6428090997936fe Mon Sep 17 00:00:00 2001 From: Somalapuram Amaranath Date: Mon, 17 Jan 2022 13:19:10 +0530 Subject: drm/amdgpu: limit the number of dst address in trace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit trace_amdgpu_vm_update_ptes trace unable to log when nptes too large Signed-off-by: Somalapuram Amaranath Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index b37fc7d7d2c7..4017bcf75e1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1634,7 +1634,7 @@ static int amdgpu_vm_update_ptes(struct amdgpu_vm_update_params *params, nptes = max(nptes, 1u); trace_amdgpu_vm_update_ptes(params, frag_start, upd_end, - nptes, dst, incr, upd_flags, + min(nptes, 32u), dst, incr, upd_flags, vm->task_info.pid, vm->immediate.fence_context); amdgpu_vm_update_flags(params, to_amdgpu_bo_vm(pt), -- cgit v1.2.3-70-g09d2 From 2d022081b333a7f15ba27607696d4a41a7a2b5f9 Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 1 Feb 2022 16:21:04 +0100 Subject: drm/amdgpu: add some lockdep checks to the VM code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Whenever a bo_va structure is added or removed the VM and eventually added BO should be locked. Signed-off-by: Christian König Reviewed-by: Alex Deucher Reviewed-by: Felix Kuehling Acked-by: Daniel Vetter Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 4017bcf75e1f..5a67ac0b6b7a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2244,6 +2244,8 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev, { struct amdgpu_bo_va *bo_va; + dma_resv_assert_held(vm->root.bo->tbo.base.resv); + bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL); if (bo_va == NULL) { return NULL; @@ -2257,6 +2259,7 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev, if (!bo) return bo_va; + dma_resv_assert_held(bo->tbo.base.resv); if (amdgpu_dmabuf_is_xgmi_accessible(adev, bo)) { bo_va->is_xgmi = true; /* Power up XGMI if it can be potentially used */ @@ -2651,7 +2654,10 @@ void amdgpu_vm_bo_rmv(struct amdgpu_device *adev, struct amdgpu_vm *vm = bo_va->base.vm; struct amdgpu_vm_bo_base **base; + dma_resv_assert_held(vm->root.bo->tbo.base.resv); + if (bo) { + dma_resv_assert_held(bo->tbo.base.resv); if (bo->tbo.base.resv == vm->root.bo->tbo.base.resv) vm->bulk_moveable = false; -- cgit v1.2.3-70-g09d2 From e56694f718f0f6694c18d7595e61533a2663335e Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 1 Feb 2022 16:25:52 +0100 Subject: drm/amdgpu: rename amdgpu_vm_bo_rmv to _del MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some people complained about the name and this matches much more Linux naming conventions for object functions. Signed-off-by: Christian König Reviewed-by: Alex Deucher Reviewed-by: Felix Kuehling Acked-by: Daniel Vetter Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 5df387c4d7fb..5d00a6878ef2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -778,7 +778,7 @@ unwind: continue; if (attachment[i]->bo_va) { amdgpu_bo_reserve(bo[i], true); - amdgpu_vm_bo_rmv(adev, attachment[i]->bo_va); + amdgpu_vm_bo_del(adev, attachment[i]->bo_va); amdgpu_bo_unreserve(bo[i]); list_del(&attachment[i]->list); } @@ -795,7 +795,7 @@ static void kfd_mem_detach(struct kfd_mem_attachment *attachment) pr_debug("\t remove VA 0x%llx in entry %p\n", attachment->va, attachment); - amdgpu_vm_bo_rmv(attachment->adev, attachment->bo_va); + amdgpu_vm_bo_del(attachment->adev, attachment->bo_va); drm_gem_object_put(&bo->tbo.base); list_del(&attachment->list); kfree(attachment); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c index da21e60bb827..c6d4d41c4393 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c @@ -98,7 +98,7 @@ int amdgpu_map_static_csa(struct amdgpu_device *adev, struct amdgpu_vm *vm, if (r) { DRM_ERROR("failed to do bo_map on static CSA, err=%d\n", r); - amdgpu_vm_bo_rmv(adev, *bo_va); + amdgpu_vm_bo_del(adev, *bo_va); ttm_eu_backoff_reservation(&ticket, &list); return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index c0d8f40a5b45..bab6500728cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -222,7 +222,7 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj, if (!bo_va || --bo_va->ref_count) goto out_unlock; - amdgpu_vm_bo_rmv(adev, bo_va); + amdgpu_vm_bo_del(adev, bo_va); if (!amdgpu_vm_ready(vm)) goto out_unlock; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index c2cb345b1421..d1970f37bc88 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1188,12 +1188,12 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev, if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_VCE) != NULL) amdgpu_vce_free_handles(adev, file_priv); - amdgpu_vm_bo_rmv(adev, fpriv->prt_va); + amdgpu_vm_bo_del(adev, fpriv->prt_va); if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { /* TODO: how to handle reserve failure */ BUG_ON(amdgpu_bo_reserve(adev->virt.csa_obj, true)); - amdgpu_vm_bo_rmv(adev, fpriv->csa_va); + amdgpu_vm_bo_del(adev, fpriv->csa_va); fpriv->csa_va = NULL; amdgpu_bo_unreserve(adev->virt.csa_obj); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 5a67ac0b6b7a..e0a2f5211b73 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2637,7 +2637,7 @@ void amdgpu_vm_bo_trace_cs(struct amdgpu_vm *vm, struct ww_acquire_ctx *ticket) } /** - * amdgpu_vm_bo_rmv - remove a bo to a specific vm + * amdgpu_vm_bo_del - remove a bo from a specific vm * * @adev: amdgpu_device pointer * @bo_va: requested bo_va @@ -2646,7 +2646,7 @@ void amdgpu_vm_bo_trace_cs(struct amdgpu_vm *vm, struct ww_acquire_ctx *ticket) * * Object have to be reserved! */ -void amdgpu_vm_bo_rmv(struct amdgpu_device *adev, +void amdgpu_vm_bo_del(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va) { struct amdgpu_bo_va_mapping *mapping, *next; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 85fcfb8c5efd..a40a6a993bb0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -435,7 +435,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev, struct amdgpu_bo_va_mapping *amdgpu_vm_bo_lookup_mapping(struct amdgpu_vm *vm, uint64_t addr); void amdgpu_vm_bo_trace_cs(struct amdgpu_vm *vm, struct ww_acquire_ctx *ticket); -void amdgpu_vm_bo_rmv(struct amdgpu_device *adev, +void amdgpu_vm_bo_del(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va); void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size, uint32_t fragment_size_default, unsigned max_level, -- cgit v1.2.3-70-g09d2 From d7d7ddc15672940be0dbbe03e016c5bb617256b8 Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 4 Feb 2022 09:42:22 +0100 Subject: drm/amdgpu: move lockdep assert to the right place. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since newly added BOs don't have any mappings it's ok to add them without holding the VM lock. Only when we add per VM BOs the lock is mandatory. Signed-off-by: Christian König Reported-by: Bhardwaj, Rajneesh Acked-by: Alex Deucher Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index e0a2f5211b73..37acd8911168 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -375,6 +375,8 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv) return; + dma_resv_assert_held(vm->root.bo->tbo.base.resv); + vm->bulk_moveable = false; if (bo->tbo.type == ttm_bo_type_kernel && bo->parent) amdgpu_vm_bo_relocated(base); @@ -2244,8 +2246,6 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev, { struct amdgpu_bo_va *bo_va; - dma_resv_assert_held(vm->root.bo->tbo.base.resv); - bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL); if (bo_va == NULL) { return NULL; -- cgit v1.2.3-70-g09d2 From c1a66c3bc425ff93774fb2f6eefa67b83170dd7e Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Mon, 21 Feb 2022 17:53:56 +0800 Subject: drm/amdgpu: check vm ready by amdgpu_vm->evicting flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workstation application ANSA/META v21.1.4 get this error dmesg when running CI test suite provided by ANSA/META: [drm:amdgpu_gem_va_ioctl [amdgpu]] *ERROR* Couldn't update BO_VA (-16) This is caused by: 1. create a 256MB buffer in invisible VRAM 2. CPU map the buffer and access it causes vm_fault and try to move it to visible VRAM 3. force visible VRAM space and traverse all VRAM bos to check if evicting this bo is valuable 4. when checking a VM bo (in invisible VRAM), amdgpu_vm_evictable() will set amdgpu_vm->evicting, but latter due to not in visible VRAM, won't really evict it so not add it to amdgpu_vm->evicted 5. before next CS to clear the amdgpu_vm->evicting, user VM ops ioctl will pass amdgpu_vm_ready() (check amdgpu_vm->evicted) but fail in amdgpu_vm_bo_update_mapping() (check amdgpu_vm->evicting) and get this error log This error won't affect functionality as next CS will finish the waiting VM ops. But we'd better clear the error log by checking the amdgpu_vm->evicting flag in amdgpu_vm_ready() to stop calling amdgpu_vm_bo_update_mapping() later. Another reason is amdgpu_vm->evicted list holds all BOs (both user buffer and page table), but only page table BOs' eviction prevent VM ops. amdgpu_vm->evicting flag is set only for page table BOs, so we should use evicting flag instead of evicted list in amdgpu_vm_ready(). The side effect of this change is: previously blocked VM op (user buffer in "evicted" list but no page table in it) gets done immediately. v2: update commit comments. Acked-by: Paul Menzel Reviewed-by: Christian König Signed-off-by: Qiang Yu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index b37fc7d7d2c7..d62190b3dd9b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -768,11 +768,16 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, * Check if all VM PDs/PTs are ready for updates * * Returns: - * True if eviction list is empty. + * True if VM is not evicting. */ bool amdgpu_vm_ready(struct amdgpu_vm *vm) { - return list_empty(&vm->evicted); + bool ret; + + amdgpu_vm_eviction_lock(vm); + ret = !vm->evicting; + amdgpu_vm_eviction_unlock(vm); + return ret; } /** -- cgit v1.2.3-70-g09d2 From b6901d93cc126bbfbdc6caf5f0c03b82945e43f2 Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Tue, 1 Mar 2022 14:11:59 +0800 Subject: drm/amdgpu: fix suspend/resume hang regression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Regression has been reported that suspend/resume may hang with the previous vm ready check commit. So bring back the evicted list check as a temp fix. Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1922 Fixes: c1a66c3bc425 ("drm/amdgpu: check vm ready by amdgpu_vm->evicting flag") Reviewed-by: Christian König Signed-off-by: Qiang Yu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 2cd9f1a2e5fa..fc4563cf2828 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -779,7 +779,8 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm) amdgpu_vm_eviction_lock(vm); ret = !vm->evicting; amdgpu_vm_eviction_unlock(vm); - return ret; + + return ret && list_empty(&vm->evicted); } /** -- cgit v1.2.3-70-g09d2