From e77673d14f2cec6d47d2da4e58dce87c2d66e54f Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Fri, 9 Jun 2023 11:11:53 -0400 Subject: drm/amdgpu: Update invalid PTE flag setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the invalid PTE flag setting with TF enabled. This is to ensure, in addition to transitioning the retry fault to a no-retry fault, it also causes the wavefront to enter the trap handler. With the current setting, the fault only transitions to a no-retry fault. Additionally, have 2 sets of invalid PTE settings, one for TF enabled, the other for TF disabled. The setting with TF disabled, doesn't work with TF enabled. Signed-off-by: Mukul Joshi Acked-by: Christian König Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 9c85d494f2a2..b81fcb962d8f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -84,7 +84,13 @@ struct amdgpu_mem_stats; /* PDE Block Fragment Size for VEGA10 */ #define AMDGPU_PDE_BFS(a) ((uint64_t)a << 59) +/* Flag combination to set no-retry with TF disabled */ +#define AMDGPU_VM_NORETRY_FLAGS (AMDGPU_PTE_EXECUTABLE | AMDGPU_PDE_PTE | \ + AMDGPU_PTE_TF) +/* Flag combination to set no-retry with TF enabled */ +#define AMDGPU_VM_NORETRY_FLAGS_TF (AMDGPU_PTE_VALID | AMDGPU_PTE_SYSTEM | \ + AMDGPU_PTE_PRT) /* For GFX9 */ #define AMDGPU_PTE_MTYPE_VG10(a) ((uint64_t)(a) << 57) #define AMDGPU_PTE_MTYPE_VG10_MASK AMDGPU_PTE_MTYPE_VG10(3ULL) -- cgit v1.2.3-70-g09d2 From eb58ad143dab0c9d649d702cc929f6bd4b62b455 Mon Sep 17 00:00:00 2001 From: Xiaogang Chen Date: Fri, 30 Jun 2023 11:38:35 -0500 Subject: drm/amdgpu: have bos for PDs/PTS cpu accessible when kfd uses cpu to update vm MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When kfd uses the cpu to update a vm, iterate over all current PD/PT bos, add the AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED flag to them and kmap them into the kernel virtual address space before kfd updates the vm that was created by gfx. Signed-off-by: Xiaogang Chen Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 11 ++++------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 35 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 8eda8f7ac612..92a84e7b0db8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2279,16 +2279,13 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm) goto unreserve_bo; vm->update_funcs = &amdgpu_vm_cpu_funcs; + r = amdgpu_vm_pt_map_tables(adev, vm); + if (r) + goto unreserve_bo; + } else { vm->update_funcs = &amdgpu_vm_sdma_funcs; } - /* - * Make sure root PD gets mapped. As vm_update_mode could be changed - * when turning a GFX VM into a compute VM. 
- */ - r = vm->update_funcs->map_table(to_amdgpu_bo_vm(vm->root.bo)); - if (r) - goto unreserve_bo; dma_fence_put(vm->last_update); vm->last_update = dma_fence_get_stub(); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index b81fcb962d8f..88ee4507f6b6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -497,6 +497,8 @@ void amdgpu_vm_pt_free_work(struct work_struct *work); void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m); #endif +int amdgpu_vm_pt_map_tables(struct amdgpu_device *adev, struct amdgpu_vm *vm); + /** * amdgpu_vm_tlb_seq - return tlb flush sequence number * @vm: the amdgpu_vm structure to query diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c index 31913ae86de6..6e31621452de 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c @@ -31,6 +31,7 @@ */ static int amdgpu_vm_cpu_map_table(struct amdgpu_bo_vm *table) { + table->bo.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; return amdgpu_bo_kmap(&table->bo, NULL); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 24ddf6a0512a..70fc5856a5b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -1075,3 +1075,31 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, return 0; } + +/** + * amdgpu_vm_pt_map_tables - have bo of root PD cpu accessible + * @adev: amdgpu device structure + * @vm: amdgpu vm structure + * + * make root page directory and everything below it cpu accessible. 
+ */ +int amdgpu_vm_pt_map_tables(struct amdgpu_device *adev, struct amdgpu_vm *vm) +{ + struct amdgpu_vm_pt_cursor cursor; + struct amdgpu_vm_bo_base *entry; + + for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry) { + + struct amdgpu_bo_vm *bo; + int r; + + if (entry->bo) { + bo = to_amdgpu_bo_vm(entry->bo); + r = vm->update_funcs->map_table(bo); + if (r) + return r; + } + } + + return 0; +} -- cgit v1.2.3-70-g09d2 From 5003ca63bce63b20c02c8049be46c44135939a64 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Thu, 13 Jul 2023 15:09:37 +0800 Subject: drm/amdgpu: fix slab-out-of-bounds issue in amdgpu_vm_pt_create MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent code copies the xcp_id stored in the file private data at device open into the amdgpu bo, for accounting memory usage etc. However, not all VMs are embedded in an fpriv structure (for example the VM used in amdgpu_mes_self_test), so deriving fpriv from the VM makes KASAN complain about the out-of-bounds access shown below. More importantly, VM code should not touch the fpriv structure, so drop the fpriv handling from amdgpu_vm_pt. [ 77.292314] BUG: KASAN: slab-out-of-bounds in amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] [ 77.293845] Read of size 4 at addr ffff888102c48a48 by task modprobe/1069 [ 77.294146] Call Trace: [ 77.294178] [ 77.294208] dump_stack_lvl+0x49/0x63 [ 77.294260] print_report+0x16f/0x4a6 [ 77.294307] ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] [ 77.295979] ? kasan_complete_mode_report_info+0x3c/0x200 [ 77.296057] ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] [ 77.297556] kasan_report+0xb4/0x130 [ 77.297609] ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] [ 77.299202] __asan_load4+0x6f/0x90 [ 77.299272] amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] [ 77.300796] ? amdgpu_init+0x6e/0x1000 [amdgpu] [ 77.302222] ? amdgpu_vm_pt_clear+0x750/0x750 [amdgpu] [ 77.303721] ? preempt_count_sub+0x18/0xc0 [ 77.303786] amdgpu_vm_init+0x39e/0x870 [amdgpu] [ 77.305186] ? amdgpu_vm_wait_idle+0x90/0x90 [amdgpu] [ 77.306683] ? 
kasan_set_track+0x25/0x30 [ 77.306737] ? kasan_save_alloc_info+0x1b/0x30 [ 77.306795] ? __kasan_kmalloc+0x87/0xa0 [ 77.306852] amdgpu_mes_self_test+0x169/0x620 [amdgpu] v2: without specifying xcp partition for PD/PT bo, the xcp id is -1. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2686 Fixes: 3ebfd221c1a8 ("drm/amdkfd: Store xcp partition id to amdgpu bo") Signed-off-by: Guchun Chen Tested-by: Mikhail Gavrilov Reviewed-by: Felix Kuehling Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 5 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 5 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 11 ++++++----- 5 files changed, 14 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 53a024cf0544..cab2fdd5b76a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1236,7 +1236,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) if (r) goto error_pasid; - r = amdgpu_vm_init(adev, &fpriv->vm); + r = amdgpu_vm_init(adev, &fpriv->vm, fpriv->xcp_id); if (r) goto error_pasid; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index e9091ebfe230..f808841310fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -1382,7 +1382,7 @@ int amdgpu_mes_self_test(struct amdgpu_device *adev) goto error_pasid; } - r = amdgpu_vm_init(adev, vm); + r = amdgpu_vm_init(adev, vm, -1); if (r) { DRM_ERROR("failed to initialize vm\n"); goto error_pasid; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 32adc31c093d..74380b21e7a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2121,13 +2121,14 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout) * * @adev: amdgpu_device pointer * @vm: requested vm + * @xcp_id: GPU partition selection id * * Init @vm fields. * * Returns: * 0 for success, error for failure. */ -int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm) +int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int32_t xcp_id) { struct amdgpu_bo *root_bo; struct amdgpu_bo_vm *root; @@ -2177,7 +2178,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm) vm->evicting = false; r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level, - false, &root); + false, &root, xcp_id); if (r) goto error_free_delayed; root_bo = &root->bo; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 88ee4507f6b6..bca258c38919 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -398,7 +398,7 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm, u32 pasid); long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout); -int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm); +int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int32_t xcp_id); int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm); void amdgpu_vm_release_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm); void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm); @@ -481,7 +481,8 @@ void amdgpu_vm_get_memory(struct amdgpu_vm *vm, int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct amdgpu_bo_vm *vmbo, bool immediate); int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, - int level, bool immediate, struct amdgpu_bo_vm **vmbo); + int level, bool immediate, struct amdgpu_bo_vm **vmbo, + int32_t xcp_id); void amdgpu_vm_pt_free_root(struct amdgpu_device 
*adev, struct amdgpu_vm *vm); bool amdgpu_vm_pt_is_root_clean(struct amdgpu_device *adev, struct amdgpu_vm *vm); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 70fc5856a5b9..eb52dfe64948 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -498,11 +498,12 @@ exit: * @level: the page table level * @immediate: use a immediate update * @vmbo: pointer to the buffer object pointer + * @xcp_id: GPU partition id */ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, - int level, bool immediate, struct amdgpu_bo_vm **vmbo) + int level, bool immediate, struct amdgpu_bo_vm **vmbo, + int32_t xcp_id) { - struct amdgpu_fpriv *fpriv = container_of(vm, struct amdgpu_fpriv, vm); struct amdgpu_bo_param bp; struct amdgpu_bo *bo; struct dma_resv *resv; @@ -535,7 +536,7 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, bp.type = ttm_bo_type_kernel; bp.no_wait_gpu = immediate; - bp.xcp_id_plus1 = fpriv->xcp_id == ~0 ? 0 : fpriv->xcp_id + 1; + bp.xcp_id_plus1 = xcp_id + 1; if (vm->root.bo) bp.resv = vm->root.bo->tbo.base.resv; @@ -561,7 +562,7 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, bp.type = ttm_bo_type_kernel; bp.resv = bo->tbo.base.resv; bp.bo_ptr_size = sizeof(struct amdgpu_bo); - bp.xcp_id_plus1 = fpriv->xcp_id == ~0 ? 0 : fpriv->xcp_id + 1; + bp.xcp_id_plus1 = xcp_id + 1; r = amdgpu_bo_create(adev, &bp, &(*vmbo)->shadow); @@ -606,7 +607,7 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev, return 0; amdgpu_vm_eviction_unlock(vm); - r = amdgpu_vm_pt_create(adev, vm, cursor->level, immediate, &pt); + r = amdgpu_vm_pt_create(adev, vm, cursor->level, immediate, &pt, 0); amdgpu_vm_eviction_lock(vm); if (r) return r; -- cgit v1.2.3-70-g09d2