From a4c63cafa58b4bd9e15511bab77a4752b93d3aa0 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 30 Nov 2021 16:19:03 -0500 Subject: drm/amdgpu: Introduce reset domain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defined a reset_domain struct such that all the entities that go through reset together will be serialized one against another. Do it for both single device and XGMI hive cases. Signed-off-by: Andrey Grodzovsky Suggested-by: Daniel Vetter Suggested-by: Christian König Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74111.html --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ed077de426d9..9704b0e1fd82 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2398,9 +2398,27 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (r) goto init_failed; - if (adev->gmc.xgmi.num_physical_nodes > 1) + if (adev->gmc.xgmi.num_physical_nodes > 1) { + struct amdgpu_hive_info *hive; + amdgpu_xgmi_add_device(adev); + hive = amdgpu_get_xgmi_hive(adev); + if (!hive || !hive->reset_domain.wq) { + DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", hive->hive_id); + r = -EINVAL; + goto init_failed; + } + + adev->reset_domain.wq = hive->reset_domain.wq; + } else { + adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0); + if (!adev->reset_domain.wq) { + r = -ENOMEM; + goto init_failed; + } + } + /* Don't init kfd if whole hive need to be reset during init */ if (!adev->gmc.xgmi.pending_reset) amdgpu_amdkfd_device_init(adev); -- cgit v1.2.3-70-g09d2 From 5fd8518d187ed03403a4d4f7f56f52c00b11c148 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Mon, 6 Dec 2021 14:59:35 -0500 Subject: drm/amdgpu: Move scheduler init to after XGMI is ready MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before we initialize schedulers we must know which reset domain are we in - for single device there iis a single domain per device and so single wq per device. For XGMI the reset domain spans the entire XGMI hive and so the reset wq is per hive. Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74112.html --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 43 ++++------------------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 5 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 6 ++-- 4 files changed, 56 insertions(+), 43 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9704b0e1fd82..00123b0013d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2287,6 +2287,47 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) return r; } +static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) +{ + long timeout; + int r, i; + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + /* No need to setup the GPU scheduler for rings that don't need it */ + if (!ring || ring->no_scheduler) + continue; + + switch (ring->funcs->type) { + case AMDGPU_RING_TYPE_GFX: + timeout = adev->gfx_timeout; + break; + case AMDGPU_RING_TYPE_COMPUTE: + timeout = adev->compute_timeout; + break; + case AMDGPU_RING_TYPE_SDMA: + timeout = adev->sdma_timeout; + break; + default: + timeout = adev->video_timeout; + break; + } + + r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, + ring->num_hw_submission, amdgpu_job_hang_limit, + timeout, adev->reset_domain.wq, ring->sched_score, ring->name); + if (r) { + DRM_ERROR("Failed to create scheduler on ring %s.\n", + ring->name); + return r; + } + } + + return 0; +} + + /** * amdgpu_device_ip_init - run init for hardware IPs * @@ -2419,6 +2460,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) } } + r = amdgpu_device_init_schedulers(adev); + if (r) + goto init_failed; + /* Don't init kfd if whole hive need to be reset during init */ if (!adev->gmc.xgmi.pending_reset) amdgpu_amdkfd_device_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 45977a72b5dd..5d13ed376ab4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -446,24 +446,18 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring, * for the requested ring. * * @ring: ring to init the fence driver on - * @num_hw_submission: number of entries on the hardware queue - * @sched_score: optional score atomic shared with other schedulers * * Init the fence driver for the requested ring (all asics). * Helper function for amdgpu_fence_driver_init(). */ -int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, - unsigned num_hw_submission, - atomic_t *sched_score) +int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; - long timeout; - int r; if (!adev) return -EINVAL; - if (!is_power_of_2(num_hw_submission)) + if (!is_power_of_2(ring->num_hw_submission)) return -EINVAL; ring->fence_drv.cpu_addr = NULL; @@ -474,41 +468,14 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, timer_setup(&ring->fence_drv.fallback_timer, amdgpu_fence_fallback, 0); - ring->fence_drv.num_fences_mask = num_hw_submission * 2 - 1; + ring->fence_drv.num_fences_mask = ring->num_hw_submission * 2 - 1; spin_lock_init(&ring->fence_drv.lock); - ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *), + ring->fence_drv.fences = kcalloc(ring->num_hw_submission * 2, sizeof(void *), GFP_KERNEL); + if (!ring->fence_drv.fences) return -ENOMEM; - /* No need to setup the GPU scheduler for rings that don't need it */ - if (ring->no_scheduler) - return 0; - - switch (ring->funcs->type) { - case AMDGPU_RING_TYPE_GFX: - timeout = adev->gfx_timeout; - break; - case AMDGPU_RING_TYPE_COMPUTE: - timeout = adev->compute_timeout; - break; - case AMDGPU_RING_TYPE_SDMA: - timeout = adev->sdma_timeout; - break; - default: - timeout = adev->video_timeout; - break; - } - - r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, - num_hw_submission, amdgpu_job_hang_limit, - timeout, NULL, sched_score, ring->name); - if (r) { - DRM_ERROR("Failed to create scheduler on ring %s.\n", - ring->name); - return r; - } - return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index ab2351ba9574..35bcb6dc1816 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -191,8 +191,9 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring, ring->adev = adev; ring->idx = adev->num_rings++; adev->rings[ring->idx] = ring; - r = amdgpu_fence_driver_init_ring(ring, sched_hw_submission, - sched_score); + ring->num_hw_submission = sched_hw_submission; + ring->sched_score = sched_score; + r = amdgpu_fence_driver_init_ring(ring); if (r) return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index fae7d185ad0d..48365da213dc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -114,9 +114,7 @@ struct amdgpu_fence_driver { void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring); void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring); -int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, - unsigned num_hw_submission, - atomic_t *sched_score); +int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring); int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring, struct amdgpu_irq_src *irq_src, unsigned irq_type); @@ -251,6 +249,8 @@ struct amdgpu_ring { bool has_compute_vm_bug; bool no_scheduler; int hw_prio; + unsigned num_hw_submission; + atomic_t *sched_score; }; #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib))) -- cgit v1.2.3-70-g09d2 From 54f329cc7a7a7ea265c45b206d45e3d09192aba7 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 17 Dec 2021 13:05:15 -0500 Subject: drm/amdgpu: Serialize non TDR gpu recovery with TDRs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use reset domain wq also for non TDR gpu recovery trigers such as sysfs and RAS. We must serialize all possible GPU recoveries to gurantee no concurrency there. For TDR call the original recovery function directly since it's already executed from within the wq. For others just use a wrapper to qeueue work and wait on it to finish. v2: Rename to amdgpu_recover_work_struct Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74113.html --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 33 +++++++++++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 +- 3 files changed, 35 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index b76c1cfad7f1..540a38fe5cd6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1298,6 +1298,8 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev); bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev); int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job); +int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, + struct amdgpu_job *job); void amdgpu_device_pci_config_reset(struct amdgpu_device *adev); int amdgpu_device_pci_reset(struct amdgpu_device *adev); bool amdgpu_device_need_post(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 00123b0013d3..15e8fde3ac2d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5033,7 +5033,7 @@ retry: * Returns 0 for success or an error on failure. */ -int amdgpu_device_gpu_recover(struct amdgpu_device *adev, +int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, struct amdgpu_job *job) { struct list_head device_list, *device_list_handle = NULL; @@ -5292,6 +5292,37 @@ skip_recovery: return r; } +struct amdgpu_recover_work_struct { + struct work_struct base; + struct amdgpu_device *adev; + struct amdgpu_job *job; + int ret; +}; + +static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work) +{ + struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base); + + recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job); +} +/* + * Serialize gpu recover into reset domain single threaded wq + */ +int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + struct amdgpu_job *job) +{ + struct amdgpu_recover_work_struct work = {.adev = adev, .job = job}; + + INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); + + if (!queue_work(adev->reset_domain.wq, &work.base)) + return -EAGAIN; + + flush_work(&work.base); + + return work.ret; +} + /** * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot * diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index bfc47bea23db..38c9fd7b7ad4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -63,7 +63,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) ti.process_name, ti.tgid, ti.task_name, ti.pid); if (amdgpu_device_should_recover_gpu(ring->adev)) { - amdgpu_device_gpu_recover(ring->adev, job); + amdgpu_device_gpu_recover_imp(ring->adev, job); } else { drm_sched_suspend_timeout(&ring->sched); if (amdgpu_sriov_vf(adev)) -- cgit v1.2.3-70-g09d2 From 681260df4dad45337b14ba762f94b402204e9ac3 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 15 Dec 2021 16:55:25 -0500 Subject: drm/amdgpu: Drop hive->in_reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we serialize all resets no need to protect from concurrent resets. Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74115.html --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +------------------ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 1 - 3 files changed, 1 insertion(+), 20 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 15e8fde3ac2d..7e92f2432087 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5067,26 +5067,10 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, dev_info(adev->dev, "GPU %s begin!\n", need_emergency_restart ? "jobs stop":"reset"); - /* - * Here we trylock to avoid chain of resets executing from - * either trigger by jobs on different adevs in XGMI hive or jobs on - * different schedulers for same device while this TO handler is running. - * We always reset all schedulers for device and all devices for XGMI - * hive so that should take care of them too. - */ if (!amdgpu_sriov_vf(adev)) hive = amdgpu_get_xgmi_hive(adev); - if (hive) { - if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { - DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", - job ? job->base.id : -1, hive->hive_id); - amdgpu_put_xgmi_hive(hive); - if (job && job->vm) - drm_sched_increase_karma(&job->base); - return 0; - } + if (hive) mutex_lock(&hive->hive_lock); - } reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; @@ -5282,7 +5266,6 @@ skip_sched_resume: skip_recovery: if (hive) { - atomic_set(&hive->in_reset, 0); mutex_unlock(&hive->hive_lock); amdgpu_put_xgmi_hive(hive); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index d406897346d6..89b682afe821 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -410,7 +410,6 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) INIT_LIST_HEAD(&hive->device_list); INIT_LIST_HEAD(&hive->node); mutex_init(&hive->hive_lock); - atomic_set(&hive->in_reset, 0); atomic_set(&hive->number_devices, 0); task_barrier_init(&hive->tb); hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 6121aaa292cb..2f2ce53645a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -33,7 +33,6 @@ struct amdgpu_hive_info { struct list_head node; atomic_t number_devices; struct mutex hive_lock; - atomic_t in_reset; int hi_req_count; struct amdgpu_device *hi_req_gpu; struct task_barrier tb; -- cgit v1.2.3-70-g09d2 From f287a3c5b03f51efa8d8f3e141a79177f91047e0 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Thu, 16 Dec 2021 14:24:43 -0500 Subject: drm/amdgpu: Drop concurrent GPU reset protection for device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since now all GPU resets are serialzied there is no need for this. This patch also reverts 'drm/amdgpu: race issue when jobs on 2 ring timeout' Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74119.html --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 89 +++--------------------------- 1 file changed, 7 insertions(+), 82 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7e92f2432087..e3c0ec684a85 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4817,11 +4817,10 @@ end: return r; } -static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, +static void amdgpu_device_lock_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) { - if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) - return false; + atomic_set(&adev->in_gpu_reset, 1); if (hive) { down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); @@ -4840,8 +4839,6 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, adev->mp1_state = PP_MP1_STATE_NONE; break; } - - return true; } static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) @@ -4852,46 +4849,6 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) up_write(&adev->reset_sem); } -/* - * to lockup a list of amdgpu devices in a hive safely, if not a hive - * with multiple nodes, it will be similar as amdgpu_device_lock_adev. - * - * unlock won't require roll back. - */ -static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) -{ - struct amdgpu_device *tmp_adev = NULL; - - if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { - if (!hive) { - dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes"); - return -ENODEV; - } - list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { - if (!amdgpu_device_lock_adev(tmp_adev, hive)) - goto roll_back; - } - } else if (!amdgpu_device_lock_adev(adev, hive)) - return -EAGAIN; - - return 0; -roll_back: - if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) { - /* - * if the lockup iteration break in the middle of a hive, - * it may means there may has a race issue, - * or a hive device locked up independently. - * we may be in trouble and may not, so will try to roll back - * the lock and give out a warnning. - */ - dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock"); - list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) { - amdgpu_device_unlock_adev(tmp_adev); - } - } - return -EAGAIN; -} - static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) { struct pci_dev *p = NULL; @@ -5078,22 +5035,6 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, reset_context.hive = hive; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - /* - * lock the device before we try to operate the linked list - * if didn't get the device lock, don't touch the linked list since - * others may iterating it. - */ - r = amdgpu_device_lock_hive_adev(adev, hive); - if (r) { - dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", - job ? job->base.id : -1); - - /* even we skipped this reset, still need to set the job to guilty */ - if (job && job->vm) - drm_sched_increase_karma(&job->base); - goto skip_recovery; - } - /* * Build list of devices to reset. * In case we are in XGMI hive mode, resort the device list @@ -5113,6 +5054,9 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + + amdgpu_device_lock_adev(tmp_adev, hive); + /* * Try to put the audio codec into suspend state * before gpu reset started. @@ -5264,13 +5208,12 @@ skip_sched_resume: amdgpu_device_unlock_adev(tmp_adev); } -skip_recovery: if (hive) { mutex_unlock(&hive->hive_lock); amdgpu_put_xgmi_hive(hive); } - if (r && r != -EAGAIN) + if (r) dev_info(adev->dev, "GPU reset end with ret = %d\n", r); return r; } @@ -5493,20 +5436,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev) return 0; } -static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) -{ - int i; - - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { - struct amdgpu_ring *ring = adev->rings[i]; - - if (!ring || !ring->sched.thread) - continue; - - cancel_delayed_work_sync(&ring->sched.work_tdr); - } -} - /** * amdgpu_pci_error_detected - Called when a PCI error is detected. * @pdev: PCI device struct @@ -5537,14 +5466,10 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta /* Fatal error, prepare for slot reset */ case pci_channel_io_frozen: /* - * Cancel and wait for all TDRs in progress if failing to - * set adev->in_gpu_reset in amdgpu_device_lock_adev - * * Locking adev->reset_sem will prevent any external access * to GPU during PCI error recovery */ - while (!amdgpu_device_lock_adev(adev, NULL)) - amdgpu_cancel_all_tdr(adev); + amdgpu_device_lock_adev(adev, NULL); /* * Block any work scheduling as we do for regular GPU reset -- cgit v1.2.3-70-g09d2 From cfbb6b0047448e2d986160d9f30d60f604d9ad0f Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 21 Jan 2022 17:23:32 -0500 Subject: drm/amdgpu: Rework reset domain to be refcounted. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reset domain contains register access semaphor now and so needs to be present as long as each device in a hive needs it and so it cannot be binded to XGMI hive life cycle. Adress this by making reset domain refcounted and pointed by each member of the hive and the hive itself. v4: Fix crash on boot witrh XGMI hive by adding type to reset_domain. XGMI will only create a new reset_domain if prevoius was of single device type meaning it's first boot. Otherwsie it will take a refocunt to exsiting reset_domain from the amdgou device. Add a wrapper around reset_domain->refcount get/put and a wrapper around send to reset wq (Lijo) Signed-off-by: Andrey Grodzovsky Acked-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74121.html --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 6 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 ++++++++++++++++++------------ drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 40 +++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 35 ++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 29 ++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 +- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 6 ++-- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 6 ++-- drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 6 ++-- 9 files changed, 140 insertions(+), 34 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 540a38fe5cd6..cb9764513df8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -813,9 +813,7 @@ struct amd_powerplay { #define AMDGPU_RESET_MAGIC_NUM 64 #define AMDGPU_MAX_DF_PERFMONS 4 #define AMDGPU_PRODUCT_NAME_LEN 64 -struct amdgpu_reset_domain { - struct workqueue_struct *wq; -}; +struct amdgpu_reset_domain; struct amdgpu_device { struct device *dev; @@ -1104,7 +1102,7 @@ struct amdgpu_device { uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE]; bool ram_is_direct_mapped; - struct amdgpu_reset_domain reset_domain; + struct amdgpu_reset_domain *reset_domain; }; static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e3c0ec684a85..d61bc0a0457c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2316,7 +2316,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, ring->num_hw_submission, amdgpu_job_hang_limit, - timeout, adev->reset_domain.wq, ring->sched_score, ring->name); + timeout, adev->reset_domain->wq, ring->sched_score, ring->name); if (r) { DRM_ERROR("Failed to create scheduler on ring %s.\n", ring->name); @@ -2439,24 +2439,22 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (r) goto init_failed; + /** + * In case of XGMI grab extra reference for reset domain for this device + */ if (adev->gmc.xgmi.num_physical_nodes > 1) { - struct amdgpu_hive_info *hive; + if (amdgpu_xgmi_add_device(adev) == 0) { + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); - amdgpu_xgmi_add_device(adev); + if (!hive->reset_domain || + !amdgpu_reset_get_reset_domain(hive->reset_domain)) { + r = -ENOENT; + goto init_failed; + } - hive = amdgpu_get_xgmi_hive(adev); - if (!hive || !hive->reset_domain.wq) { - DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", hive->hive_id); - r = -EINVAL; - goto init_failed; - } - - adev->reset_domain.wq = hive->reset_domain.wq; - } else { - adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0); - if (!adev->reset_domain.wq) { - r = -ENOMEM; - goto init_failed; + /* Drop the early temporary reset domain we created for device */ + amdgpu_reset_put_reset_domain(adev->reset_domain); + adev->reset_domain = hive->reset_domain; } } @@ -3640,6 +3638,15 @@ int amdgpu_device_init(struct amdgpu_device *adev, return r; } + /* + * Reset domain needs to be present early, before XGMI hive discovered + * (if any) and intitialized to use reset sem and in_gpu reset flag + * early on during init. + */ + adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE ,"amdgpu-reset-dev"); + if (!adev->reset_domain) + return -ENOMEM; + /* early init functions */ r = amdgpu_device_ip_early_init(adev); if (r) @@ -4016,6 +4023,9 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) if (adev->mman.discovery_bin) amdgpu_discovery_fini(adev); + amdgpu_reset_put_reset_domain(adev->reset_domain); + adev->reset_domain = NULL; + kfree(adev->pci_state); } @@ -5241,7 +5251,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); - if (!queue_work(adev->reset_domain.wq, &work.base)) + if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base)) return -EAGAIN; flush_work(&work.base); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index 02afd4115675..91864947063f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -96,3 +96,43 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev, return reset_handler->restore_hwcontext(adev->reset_cntl, reset_context); } + + +void amdgpu_reset_destroy_reset_domain(struct kref *ref) +{ + struct amdgpu_reset_domain *reset_domain = container_of(ref, + struct amdgpu_reset_domain, + refcount); + if (reset_domain->wq) + destroy_workqueue(reset_domain->wq); + + kvfree(reset_domain); +} + +struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type, + char *wq_name) +{ + struct amdgpu_reset_domain *reset_domain; + + reset_domain = kvzalloc(sizeof(struct amdgpu_reset_domain), GFP_KERNEL); + if (!reset_domain) { + DRM_ERROR("Failed to allocate amdgpu_reset_domain!"); + return NULL; + } + + reset_domain->type = type; + kref_init(&reset_domain->refcount); + + reset_domain->wq = create_singlethread_workqueue(wq_name); + if (!reset_domain->wq) { + DRM_ERROR("Failed to allocate wq for amdgpu_reset_domain!"); + amdgpu_reset_put_reset_domain(reset_domain); + return NULL; + + } + + return reset_domain; +} + + + diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index e00d38d9160a..cc625e441fa0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -70,6 +70,19 @@ struct amdgpu_reset_control { void (*async_reset)(struct work_struct *work); }; + +enum amdgpu_reset_domain_type { + SINGLE_DEVICE, + XGMI_HIVE +}; + +struct amdgpu_reset_domain { + struct kref refcount; + struct workqueue_struct *wq; + enum amdgpu_reset_domain_type type; +}; + + int amdgpu_reset_init(struct amdgpu_device *adev); int amdgpu_reset_fini(struct amdgpu_device *adev); @@ -82,4 +95,26 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev, int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl, struct amdgpu_reset_handler *handler); +struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type, + char *wq_name); + +void amdgpu_reset_destroy_reset_domain(struct kref *ref); + +static inline bool amdgpu_reset_get_reset_domain(struct amdgpu_reset_domain *domain) +{ + return kref_get_unless_zero(&domain->refcount) != 0; +} + +static inline void amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *domain) +{ + kref_put(&domain->refcount, amdgpu_reset_destroy_reset_domain); +} + +static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *domain, + struct work_struct *work) +{ + return queue_work(domain->wq, work); +} + + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 89b682afe821..eb06322d2972 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -32,6 +32,8 @@ #include "wafl/wafl2_4_0_0_smn.h" #include "wafl/wafl2_4_0_0_sh_mask.h" +#include "amdgpu_reset.h" + #define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210 #define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c #define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210 @@ -227,6 +229,9 @@ static void amdgpu_xgmi_hive_release(struct kobject *kobj) struct amdgpu_hive_info *hive = container_of( kobj, struct amdgpu_hive_info, kobj); + amdgpu_reset_put_reset_domain(hive->reset_domain); + hive->reset_domain = NULL; + mutex_destroy(&hive->hive_lock); kfree(hive); } @@ -398,12 +403,24 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) goto pro_end; } - hive->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-hive", 0); - if (!hive->reset_domain.wq) { - dev_err(adev->dev, "XGMI: failed allocating wq for reset domain!\n"); - kfree(hive); - hive = NULL; - goto pro_end; + /** + * Avoid recreating reset domain when hive is reconstructed for the case + * of reset the devices in the XGMI hive during probe for SRIOV + * See https://www.spinics.net/lists/amd-gfx/msg58836.html + */ + if (adev->reset_domain->type != XGMI_HIVE) { + hive->reset_domain = amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive"); + if (!hive->reset_domain) { + dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n"); + ret = -ENOMEM; + kobject_put(&hive->kobj); + kfree(hive); + hive = NULL; + goto pro_end; + } + } else { + amdgpu_reset_get_reset_domain(adev->reset_domain); + hive->reset_domain = adev->reset_domain; } hive->hive_id = adev->gmc.xgmi.hive_id; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 2f2ce53645a5..dcaad22be492 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -42,7 +42,7 @@ struct amdgpu_hive_info { AMDGPU_XGMI_PSTATE_UNKNOWN } pstate; - struct amdgpu_reset_domain reset_domain; + struct amdgpu_reset_domain *reset_domain; }; struct amdgpu_pcs_ras_field { diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 5869d51d8bee..6740eef84ee1 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -32,6 +32,8 @@ #include "soc15_common.h" #include "mxgpu_ai.h" +#include "amdgpu_reset.h" + static void xgpu_ai_mailbox_send_ack(struct amdgpu_device *adev) { WREG8(AI_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2); @@ -308,8 +310,8 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev, switch (event) { case IDH_FLR_NOTIFICATION: if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) - WARN_ONCE(!queue_work(adev->reset_domain.wq, - &adev->virt.flr_work), + WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, + &adev->virt.flr_work), "Failed to queue work! at %s", __func__); break; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 5728a6401d73..e967d61c7134 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -31,6 +31,8 @@ #include "soc15_common.h" #include "mxgpu_nv.h" +#include "amdgpu_reset.h" + static void xgpu_nv_mailbox_send_ack(struct amdgpu_device *adev) { WREG8(NV_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2); @@ -338,8 +340,8 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev, switch (event) { case IDH_FLR_NOTIFICATION: if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) - WARN_ONCE(!queue_work(adev->reset_domain.wq, - &adev->virt.flr_work), + WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, + &adev->virt.flr_work), "Failed to queue work! at %s", __func__); break; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index 02290febfcf4..531cfba759dd 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -42,6 +42,8 @@ #include "smu/smu_7_1_3_d.h" #include "mxgpu_vi.h" +#include "amdgpu_reset.h" + /* VI golden setting */ static const u32 xgpu_fiji_mgcg_cgcg_init[] = { mmRLC_CGTT_MGCG_OVERRIDE, 0xffffffff, 0xffffffff, @@ -551,8 +553,8 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev, /* only handle FLR_NOTIFY now */ if (!r && !amdgpu_in_reset(adev)) - WARN_ONCE(!queue_work(adev->reset_domain.wq, - &adev->virt.flr_work), + WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, + &adev->virt.flr_work), "Failed to queue work! at %s", __func__); } -- cgit v1.2.3-70-g09d2 From d0fb18b535679a28b1f55a312b7454563b9bb36e Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 19 Jan 2022 17:09:58 -0500 Subject: drm/amdgpu: Move reset sem into reset_domain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want single instance of reset sem across all reset clients because in case of XGMI we should stop access cross device MMIO because any of them could be in a reset in the moment. Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74117.html --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 ++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 +++++++++++------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 18 ++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 14 ++++++++------ drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 4 ++-- 10 files changed, 46 insertions(+), 37 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index cb9764513df8..ddfbcc8fd3d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1058,7 +1058,6 @@ struct amdgpu_device { atomic_t in_gpu_reset; enum pp_mp1_state mp1_state; - struct rw_semaphore reset_sem; struct amdgpu_doorbell_index doorbell_index; struct mutex notifier_lock; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 25e2e5bf90eb..c3728061d65a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -37,6 +37,8 @@ #include "amdgpu_fw_attestation.h" #include "amdgpu_umr.h" +#include "amdgpu_reset.h" + #if defined(CONFIG_DEBUG_FS) /** @@ -1279,7 +1281,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused) } /* Avoid accidently unparking the sched thread during GPU reset */ - r = down_write_killable(&adev->reset_sem); + r = down_write_killable(&adev->reset_domain->sem); if (r) return r; @@ -1308,7 +1310,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused) kthread_unpark(ring->sched.thread); } - up_write(&adev->reset_sem); + up_write(&adev->reset_domain->sem); pm_runtime_mark_last_busy(dev->dev); pm_runtime_put_autosuspend(dev->dev); @@ -1517,7 +1519,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) return -ENOMEM; /* Avoid accidently unparking the sched thread during GPU reset */ - r = down_read_killable(&adev->reset_sem); + r = down_read_killable(&adev->reset_domain->sem); if (r) goto pro_end; @@ -1560,7 +1562,7 @@ failure: /* restart the scheduler */ kthread_unpark(ring->sched.thread); - up_read(&adev->reset_sem); + up_read(&adev->reset_domain->sem); ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d61bc0a0457c..dcbb175d336f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -424,10 +424,10 @@ bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) * the lock. */ if (in_task()) { - if (down_read_trylock(&adev->reset_sem)) - up_read(&adev->reset_sem); + if (down_read_trylock(&adev->reset_domain->sem)) + up_read(&adev->reset_domain->sem); else - lockdep_assert_held(&adev->reset_sem); + lockdep_assert_held(&adev->reset_domain->sem); } #endif return false; @@ -453,9 +453,9 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, if ((reg * 4) < adev->rmmio_size) { if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && - down_read_trylock(&adev->reset_sem)) { + down_read_trylock(&adev->reset_domain->sem)) { ret = amdgpu_kiq_rreg(adev, reg); - up_read(&adev->reset_sem); + up_read(&adev->reset_domain->sem); } else { ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); } @@ -538,9 +538,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, if ((reg * 4) < adev->rmmio_size) { if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && - down_read_trylock(&adev->reset_sem)) { + down_read_trylock(&adev->reset_domain->sem)) { amdgpu_kiq_wreg(adev, reg, v); - up_read(&adev->reset_sem); + up_read(&adev->reset_domain->sem); } else { writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); } @@ -3555,7 +3555,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->virt.vf_errors.lock); hash_init(adev->mn_hash); atomic_set(&adev->in_gpu_reset, 0); - init_rwsem(&adev->reset_sem); mutex_init(&adev->psp.mutex); mutex_init(&adev->notifier_lock); @@ -4833,9 +4832,9 @@ static void amdgpu_device_lock_adev(struct amdgpu_device *adev, atomic_set(&adev->in_gpu_reset, 1); if (hive) { - down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); + down_write_nest_lock(&adev->reset_domain->sem, &hive->hive_lock); } else { - down_write(&adev->reset_sem); + down_write(&adev->reset_domain->sem); } switch (amdgpu_asic_reset_method(adev)) { @@ -4856,7 +4855,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; atomic_set(&adev->in_gpu_reset, 0); - up_write(&adev->reset_sem); + up_write(&adev->reset_domain->sem); } static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) @@ -5476,7 +5475,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta /* Fatal error, prepare for slot reset */ case pci_channel_io_frozen: /* - * Locking adev->reset_sem will prevent any external access + * Locking adev->reset_domain->sem will prevent any external access * to GPU during PCI error recovery */ amdgpu_device_lock_adev(adev, NULL); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 05117eda105b..d3e055314804 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -31,6 +31,8 @@ #include #include +#include "amdgpu_reset.h" + #define EEPROM_I2C_MADDR_VEGA20 0x0 #define EEPROM_I2C_MADDR_ARCTURUS 0x40000 #define EEPROM_I2C_MADDR_ARCTURUS_D342 0x0 @@ -193,12 +195,12 @@ static int __write_table_header(struct amdgpu_ras_eeprom_control *control) __encode_table_header_to_buf(&control->tbl_hdr, buf); /* i2c may be unstable in gpu reset */ - down_read(&adev->reset_sem); + down_read(&adev->reset_domain->sem); res = amdgpu_eeprom_write(&adev->pm.smu_i2c, control->i2c_address + control->ras_header_offset, buf, RAS_TABLE_HEADER_SIZE); - up_read(&adev->reset_sem); + up_read(&adev->reset_domain->sem); if (res < 0) { DRM_ERROR("Failed to write EEPROM table header:%d", res); @@ -387,13 +389,13 @@ static int __amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, int res; /* i2c may be unstable in gpu reset */ - down_read(&adev->reset_sem); + down_read(&adev->reset_domain->sem); buf_size = num * RAS_TABLE_RECORD_SIZE; res = amdgpu_eeprom_write(&adev->pm.smu_i2c, control->i2c_address + RAS_INDEX_TO_OFFSET(control, fri), buf, buf_size); - up_read(&adev->reset_sem); + up_read(&adev->reset_domain->sem); if (res < 0) { DRM_ERROR("Writing %d EEPROM table records error:%d", num, res); @@ -547,12 +549,12 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) goto Out; } - down_read(&adev->reset_sem); + down_read(&adev->reset_domain->sem); res = amdgpu_eeprom_read(&adev->pm.smu_i2c, control->i2c_address + control->ras_record_offset, buf, buf_size); - up_read(&adev->reset_sem); + up_read(&adev->reset_domain->sem); if (res < 0) { DRM_ERROR("EEPROM failed reading records:%d\n", res); @@ -642,13 +644,13 @@ static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, int res; /* i2c may be unstable in gpu reset */ - down_read(&adev->reset_sem); + down_read(&adev->reset_domain->sem); buf_size = num * RAS_TABLE_RECORD_SIZE; res = amdgpu_eeprom_read(&adev->pm.smu_i2c, control->i2c_address + RAS_INDEX_TO_OFFSET(control, fri), buf, buf_size); - up_read(&adev->reset_sem); + up_read(&adev->reset_domain->sem); if (res < 0) { DRM_ERROR("Reading %d EEPROM table records error:%d", num, res); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index 91864947063f..c0988c804459 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -131,6 +131,8 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d } + init_rwsem(&reset_domain->sem); + return reset_domain; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index cc625e441fa0..80f918e87d4f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -80,6 +80,7 @@ struct amdgpu_reset_domain { struct kref refcount; struct workqueue_struct *wq; enum amdgpu_reset_domain_type type; + struct rw_semaphore sem; }; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 38bb42727715..222b1da9d601 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -48,6 +48,8 @@ #include "athub_v2_0.h" #include "athub_v2_1.h" +#include "amdgpu_reset.h" + #if 0 static const struct soc15_reg_golden golden_settings_navi10_hdp[] = { @@ -328,7 +330,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, */ if (adev->gfx.kiq.ring.sched.ready && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && - down_read_trylock(&adev->reset_sem)) { + down_read_trylock(&adev->reset_domain->sem)) { struct amdgpu_vmhub *hub = &adev->vmhub[vmhub]; const unsigned eng = 17; u32 inv_req = hub->vmhub_funcs->get_invalidate_req(vmid, flush_type); @@ -338,7 +340,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req, 1 << vmid); - up_read(&adev->reset_sem); + up_read(&adev->reset_domain->sem); return; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 88c1eb9ad068..3a5efe969735 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -62,6 +62,8 @@ #include "amdgpu_ras.h" #include "amdgpu_xgmi.h" +#include "amdgpu_reset.h" + /* add these here since we already include dce12 headers and these are for DCN */ #define mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION 0x055d #define mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION_BASE_IDX 2 @@ -787,13 +789,13 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, */ if (adev->gfx.kiq.ring.sched.ready && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && - down_read_trylock(&adev->reset_sem)) { + down_read_trylock(&adev->reset_domain->sem)) { uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng; uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng; amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req, 1 << vmid); - up_read(&adev->reset_sem); + up_read(&adev->reset_domain->sem); return; } @@ -900,7 +902,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev, if (amdgpu_in_reset(adev)) return -EIO; - if (ring->sched.ready && down_read_trylock(&adev->reset_sem)) { + if (ring->sched.ready && down_read_trylock(&adev->reset_domain->sem)) { /* Vega20+XGMI caches PTEs in TC and TLB. Add a * heavy-weight TLB flush (type 2), which flushes * both. Due to a race condition with concurrent @@ -927,7 +929,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev, if (r) { amdgpu_ring_undo(ring); spin_unlock(&adev->gfx.kiq.ring_lock); - up_read(&adev->reset_sem); + up_read(&adev->reset_domain->sem); return -ETIME; } @@ -936,10 +938,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev, r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout); if (r < 1) { dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r); - up_read(&adev->reset_sem); + up_read(&adev->reset_domain->sem); return -ETIME; } - up_read(&adev->reset_sem); + up_read(&adev->reset_domain->sem); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 6740eef84ee1..4e23c29e665c 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -262,7 +262,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) return; - down_write(&adev->reset_sem); + down_write(&adev->reset_domain->sem); amdgpu_virt_fini_data_exchange(adev); @@ -278,7 +278,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) flr_done: atomic_set(&adev->in_gpu_reset, 0); - up_write(&adev->reset_sem); + up_write(&adev->reset_domain->sem); /* Trigger recovery for world switch failure if no TDR */ if (amdgpu_device_should_recover_gpu(adev) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index e967d61c7134..f715780f7d20 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -286,7 +286,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) return; - down_write(&adev->reset_sem); + down_write(&adev->reset_domain->sem); amdgpu_virt_fini_data_exchange(adev); @@ -302,7 +302,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) flr_done: atomic_set(&adev->in_gpu_reset, 0); - up_write(&adev->reset_sem); + up_write(&adev->reset_domain->sem); /* Trigger recovery for world switch failure if no TDR */ if (amdgpu_device_should_recover_gpu(adev) -- cgit v1.2.3-70-g09d2 From 89a7a87093d67e2c633e1ed400ba00ffd15bdae5 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 19 Jan 2022 17:20:00 -0500 Subject: drm/amdgpu: Move in_gpu_reset into reset_domain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We should have a single instance per entrire reset domain. Signed-off-by: Andrey Grodzovsky Suggested-by: Lijo Lazar Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74116.html --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 ++----- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 +++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 4 ++-- 6 files changed, 15 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index ddfbcc8fd3d3..b89406b01694 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1056,7 +1056,6 @@ struct amdgpu_device { bool in_s4; bool in_s0ix; - atomic_t in_gpu_reset; enum pp_mp1_state mp1_state; struct amdgpu_doorbell_index doorbell_index; @@ -1463,8 +1462,6 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) return adev->gmc.tmz_enabled; } -static inline int amdgpu_in_reset(struct amdgpu_device *adev) -{ - return atomic_read(&adev->in_gpu_reset); -} +int amdgpu_in_reset(struct amdgpu_device *adev); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index dcbb175d336f..e05d7cbefd2c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3554,7 +3554,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->mn_lock); mutex_init(&adev->virt.vf_errors.lock); hash_init(adev->mn_hash); - atomic_set(&adev->in_gpu_reset, 0); mutex_init(&adev->psp.mutex); mutex_init(&adev->notifier_lock); @@ -4829,7 +4828,7 @@ end: static void amdgpu_device_lock_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) { - atomic_set(&adev->in_gpu_reset, 1); + atomic_set(&adev->reset_domain->in_gpu_reset, 1); if (hive) { down_write_nest_lock(&adev->reset_domain->sem, &hive->hive_lock); @@ -4854,7 +4853,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) { amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; - atomic_set(&adev->in_gpu_reset, 0); + atomic_set(&adev->reset_domain->in_gpu_reset, 0); up_write(&adev->reset_domain->sem); } @@ -5699,6 +5698,11 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, amdgpu_asic_invalidate_hdp(adev, ring); } +int amdgpu_in_reset(struct amdgpu_device *adev) +{ + return atomic_read(&adev->reset_domain->in_gpu_reset); + } + /** * amdgpu_device_halt() - bring hardware to some kind of halt state * diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index c0988c804459..5ab72c3bfbda 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -131,6 +131,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d } + atomic_set(&reset_domain->in_gpu_reset, 0); init_rwsem(&reset_domain->sem); return reset_domain; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index 80f918e87d4f..ea6fc98ea927 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -81,6 +81,7 @@ struct amdgpu_reset_domain { struct workqueue_struct *wq; enum amdgpu_reset_domain_type type; struct rw_semaphore sem; + atomic_t in_gpu_reset; }; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 4e23c29e665c..b81acf59870c 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -259,7 +259,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) * otherwise the mailbox msg will be ruined/reseted by * the VF FLR. */ - if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) + if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0) return; down_write(&adev->reset_domain->sem); @@ -277,7 +277,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) } while (timeout > 1); flr_done: - atomic_set(&adev->in_gpu_reset, 0); + atomic_set(&adev->reset_domain->in_gpu_reset, 0); up_write(&adev->reset_domain->sem); /* Trigger recovery for world switch failure if no TDR */ diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index f715780f7d20..22c10b97ea81 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -283,7 +283,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) * otherwise the mailbox msg will be ruined/reseted by * the VF FLR. */ - if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) + if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0) return; down_write(&adev->reset_domain->sem); @@ -301,7 +301,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) } while (timeout > 1); flr_done: - atomic_set(&adev->in_gpu_reset, 0); + atomic_set(&adev->reset_domain->in_gpu_reset, 0); up_write(&adev->reset_domain->sem); /* Trigger recovery for world switch failure if no TDR */ -- cgit v1.2.3-70-g09d2 From e923be9934a9c54a94e443f9e77bda5b9fbd1ce5 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 25 Jan 2022 11:32:47 -0500 Subject: drm/amdgpu: Rework amdgpu_device_lock_adev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This functions needs to be split into 2 parts where one is called only once for locking single instance of reset_domain's sem and reset flag and the other part which handles MP1 states should still be called for each device in XGMI hive. Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74118.html --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 ++++++++++++++++-------------- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 19 +++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 4 ++++ 3 files changed, 43 insertions(+), 17 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e05d7cbefd2c..f69ab22ff096 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4825,16 +4825,8 @@ end: return r; } -static void amdgpu_device_lock_adev(struct amdgpu_device *adev, - struct amdgpu_hive_info *hive) +static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) { - atomic_set(&adev->reset_domain->in_gpu_reset, 1); - - if (hive) { - down_write_nest_lock(&adev->reset_domain->sem, &hive->hive_lock); - } else { - down_write(&adev->reset_domain->sem); - } switch (amdgpu_asic_reset_method(adev)) { case AMD_RESET_METHOD_MODE1: @@ -4849,12 +4841,10 @@ static void amdgpu_device_lock_adev(struct amdgpu_device *adev, } } -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) +static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) { amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; - atomic_set(&adev->reset_domain->in_gpu_reset, 0); - up_write(&adev->reset_domain->sem); } static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) @@ -5060,10 +5050,15 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, device_list_handle = &device_list; } + /* We need to lock reset domain only once both for XGMI and single device */ + tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, + reset_list); + amdgpu_device_lock_reset_domain(tmp_adev->reset_domain, hive); + /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - amdgpu_device_lock_adev(tmp_adev, hive); + amdgpu_device_set_mp1_state(tmp_adev); /* * Try to put the audio codec into suspend state @@ -5213,9 +5208,14 @@ skip_sched_resume: if (audio_suspended) amdgpu_device_resume_display_audio(tmp_adev); - amdgpu_device_unlock_adev(tmp_adev); + + amdgpu_device_unset_mp1_state(tmp_adev); } + tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, + reset_list); + amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); + if (hive) { mutex_unlock(&hive->hive_lock); amdgpu_put_xgmi_hive(hive); @@ -5477,7 +5477,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta * Locking adev->reset_domain->sem will prevent any external access * to GPU during PCI error recovery */ - amdgpu_device_lock_adev(adev, NULL); + amdgpu_device_lock_reset_domain(adev->reset_domain, NULL); + amdgpu_device_set_mp1_state(adev); /* * Block any work scheduling as we do for regular GPU reset @@ -5584,7 +5585,8 @@ out: DRM_INFO("PCIe error recovery succeeded\n"); } else { DRM_ERROR("PCIe error recovery failed, err:%d", r); - amdgpu_device_unlock_adev(adev); + amdgpu_device_unset_mp1_state(adev); + amdgpu_device_unlock_reset_domain(adev->reset_domain); } return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; @@ -5621,7 +5623,8 @@ void amdgpu_pci_resume(struct pci_dev *pdev) drm_sched_start(&ring->sched, true); } - amdgpu_device_unlock_adev(adev); + amdgpu_device_unset_mp1_state(adev); + amdgpu_device_unlock_reset_domain(adev->reset_domain); } bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index 5ab72c3bfbda..9b18ad0b77ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -137,5 +137,24 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d return reset_domain; } +void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain, + struct amdgpu_hive_info *hive) +{ + atomic_set(&reset_domain->in_gpu_reset, 1); + + if (hive) { + down_write_nest_lock(&reset_domain->sem, &hive->hive_lock); + } else { + down_write(&reset_domain->sem); + } +} + + +void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain) +{ + atomic_set(&reset_domain->in_gpu_reset, 0); + up_write(&reset_domain->sem); +} + diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index ea6fc98ea927..92de3b7965a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -118,5 +118,9 @@ static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *doma return queue_work(domain->wq, work); } +void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain, + struct amdgpu_hive_info *hive); + +void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain); #endif -- cgit v1.2.3-70-g09d2 From 3675c2f26f33ab4928859fb8950a4697a16be5c9 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 25 Jan 2022 11:36:18 -0500 Subject: drm/amdgpu: Revert 'drm/amdgpu: annotate a false positive recursive locking' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we have a single instance of reset semaphore which we lock only once even for XGMI hive we don't need the nested locking hint anymore. Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74120.html --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 10 ++-------- 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f69ab22ff096..54f8e1fa4cae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5053,7 +5053,7 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, /* We need to lock reset domain only once both for XGMI and single device */ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, reset_list); - amdgpu_device_lock_reset_domain(tmp_adev->reset_domain, hive); + amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { @@ -5477,7 +5477,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta * Locking adev->reset_domain->sem will prevent any external access * to GPU during PCI error recovery */ - amdgpu_device_lock_reset_domain(adev->reset_domain, NULL); + amdgpu_device_lock_reset_domain(adev->reset_domain); amdgpu_device_set_mp1_state(adev); /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index 9b18ad0b77ab..248d64158721 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -137,16 +137,10 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d return reset_domain; } -void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain, - struct amdgpu_hive_info *hive) +void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain) { atomic_set(&reset_domain->in_gpu_reset, 1); - - if (hive) { - down_write_nest_lock(&reset_domain->sem, &hive->hive_lock); - } else { - down_write(&reset_domain->sem); - } + down_write(&reset_domain->sem); } -- cgit v1.2.3-70-g09d2 From c7703ce38c1ecdeeea6791b54fbee29a08816ea9 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 11 Feb 2022 15:55:00 -0500 Subject: drm/amdgpu: Fix htmldoc warning Update function name. Signed-off-by: Andrey Grodzovsky Reported-by: kernel test robot Reviewed-by: Alex Deucher Link: https://patchwork.freedesktop.org/patch/msgid/20220211205500.601391-1-andrey.grodzovsky@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 54f8e1fa4cae..d78141e2c509 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4978,7 +4978,7 @@ retry: } /** - * amdgpu_device_gpu_recover - reset the asic and recover scheduler + * amdgpu_device_gpu_recover_imp - reset the asic and recover scheduler * * @adev: amdgpu_device pointer * @job: which job trigger hang -- cgit v1.2.3-70-g09d2 From 8ab62eda177bc350f34fea4fcea23603b8184bfd Mon Sep 17 00:00:00 2001 From: Jiawei Gu Date: Tue, 22 Feb 2022 17:42:52 +0800 Subject: drm/sched: Add device pointer to drm_gpu_scheduler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add device pointer so scheduler's printing can use DRM_DEV_ERROR() instead, which makes life easier under multiple GPU scenario. v2: amend all calls of drm_sched_init() v3: fill dev pointer for all drm_sched_init() calls Signed-off-by: Jiawei Gu Reviewed-by: Andrey Grodzovsky Reviewed-by: Christian König Signed-off-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20220221095705.5290-1-Jiawei.Gu@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++- drivers/gpu/drm/etnaviv/etnaviv_sched.c | 2 +- drivers/gpu/drm/lima/lima_sched.c | 2 +- drivers/gpu/drm/msm/msm_ringbuffer.c | 2 +- drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- drivers/gpu/drm/scheduler/sched_main.c | 9 +++++---- drivers/gpu/drm/v3d/v3d_sched.c | 10 +++++----- include/drm/gpu_scheduler.h | 3 ++- 8 files changed, 19 insertions(+), 15 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d78141e2c509..cd1f9140a878 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2316,7 +2316,9 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, ring->num_hw_submission, amdgpu_job_hang_limit, - timeout, adev->reset_domain->wq, ring->sched_score, ring->name); + timeout, adev->reset_domain->wq, + ring->sched_score, ring->name, + adev->dev); if (r) { DRM_ERROR("Failed to create scheduler on ring %s.\n", ring->name); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c index 58f593b278c1..35e5ef7dbdcc 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c @@ -195,7 +195,7 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu) ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops, etnaviv_hw_jobs_limit, etnaviv_job_hang_limit, msecs_to_jiffies(500), NULL, NULL, - dev_name(gpu->dev)); + dev_name(gpu->dev), gpu->dev); if (ret) return ret; diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c index 12437e42cc76..9cf4c60f4fca 100644 --- a/drivers/gpu/drm/lima/lima_sched.c +++ b/drivers/gpu/drm/lima/lima_sched.c @@ -491,7 +491,7 @@ int lima_sched_pipe_init(struct lima_sched_pipe *pipe, const char *name) return drm_sched_init(&pipe->base, &lima_sched_ops, 1, lima_job_hang_limit, msecs_to_jiffies(timeout), NULL, - NULL, name); + NULL, name, pipe->ldev->dev); } void lima_sched_pipe_fini(struct lima_sched_pipe *pipe) diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 3bbf574c3bdc..367a6aaa3a20 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -89,7 +89,7 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id, ret = drm_sched_init(&ring->sched, &msm_sched_ops, num_hw_submissions, 0, sched_timeout, - NULL, NULL, to_msm_bo(ring->bo)->name); + NULL, NULL, to_msm_bo(ring->bo)->name, gpu->dev->dev); if (ret) { goto fail; } diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 908d79520853..a6925dbb6224 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -812,7 +812,7 @@ int panfrost_job_init(struct panfrost_device *pfdev) nentries, 0, msecs_to_jiffies(JOB_TIMEOUT_MS), pfdev->reset.wq, - NULL, "pan_js"); + NULL, "pan_js", pfdev->dev); if (ret) { dev_err(pfdev->dev, "Failed to create scheduler: %d.", ret); goto err_sched; diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index f91fb31ab7a7..b81fceb0b8a2 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -491,7 +491,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) if (r == -ENOENT) drm_sched_job_done(s_job); else if (r) - DRM_ERROR("fence add callback failed (%d)\n", + DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r); } else drm_sched_job_done(s_job); @@ -957,7 +957,7 @@ static int drm_sched_main(void *param) if (r == -ENOENT) drm_sched_job_done(sched_job); else if (r) - DRM_ERROR("fence add callback failed (%d)\n", + DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r); dma_fence_put(fence); } else { @@ -991,7 +991,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, const struct drm_sched_backend_ops *ops, unsigned hw_submission, unsigned hang_limit, long timeout, struct workqueue_struct *timeout_wq, - atomic_t *score, const char *name) + atomic_t *score, const char *name, struct device *dev) { int i, ret; sched->ops = ops; @@ -1001,6 +1001,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, sched->timeout_wq = timeout_wq ? : system_wq; sched->hang_limit = hang_limit; sched->score = score ? score : &sched->_score; + sched->dev = dev; for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++) drm_sched_rq_init(sched, &sched->sched_rq[i]); @@ -1018,7 +1019,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, if (IS_ERR(sched->thread)) { ret = PTR_ERR(sched->thread); sched->thread = NULL; - DRM_ERROR("Failed to create scheduler for %s.\n", name); + DRM_DEV_ERROR(sched->dev, "Failed to create scheduler for %s.\n", name); return ret; } diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index e0cb7d0697a7..39459ae96f30 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -391,7 +391,7 @@ v3d_sched_init(struct v3d_dev *v3d) &v3d_bin_sched_ops, hw_jobs_limit, job_hang_limit, msecs_to_jiffies(hang_limit_ms), NULL, - NULL, "v3d_bin"); + NULL, "v3d_bin", v3d->drm.dev); if (ret) { dev_err(v3d->drm.dev, "Failed to create bin scheduler: %d.", ret); return ret; @@ -401,7 +401,7 @@ v3d_sched_init(struct v3d_dev *v3d) &v3d_render_sched_ops, hw_jobs_limit, job_hang_limit, msecs_to_jiffies(hang_limit_ms), NULL, - NULL, "v3d_render"); + NULL, "v3d_render", v3d->drm.dev); if (ret) { dev_err(v3d->drm.dev, "Failed to create render scheduler: %d.", ret); @@ -413,7 +413,7 @@ v3d_sched_init(struct v3d_dev *v3d) &v3d_tfu_sched_ops, hw_jobs_limit, job_hang_limit, msecs_to_jiffies(hang_limit_ms), NULL, - NULL, "v3d_tfu"); + NULL, "v3d_tfu", v3d->drm.dev); if (ret) { dev_err(v3d->drm.dev, "Failed to create TFU scheduler: %d.", ret); @@ -426,7 +426,7 @@ v3d_sched_init(struct v3d_dev *v3d) &v3d_csd_sched_ops, hw_jobs_limit, job_hang_limit, msecs_to_jiffies(hang_limit_ms), NULL, - NULL, "v3d_csd"); + NULL, "v3d_csd", v3d->drm.dev); if (ret) { dev_err(v3d->drm.dev, "Failed to create CSD scheduler: %d.", ret); @@ -438,7 +438,7 @@ v3d_sched_init(struct v3d_dev *v3d) &v3d_cache_clean_sched_ops, hw_jobs_limit, job_hang_limit, msecs_to_jiffies(hang_limit_ms), NULL, - NULL, "v3d_cache_clean"); + NULL, "v3d_cache_clean", v3d->drm.dev); if (ret) { dev_err(v3d->drm.dev, "Failed to create CACHE_CLEAN scheduler: %d.", ret); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index bbc22fad8d80..944f83ef9f2e 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -457,13 +457,14 @@ struct drm_gpu_scheduler { atomic_t _score; bool ready; bool free_guilty; + struct device *dev; }; int drm_sched_init(struct drm_gpu_scheduler *sched, const struct drm_sched_backend_ops *ops, uint32_t hw_submission, unsigned hang_limit, long timeout, struct workqueue_struct *timeout_wq, - atomic_t *score, const char *name); + atomic_t *score, const char *name, struct device *dev); void drm_sched_fini(struct drm_gpu_scheduler *sched); int drm_sched_job_init(struct drm_sched_job *job, -- cgit v1.2.3-70-g09d2