From 8935ff00e3b110e47e3a291ff11951eb1066b1ae Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Thu, 3 Dec 2020 22:17:18 -0500 Subject: drm/scheduler: "node" --> "list" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename "node" to "list" in struct drm_sched_job, in order to make it consistent with what we see being used throughout gpu_scheduler.h, for instance in struct drm_sched_entity, as well as the rest of DRM and the kernel. Signed-off-by: Luben Tuikov Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/403515/ Cc: Alexander Deucher Cc: Andrey Grodzovsky Cc: Christian König Cc: Daniel Vetter Signed-off-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7560b05e4ac1..4df6de81cd41 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4128,7 +4128,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev) spin_lock(&ring->sched.job_list_lock); job = list_first_entry_or_null(&ring->sched.ring_mirror_list, - struct drm_sched_job, node); + struct drm_sched_job, list); spin_unlock(&ring->sched.job_list_lock); if (job) return true; -- cgit v1.2.3-70-g09d2 From 6efa4b465cfdaa9be251ba043253e302ddb9976f Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Thu, 3 Dec 2020 22:17:19 -0500 Subject: gpu/drm: ring_mirror_list --> pending_list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename "ring_mirror_list" to "pending_list", to describe what something is, not what it does, how it's used, or how the hardware implements it. This also abstracts the actual hardware implementation, i.e. how the low-level driver communicates with the device it drives, ring, CAM, etc., shouldn't be exposed to DRM. The pending_list keeps jobs submitted, which are out of our control. Usually this means they are pending execution status in hardware, but the latter definition is a more general (inclusive) definition. 
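As a side note for readers without the scheduler sources at hand, the lifecycle tracked by the renamed list boils down to the condensed sketch below. It is distilled from the drm_sched hunks further down in this patch (drm_sched_job_begin() and drm_sched_get_cleanup_job()), with the timeout and error paths trimmed; treat it as an illustration, not the verbatim kernel code.

    static void drm_sched_job_begin(struct drm_sched_job *s_job)
    {
        struct drm_gpu_scheduler *sched = s_job->sched;

        spin_lock(&sched->job_list_lock);
        /* The job is now out of our control: park it on the pending list. */
        list_add_tail(&s_job->list, &sched->pending_list);
        drm_sched_start_timeout(sched);
        spin_unlock(&sched->job_list_lock);
    }

    static struct drm_sched_job *
    drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
    {
        struct drm_sched_job *job;

        spin_lock(&sched->job_list_lock);
        job = list_first_entry_or_null(&sched->pending_list,
                                       struct drm_sched_job, list);
        if (job && dma_fence_is_signaled(&job->s_fence->finished))
            list_del_init(&job->list);  /* done: drop it from pending_list */
        else
            job = NULL;
        spin_unlock(&sched->job_list_lock);

        return job;
    }

Everything on pending_list has already been handed to the hardware, which is why the timeout and reset paths walk exactly this list.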
Signed-off-by: Luben Tuikov Acked-by: Christian König Link: https://patchwork.freedesktop.org/patch/405573/ Cc: Alexander Deucher Cc: Andrey Grodzovsky Cc: Christian König Cc: Daniel Vetter Signed-off-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 +- drivers/gpu/drm/scheduler/sched_main.c | 34 ++++++++++++++--------------- include/drm/gpu_scheduler.h | 10 ++++----- 5 files changed, 27 insertions(+), 27 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 8358cae0b5a4..db77a5bdfa45 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1427,7 +1427,7 @@ static void amdgpu_ib_preempt_job_recovery(struct drm_gpu_scheduler *sched) struct dma_fence *fence; spin_lock(&sched->job_list_lock); - list_for_each_entry(s_job, &sched->ring_mirror_list, list) { + list_for_each_entry(s_job, &sched->pending_list, list) { fence = sched->ops->run_job(s_job); dma_fence_put(fence); } @@ -1459,7 +1459,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring) no_preempt: spin_lock(&sched->job_list_lock); - list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, list) { + list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) { if (dma_fence_is_signaled(&s_job->s_fence->finished)) { /* remove job from ring_mirror_list */ list_del_init(&s_job->list); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 4df6de81cd41..fbae600aa5f9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4127,8 +4127,8 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev) continue; spin_lock(&ring->sched.job_list_lock); - job = list_first_entry_or_null(&ring->sched.ring_mirror_list, - struct drm_sched_job, list); + job = list_first_entry_or_null(&ring->sched.pending_list, + struct drm_sched_job, list); spin_unlock(&ring->sched.job_list_lock); if (job) return true; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index aca52a46b93d..ff48101bab55 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -271,7 +271,7 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched) } /* Signal all jobs already scheduled to HW */ - list_for_each_entry(s_job, &sched->ring_mirror_list, list) { + list_for_each_entry(s_job, &sched->pending_list, list) { struct drm_sched_fence *s_fence = s_job->s_fence; dma_fence_set_error(&s_fence->finished, -EHWPOISON); diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index c52eba407ebd..b694df12aaba 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -198,7 +198,7 @@ EXPORT_SYMBOL(drm_sched_dependency_optimized); static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched) { if (sched->timeout != MAX_SCHEDULE_TIMEOUT && - !list_empty(&sched->ring_mirror_list)) + !list_empty(&sched->pending_list)) schedule_delayed_work(&sched->work_tdr, sched->timeout); } @@ -258,7 +258,7 @@ void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched, { spin_lock(&sched->job_list_lock); - if (list_empty(&sched->ring_mirror_list)) + if (list_empty(&sched->pending_list)) 
cancel_delayed_work(&sched->work_tdr); else mod_delayed_work(system_wq, &sched->work_tdr, remaining); @@ -272,7 +272,7 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job) struct drm_gpu_scheduler *sched = s_job->sched; spin_lock(&sched->job_list_lock); - list_add_tail(&s_job->list, &sched->ring_mirror_list); + list_add_tail(&s_job->list, &sched->pending_list); drm_sched_start_timeout(sched); spin_unlock(&sched->job_list_lock); } @@ -286,7 +286,7 @@ static void drm_sched_job_timedout(struct work_struct *work) /* Protects against concurrent deletion in drm_sched_get_cleanup_job */ spin_lock(&sched->job_list_lock); - job = list_first_entry_or_null(&sched->ring_mirror_list, + job = list_first_entry_or_null(&sched->pending_list, struct drm_sched_job, list); if (job) { @@ -371,7 +371,7 @@ EXPORT_SYMBOL(drm_sched_increase_karma); * Stop the scheduler and also removes and frees all completed jobs. * Note: bad job will not be freed as it might be used later and so it's * callers responsibility to release it manually if it's not part of the - * mirror list any more. + * pending list any more. * */ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) @@ -392,15 +392,15 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) * Add at the head of the queue to reflect it was the earliest * job extracted. */ - list_add(&bad->list, &sched->ring_mirror_list); + list_add(&bad->list, &sched->pending_list); /* * Iterate the job list from later to earlier one and either deactive - * their HW callbacks or remove them from mirror list if they already + * their HW callbacks or remove them from pending list if they already * signaled. * This iteration is thread safe as sched thread is stopped. */ - list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, + list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list, list) { if (s_job->s_fence->parent && dma_fence_remove_callback(s_job->s_fence->parent, @@ -408,7 +408,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) atomic_dec(&sched->hw_rq_count); } else { /* - * remove job from ring_mirror_list. + * remove job from pending_list. * Locking here is for concurrent resume timeout */ spin_lock(&sched->job_list_lock); @@ -463,7 +463,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) * so no new jobs are being inserted or removed. Also concurrent * GPU recovers can't run in parallel. 
*/ - list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, list) { + list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) { struct dma_fence *fence = s_job->s_fence->parent; atomic_inc(&sched->hw_rq_count); @@ -494,7 +494,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) EXPORT_SYMBOL(drm_sched_start); /** - * drm_sched_resubmit_jobs - helper to relunch job from mirror ring list + * drm_sched_resubmit_jobs - helper to relunch job from pending ring list * * @sched: scheduler instance * @@ -506,7 +506,7 @@ void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) bool found_guilty = false; struct dma_fence *fence; - list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, list) { + list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) { struct drm_sched_fence *s_fence = s_job->s_fence; if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) { @@ -665,7 +665,7 @@ static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb) * * @sched: scheduler instance * - * Returns the next finished job from the mirror list (if there is one) + * Returns the next finished job from the pending list (if there is one) * ready for it to be destroyed. */ static struct drm_sched_job * @@ -675,7 +675,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched) /* * Don't destroy jobs while the timeout worker is running OR thread - * is being parked and hence assumed to not touch ring_mirror_list + * is being parked and hence assumed to not touch pending_list */ if ((sched->timeout != MAX_SCHEDULE_TIMEOUT && !cancel_delayed_work(&sched->work_tdr)) || @@ -684,11 +684,11 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched) spin_lock(&sched->job_list_lock); - job = list_first_entry_or_null(&sched->ring_mirror_list, + job = list_first_entry_or_null(&sched->pending_list, struct drm_sched_job, list); if (job && dma_fence_is_signaled(&job->s_fence->finished)) { - /* remove job from ring_mirror_list */ + /* remove job from pending_list */ list_del_init(&job->list); } else { job = NULL; @@ -858,7 +858,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, init_waitqueue_head(&sched->wake_up_worker); init_waitqueue_head(&sched->job_scheduled); - INIT_LIST_HEAD(&sched->ring_mirror_list); + INIT_LIST_HEAD(&sched->pending_list); spin_lock_init(&sched->job_list_lock); atomic_set(&sched->hw_rq_count, 0); INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 3add0072bd37..2e0c368e19f6 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -174,7 +174,7 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); * @sched: the scheduler instance on which this job is scheduled. * @s_fence: contains the fences for the scheduling of job. * @finish_cb: the callback for the finished fence. - * @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list. + * @node: used to append this struct to the @drm_gpu_scheduler.pending_list. * @id: a unique id assigned to each job scheduled on the scheduler. * @karma: increment on every hang caused by this job. 
If this exceeds the hang * limit of the scheduler then the job is marked guilty and will not @@ -203,7 +203,7 @@ struct drm_sched_job { static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job, int threshold) { - return (s_job && atomic_inc_return(&s_job->karma) > threshold); + return s_job && atomic_inc_return(&s_job->karma) > threshold; } /** @@ -260,8 +260,8 @@ struct drm_sched_backend_ops { * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the * timeout interval is over. * @thread: the kthread on which the scheduler which run. - * @ring_mirror_list: the list of jobs which are currently in the job queue. - * @job_list_lock: lock to protect the ring_mirror_list. + * @pending_list: the list of jobs which are currently in the job queue. + * @job_list_lock: lock to protect the pending_list. * @hang_limit: once the hangs by a job crosses this limit then it is marked * guilty and it will be considered for scheduling further. * @score: score to help loadbalancer pick a idle sched @@ -282,7 +282,7 @@ struct drm_gpu_scheduler { atomic64_t job_id_count; struct delayed_work work_tdr; struct task_struct *thread; - struct list_head ring_mirror_list; + struct list_head pending_list; spinlock_t job_list_lock; int hang_limit; atomic_t score; -- cgit v1.2.3-70-g09d2 From 26eb6b51da86ec7dcc29fd90baf465b38acecfb7 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Wed, 30 Dec 2020 19:45:15 +0800 Subject: drm/amdgpu: fix a GPU hang issue when remove device When GFXOFF is enabled and the GPU is idle, the driver will fail to access some registers. Therefore disable power gating before any register access via MMIO. The dmesg log is as follows: amdgpu 0000:03:00.0: amdgpu: amdgpu: finishing device. amdgpu: cp queue pipe 4 queue 0 preemption failed amdgpu 0000:03:00.0: amdgpu: failed to write reg 2890 wait reg 28a2 amdgpu 0000:03:00.0: amdgpu: failed to write reg 1a6f4 wait reg 1a706 amdgpu 0000:03:00.0: amdgpu: failed to write reg 2890 wait reg 28a2 amdgpu 0000:03:00.0: amdgpu: failed to write reg 1a6f4 wait reg 1a706 Signed-off-by: Dennis Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1cb7d73f7317..b69c34074d8d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2548,11 +2548,11 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_remove_device(adev); - amdgpu_amdkfd_device_fini(adev); - amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); + amdgpu_amdkfd_device_fini(adev); + /* need to disable SMC first */ for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.hw) -- cgit v1.2.3-70-g09d2 From 044a48f420b9d3c19a135b821c34de5b2bee4075 Mon Sep 17 00:00:00 2001 From: Alexandre Demers Date: Thu, 7 Jan 2021 18:53:03 -0500 Subject: drm/amdgpu: fix DRM_INFO flood if display core is not supported (bug 210921) This fixes bug 210921, where DRM_INFO floods the log when hitting an unsupported ASIC in amdgpu_device_asic_has_dc_support(). This info should only be printed once.
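The one-line change below swaps DRM_INFO for DRM_INFO_ONCE. As a rough model of why that silences the flood: the *_ONCE print helpers keep a static flag at the call site, so the message is emitted on the first hit and every later call reduces to a cheap test. The macro below is a simplified illustration of that mechanism, not the exact definition from the kernel headers.

    /* Simplified model of a print-once helper; the real DRM_INFO_ONCE /
     * printk_once macros differ in detail. */
    #define my_info_once(fmt, ...)                 \
    do {                                           \
        static bool __printed;                     \
        if (!__printed) {                          \
            __printed = true;                      \
            DRM_INFO(fmt, ##__VA_ARGS__);          \
        }                                          \
    } while (0)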
Bug: https://bugzilla.kernel.org/show_bug.cgi?id=210921 Signed-off-by: Alexandre Demers Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b69c34074d8d..087afab67e22 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3034,7 +3034,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) #endif default: if (amdgpu_dc > 0) - DRM_INFO("Display Core has been requested via kernel parameter " + DRM_INFO_ONCE("Display Core has been requested via kernel parameter " "but isn't supported by ASIC, ignoring\n"); return false; } -- cgit v1.2.3-70-g09d2 From 665fe4dce83d14177f46fac814964ec107f196b5 Mon Sep 17 00:00:00 2001 From: Jiansong Chen Date: Mon, 11 Jan 2021 17:41:25 +0800 Subject: drm/amdgpu: enable gpu recovery for navy_flounder Enable gpu recovery for navy_flounder by default to trigger reset once needed. Signed-off-by: Jiansong Chen Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 087afab67e22..dd67b589b4ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4206,6 +4206,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) case CHIP_NAVI14: case CHIP_NAVI12: case CHIP_SIENNA_CICHLID: + case CHIP_NAVY_FLOUNDER: break; default: goto disabled; -- cgit v1.2.3-70-g09d2 From 453f617a30aa1298fe19728026e9d24098fdae03 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Thu, 7 Jan 2021 15:15:10 +0100 Subject: drm/amdgpu: Resize BAR0 to the maximum available size, even if it doesn't cover VRAM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows BAR0 resizing to be done for cards which don't advertise support for a size large enough to cover the VRAM but which do advertise at least one size larger than the default. For example, my RX 5600 XT, which advertises 256MB, 512MB and 1GB. 
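For a concrete feel of the sizing math, the hunks below use the PCI resizable-BAR encoding, log2(bytes) - 20, so 256 MB -> 8, 512 MB -> 9, 1 GB -> 10, 8 GB -> 13. The walk-through below assumes a 6 GB VRAM card that advertises the three sizes from the example above; the VRAM figure and the helper name are illustrative only.

    static int pick_rbar_size(struct amdgpu_device *adev)
    {
        u64 vram_bytes = 6ULL << 30;  /* assumed: 6 GB of VRAM */
        /* 6 GB rounds up to 8 GB, i.e. encoded size 13 */
        int rbar_size = pci_rebar_bytes_to_size(vram_bytes);
        /* 256 MB/512 MB/1 GB advertised: bits 8..10 set -> 0x700 */
        u32 sizes = pci_rebar_get_possible_sizes(adev->pdev, 0);

        /* Clamp to the largest advertised size: fls(0x700) - 1 = 10 (1 GB). */
        return min(fls(sizes) - 1, rbar_size);
    }

This matches the commit message: previously the resize was only attempted at the full-VRAM size, which such cards do not advertise.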
Signed-off-by: Darren Salt Signed-off-by: Christian König Signed-off-by: Nirmoy Das Reviewed-by: Nirmoy Das Link: https://patchwork.kernel.org/project/dri-devel/patch/20210107175017.15893-4-nirmoy.das@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8f451e809127..348ac678a230 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1107,7 +1107,7 @@ void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) { u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size); - u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1; + int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); struct pci_bus *root; struct resource *res; unsigned i; @@ -1138,6 +1138,10 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) if (!res) return 0; + /* Limit the BAR size to what is available */ + rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, + rbar_size); + /* Disable memory decoding while we change the BAR addresses and size */ pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); pci_write_config_word(adev->pdev, PCI_COMMAND, -- cgit v1.2.3-70-g09d2 From 8a11d283788e41c6e65b14ad99a457cb56702a68 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Thu, 7 Jan 2021 09:07:41 +0100 Subject: drm/amdgpu: Fix trailing whitespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adhere to kernel coding style. Signed-off-by: Thomas Zimmermann Reviewed-by: Christian König Acked-by: Alex Deucher Acked-by: Sam Ravnborg Cc: Alex Deucher Cc: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20210107080748.4768-2-tzimmermann@suse.de --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 348ac678a230..bb3eb2c96eb7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4954,8 +4954,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta case pci_channel_io_normal: return PCI_ERS_RESULT_CAN_RECOVER; /* Fatal error, prepare for slot reset */ - case pci_channel_io_frozen: - /* + case pci_channel_io_frozen: + /* * Cancel and wait for all TDRs in progress if failing to * set adev->in_gpu_reset in amdgpu_device_lock_adev * @@ -5046,7 +5046,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) goto out; } - adev->in_pci_err_recovery = true; + adev->in_pci_err_recovery = true; r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); adev->in_pci_err_recovery = false; if (r) -- cgit v1.2.3-70-g09d2 From 8f66090b7bb7f2a2183f46e72e367922d836e0dd Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Thu, 7 Jan 2021 09:07:42 +0100 Subject: drm/amdgpu: Remove references to struct drm_device.pdev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using struct drm_device.pdev is deprecated. Convert amdgpu to struct drm_device.dev. No functional changes. 
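The individual hunks below reduce to a few mechanical patterns, sketched here using lines taken from the diff itself:

    /* 1) PCI identity data: use the driver's own pci_dev pointer. */
    dev_info->device_id = adev->pdev->device;           /* was dev->pdev->device */

    /* 2) Code that needs a pci_dev but only holds a drm_device. */
    if (!pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))   /* was dev->pdev */
        flags |= AMD_IS_PX;

    /* 3) Logging and generic device plumbing: the bare struct device is enough. */
    dev_dbg(dev->dev, "unsupported operation %d\n", args->operation);  /* was &dev->pdev->dev */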
v3: * rebased Signed-off-by: Thomas Zimmermann Acked-by: Christian König Acked-by: Alex Deucher Acked-by: Sam Ravnborg Cc: Alex Deucher Cc: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20210107080748.4768-3-tzimmermann@suse.de --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++--------- drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 10 +++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 +++++----- 7 files changed, 21 insertions(+), 22 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index bb3eb2c96eb7..1ec308de5826 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1427,9 +1427,9 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, /* don't suspend or resume card normally */ dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; - pci_set_power_state(dev->pdev, PCI_D0); - amdgpu_device_load_pci_state(dev->pdev); - r = pci_enable_device(dev->pdev); + pci_set_power_state(pdev, PCI_D0); + amdgpu_device_load_pci_state(pdev); + r = pci_enable_device(pdev); if (r) DRM_WARN("pci_enable_device failed (%d)\n", r); amdgpu_device_resume(dev, true); @@ -1441,10 +1441,10 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, drm_kms_helper_poll_disable(dev); dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; amdgpu_device_suspend(dev, true); - amdgpu_device_cache_pci_state(dev->pdev); + amdgpu_device_cache_pci_state(pdev); /* Shut down the device */ - pci_disable_device(dev->pdev); - pci_set_power_state(dev->pdev, PCI_D3cold); + pci_disable_device(pdev); + pci_set_power_state(pdev, PCI_D3cold); dev->switch_power_state = DRM_SWITCH_POWER_OFF; } } @@ -1707,8 +1707,7 @@ static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) adev->enable_virtual_display = false; if (amdgpu_virtual_display) { - struct drm_device *ddev = adev_to_drm(adev); - const char *pci_address_name = pci_name(ddev->pdev); + const char *pci_address_name = pci_name(adev->pdev); char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); @@ -3401,7 +3400,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, } } - pci_enable_pcie_error_reporting(adev->ddev.pdev); + pci_enable_pcie_error_reporting(adev->pdev); /* Post card if necessary */ if (amdgpu_device_need_post(adev)) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c index f764803c53a4..0150a51b65ef 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c @@ -926,6 +926,7 @@ amdgpu_display_user_framebuffer_create(struct drm_device *dev, struct drm_file *file_priv, const struct drm_mode_fb_cmd2 *mode_cmd) { + struct amdgpu_device *adev = drm_to_adev(dev); struct drm_gem_object *obj; struct amdgpu_framebuffer *amdgpu_fb; int ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 72efd579ec5e..b4ea67e12ada 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1204,7 +1204,6 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, if (ret) return ret; - ddev->pdev = pdev; pci_set_drvdata(pdev, 
ddev); ret = amdgpu_driver_load_kms(adev, ent->driver_data); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c index 0bf7d36c6686..51cd49c6f38f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c @@ -271,7 +271,7 @@ static int amdgpufb_create(struct drm_fb_helper *helper, DRM_INFO("fb depth is %d\n", fb->format->depth); DRM_INFO(" pitch is %d\n", fb->pitches[0]); - vga_switcheroo_client_fb_set(adev_to_drm(adev)->pdev, info); + vga_switcheroo_client_fb_set(adev->pdev, info); return 0; out: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index d0a1fee1f5f6..a5c42c3004a0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -619,7 +619,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, int r = 0; if (args->va_address < AMDGPU_VA_RESERVED_SIZE) { - dev_dbg(&dev->pdev->dev, + dev_dbg(dev->dev, "va_address 0x%LX is in reserved area 0x%LX\n", args->va_address, AMDGPU_VA_RESERVED_SIZE); return -EINVAL; @@ -627,7 +627,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, if (args->va_address >= AMDGPU_GMC_HOLE_START && args->va_address < AMDGPU_GMC_HOLE_END) { - dev_dbg(&dev->pdev->dev, + dev_dbg(dev->dev, "va_address 0x%LX is in VA hole 0x%LX-0x%LX\n", args->va_address, AMDGPU_GMC_HOLE_START, AMDGPU_GMC_HOLE_END); @@ -639,14 +639,14 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, vm_size = adev->vm_manager.max_pfn * AMDGPU_GPU_PAGE_SIZE; vm_size -= AMDGPU_VA_RESERVED_SIZE; if (args->va_address + args->map_size > vm_size) { - dev_dbg(&dev->pdev->dev, + dev_dbg(dev->dev, "va_address 0x%llx is in top reserved area 0x%llx\n", args->va_address + args->map_size, vm_size); return -EINVAL; } if ((args->flags & ~valid_flags) && (args->flags & ~prt_flags)) { - dev_dbg(&dev->pdev->dev, "invalid flags combination 0x%08X\n", + dev_dbg(dev->dev, "invalid flags combination 0x%08X\n", args->flags); return -EINVAL; } @@ -658,7 +658,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, case AMDGPU_VA_OP_REPLACE: break; default: - dev_dbg(&dev->pdev->dev, "unsupported operation %d\n", + dev_dbg(dev->dev, "unsupported operation %d\n", args->operation); return -EINVAL; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c index 47cad23a6b9e..bca4dddd5a15 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c @@ -176,7 +176,7 @@ struct amdgpu_i2c_chan *amdgpu_i2c_create(struct drm_device *dev, i2c->rec = *rec; i2c->adapter.owner = THIS_MODULE; i2c->adapter.class = I2C_CLASS_DDC; - i2c->adapter.dev.parent = &dev->pdev->dev; + i2c->adapter.dev.parent = dev->dev; i2c->dev = dev; i2c_set_adapdata(&i2c->adapter, i2c); mutex_init(&i2c->mutex); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index b16b32797624..3c37cf1ae8b7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -142,7 +142,7 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, unsigned long flags) (amdgpu_is_atpx_hybrid() || amdgpu_has_atpx_dgpu_power_cntl()) && ((flags & AMD_IS_APU) == 0) && - !pci_is_thunderbolt_attached(dev->pdev)) + !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) flags |= AMD_IS_PX; parent = pci_upstream_bridge(adev->pdev); @@ -156,7 +156,7 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, unsigned long flags) */ r 
= amdgpu_device_init(adev, flags); if (r) { - dev_err(&dev->pdev->dev, "Fatal error during GPU init\n"); + dev_err(dev->dev, "Fatal error during GPU init\n"); goto out; } @@ -199,7 +199,7 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, unsigned long flags) acpi_status = amdgpu_acpi_init(adev); if (acpi_status) - dev_dbg(&dev->pdev->dev, "Error during ACPI methods call\n"); + dev_dbg(dev->dev, "Error during ACPI methods call\n"); if (adev->runpm) { /* only need to skip on ATPX */ @@ -735,10 +735,10 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) if (!dev_info) return -ENOMEM; - dev_info->device_id = dev->pdev->device; + dev_info->device_id = adev->pdev->device; dev_info->chip_rev = adev->rev_id; dev_info->external_rev = adev->external_rev_id; - dev_info->pci_rev = dev->pdev->revision; + dev_info->pci_rev = adev->pdev->revision; dev_info->family = adev->family; dev_info->num_shader_engines = adev->gfx.config.max_shader_engines; dev_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se; -- cgit v1.2.3-70-g09d2 From 9882e278536e6a78a49506aa70318db0455b5d74 Mon Sep 17 00:00:00 2001 From: "Emily.Deng" Date: Wed, 6 Jan 2021 19:34:58 +0800 Subject: drm/amdgpu: Decrease compute timeout to 10 s for sriov multiple VF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For multiple VF, after an engine hang the host driver will first encounter an FLR, so there is no point in setting the compute timeout to 60 s. v2: Refine the patch and comment Signed-off-by: Emily.Deng Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index dd67b589b4ab..76d4a8ae2fb4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3117,7 +3117,10 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) */ adev->gfx_timeout = msecs_to_jiffies(10000); adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; - if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) + if (amdgpu_sriov_vf(adev)) + adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? + msecs_to_jiffies(60000) : msecs_to_jiffies(10000); + else if (amdgpu_passthrough(adev)) adev->compute_timeout = msecs_to_jiffies(60000); else adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; -- cgit v1.2.3-70-g09d2 From b6903089a5ab74e8bcae963d5ca60b0005b75c05 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 25 Nov 2020 11:21:32 -0500 Subject: drm/amdgpu: Enable GPU reset for vangogh Enable GPU reset when we encounter a hang.
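Several patches in this log (navy_flounder above, vangogh here, dimgrey_cavefish further down) follow the same pattern: default GPU recovery is opt-in per ASIC, gated by a switch statement in amdgpu_device_should_recover_gpu(), so enabling it is a one-line case addition. Abridged sketch of that gate, condensed from the hunks in these patches (the full chip list is longer):

    /* amdgpu_gpu_recovery == -1 means "auto": use the per-ASIC default. */
    switch (adev->asic_type) {
    case CHIP_NAVI10:
    case CHIP_NAVI14:
    case CHIP_NAVI12:
    case CHIP_SIENNA_CICHLID:
    case CHIP_NAVY_FLOUNDER:
    case CHIP_VANGOGH:          /* added by this patch */
        break;                  /* recovery enabled by default */
    default:
        goto disabled;          /* recovery stays off unless forced */
    }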
Acked-by: Evan Quan Signed-off-by: Alex Deucher Reviewed-by: Huang Rui Signed-off-by: Huang Rui --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 76d4a8ae2fb4..4d434803fb49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4210,6 +4210,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) case CHIP_NAVI12: case CHIP_SIENNA_CICHLID: case CHIP_NAVY_FLOUNDER: + case CHIP_VANGOGH: break; default: goto disabled; -- cgit v1.2.3-70-g09d2 From 0d7ab835463eb41265c18184a4867a1e9f0864da Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Tue, 12 Jan 2021 18:46:33 +0100 Subject: drm/amdgpu: Remove unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove unused space_needed variable. Fixes: 453f617a30aa ("drm/amdgpu: Resize BAR0 to the maximum available size, even if it doesn't cover VRAM") Signed-off-by: Nirmoy Das Reviewed-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Christian König Link: https://patchwork.freedesktop.org/patch/413807/ --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1ec308de5826..168f3adabf57 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1106,7 +1106,6 @@ void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) */ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) { - u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size); int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); struct pci_bus *root; struct resource *res; -- cgit v1.2.3-70-g09d2 From 2b3a1f515fe1fc01c2359facfaae9fc44dda3a41 Mon Sep 17 00:00:00 2001 From: Feifei Xu Date: Tue, 19 Jan 2021 17:46:25 +0800 Subject: drm/amdgpu:Add pcie gen5 support in pcie capability. Add PCIE_SPEED_32_0GT and PCIE GEN5 support for amdgpu. 
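The CAIL masks extended by the hunks below are cumulative: a link that supports Gen5 also advertises every lower generation, so the new case simply ORs one more bit into the existing set. A quick sanity check on the ASIC-side values from amd_pcie.h (the macro name here is illustrative):

    #define GEN_MASK_UP_TO_GEN5 (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | \
                                 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | \
                                 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | \
                                 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | \
                                 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5)
    /* 0x01 | 0x02 | 0x04 | 0x08 | 0x10 == 0x1f: all five generations set
     * when speed_cap == PCIE_SPEED_32_0GT. */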
Signed-off-by: Feifei Xu Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 ++++++++++++++-- drivers/gpu/drm/amd/include/amd_pcie.h | 2 ++ 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index bfaa99354bbc..e3e5c48fbc8c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4793,7 +4793,13 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); } else { - if (speed_cap == PCIE_SPEED_16_0GT) + if (speed_cap == PCIE_SPEED_32_0GT) + adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | + CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | + CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | + CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | + CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); + else if (speed_cap == PCIE_SPEED_16_0GT) adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | @@ -4813,7 +4819,13 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); } else { - if (platform_speed_cap == PCIE_SPEED_16_0GT) + if (platform_speed_cap == PCIE_SPEED_32_0GT) + adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | + CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | + CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | + CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | + CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); + else if (platform_speed_cap == PCIE_SPEED_16_0GT) adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | diff --git a/drivers/gpu/drm/amd/include/amd_pcie.h b/drivers/gpu/drm/amd/include/amd_pcie.h index 9cb9ceb4d74d..a1ece3eecdf5 100644 --- a/drivers/gpu/drm/amd/include/amd_pcie.h +++ b/drivers/gpu/drm/amd/include/amd_pcie.h @@ -28,6 +28,7 @@ #define CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 0x00020000 #define CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 0x00040000 #define CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 0x00080000 +#define CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5 0x00100000 #define CAIL_PCIE_LINK_SPEED_SUPPORT_MASK 0xFFFF0000 #define CAIL_PCIE_LINK_SPEED_SUPPORT_SHIFT 16 @@ -36,6 +37,7 @@ #define CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 0x00000002 #define CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 0x00000004 #define CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 0x00000008 +#define CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5 0x00000010 #define CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_MASK 0x0000FFFF #define CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_SHIFT 0 -- cgit v1.2.3-70-g09d2 From dcb820d185f50d34d0b8ca21230d8d57e0aca7cc Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Tue, 19 Jan 2021 13:35:21 +0800 Subject: drm/amdgpu: remove gpu info firmware of green sardine The ip discovery is supported on green sardine, it doesn't need gpu info firmware anymore. 
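For context on why this is a safe one-line removal: MODULE_FIRMWARE() only records the blob name in the module metadata so packaging and initramfs tooling know to ship it; the actual load is a runtime request_firmware() call keyed off the chip name, roughly as sketched below (details of the real helper in amdgpu_device.c may differ). ASICs that rely on IP discovery never reach this path, so they need no MODULE_FIRMWARE() entry either.

    char fw_name[40];
    int err;

    /* e.g. "amdgpu/navi10_gpu_info.bin" */
    snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
    err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
    if (err)
        dev_err(adev->dev, "Failed to load gpu_info firmware \"%s\"\n", fw_name);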
Signed-off-by: Huang Rui Reviewed-by: Prike Liang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e3e5c48fbc8c..e4e1f04d9164 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -81,7 +81,6 @@ MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); -MODULE_FIRMWARE("amdgpu/green_sardine_gpu_info.bin"); #define AMDGPU_RESUME_MS 2000 -- cgit v1.2.3-70-g09d2 From 91fb309d8294be5ab03746638e10bb3e5680f348 Mon Sep 17 00:00:00 2001 From: Horace Chen Date: Wed, 20 Jan 2021 22:03:28 +0800 Subject: drm/amdgpu: race issue when jobs on 2 ring timeout Fix a race issue when jobs on 2 rings time out simultaneously. If 2 rings time out at the same time, amdgpu_device_gpu_recover will be re-entered, and adev->gmc.xgmi.head will be grabbed by 2 local linked lists, which may cause a wild-pointer issue while iterating. Lock the device early to prevent the node from being added to 2 different lists. Also increase karma for the skipped job, since that job timed out as well and should be considered guilty. Signed-off-by: Horace Chen Reviewed-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 69 +++++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 10 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e4e1f04d9164..9ae0f2d68281 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4461,6 +4461,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) up_write(&adev->reset_sem); } +/* + * to lockup a list of amdgpu devices in a hive safely, if not a hive + * with multiple nodes, it will be similar as amdgpu_device_lock_adev. + * + * unlock won't require roll back. + */ +static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) +{ + struct amdgpu_device *tmp_adev = NULL; + + if (adev->gmc.xgmi.num_physical_nodes > 1) { + if (!hive) { + dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes"); + return -ENODEV; + } + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + if (!amdgpu_device_lock_adev(tmp_adev, hive)) + goto roll_back; + } + } else if (!amdgpu_device_lock_adev(adev, hive)) + return -EAGAIN; + + return 0; +roll_back: + if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) { + /* + * if the lockup iteration break in the middle of a hive, + * it may means there may has a race issue, + * or a hive device locked up independently. + * we may be in trouble and may not, so will try to roll back + * the lock and give out a warnning. + */ + dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle.
Rolling back to unlock"); + list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) { + amdgpu_device_unlock_adev(tmp_adev); + } + } + return -EAGAIN; +} + static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) { struct pci_dev *p = NULL; @@ -4574,11 +4614,29 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", job ? job->base.id : -1, hive->hive_id); amdgpu_put_xgmi_hive(hive); + if (job) + drm_sched_increase_karma(&job->base); return 0; } mutex_lock(&hive->hive_lock); } + /* + * lock the device before we try to operate the linked list + * if didn't get the device lock, don't touch the linked list since + * others may iterating it. + */ + r = amdgpu_device_lock_hive_adev(adev, hive); + if (r) { + dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", + job ? job->base.id : -1); + + /* even we skipped this reset, still need to set the job to guilty */ + if (job) + drm_sched_increase_karma(&job->base); + goto skip_recovery; + } + /* * Build list of devices to reset. * In case we are in XGMI hive mode, resort the device list @@ -4586,8 +4644,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, */ INIT_LIST_HEAD(&device_list); if (adev->gmc.xgmi.num_physical_nodes > 1) { - if (!hive) - return -ENODEV; if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); device_list_handle = &hive->device_list; @@ -4598,13 +4654,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - if (!amdgpu_device_lock_adev(tmp_adev, hive)) { - dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", - job ? job->base.id : -1); - r = 0; - goto skip_recovery; - } - /* * Try to put the audio codec into suspend state * before gpu reset started. @@ -4742,7 +4791,7 @@ skip_recovery: amdgpu_put_xgmi_hive(hive); } - if (r) + if (r && r != -EAGAIN) dev_info(adev->dev, "GPU reset end with ret = %d\n", r); return r; } -- cgit v1.2.3-70-g09d2 From 33cf440d594bfbf81fc20604957bc64f02d0b560 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 26 Jan 2021 21:57:00 -0500 Subject: drm/amdgpu: disable gpu reset on Vangogh for now Until the issues in the SMU firmware are fixed. 
Signed-off-by: Alex Deucher Acked-by: Huang Rui Signed-off-by: Huang Rui --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 - drivers/gpu/drm/amd/amdgpu/nv.c | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9ae0f2d68281..51bea409e513 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4211,7 +4211,6 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) case CHIP_NAVI12: case CHIP_SIENNA_CICHLID: case CHIP_NAVY_FLOUNDER: - case CHIP_VANGOGH: break; default: goto disabled; diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index b9722282d1b8..66279f0c6808 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -411,6 +411,10 @@ static int nv_asic_reset(struct amdgpu_device *adev) int ret = 0; struct smu_context *smu = &adev->smu; + /* skip reset on vangogh for now */ + if (adev->asic_type == CHIP_VANGOGH) + return 0; + switch (nv_asic_reset_method(adev)) { case AMD_RESET_METHOD_BACO: dev_info(adev->dev, "BACO reset\n"); -- cgit v1.2.3-70-g09d2 From af484df800e356725c39f52e7cbe8b47f1753453 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 4 Feb 2021 10:29:19 -0500 Subject: drm/amdgpu: add generic pci reset as an option This allows us to use generic PCI reset mechanisms (FLR, SBR) as a reset mechanism to verify that the generic PCI reset mechanisms are working properly. Acked-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 ++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 ++-- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index dbf8c8a0878c..9717cb43eb26 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -579,7 +579,8 @@ enum amd_reset_method { AMD_RESET_METHOD_MODE0, AMD_RESET_METHOD_MODE1, AMD_RESET_METHOD_MODE2, - AMD_RESET_METHOD_BACO + AMD_RESET_METHOD_BACO, + AMD_RESET_METHOD_PCI, }; /* @@ -1228,6 +1229,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev); int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job); void amdgpu_device_pci_config_reset(struct amdgpu_device *adev); +int amdgpu_device_pci_reset(struct amdgpu_device *adev); bool amdgpu_device_need_post(struct amdgpu_device *adev); void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 51bea409e513..e6a72a7fc311 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -929,6 +929,18 @@ void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); } +/** + * amdgpu_device_pci_reset - reset the GPU using generic PCI means + * + * @adev: amdgpu_device pointer + * + * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). + */ +int amdgpu_device_pci_reset(struct amdgpu_device *adev) +{ + return pci_reset_function(adev->pdev); +} + /* * GPU doorbell aperture helpers function. 
*/ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 9173da34dd22..80c46f648d5c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -793,9 +793,9 @@ module_param_named(tmz, amdgpu_tmz, int, 0444); /** * DOC: reset_method (int) - * GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco) + * GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco, 5 = pci) */ -MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco/bamaco)"); +MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco/bamaco, 5 = pci)"); module_param_named(reset_method, amdgpu_reset_method, int, 0444); /** -- cgit v1.2.3-70-g09d2 From a8d3d80a8ca3df47a846937809fc1e1d8e8fbce2 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 4 Feb 2021 09:11:14 -0500 Subject: drm/amdgpu: drop extra drm_kms_helper_poll_enable/disable calls These are already called in amdgpu_device_suspend/resume which are already called in the same functions. Acked-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e6a72a7fc311..0ee6514ee55c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1445,10 +1445,8 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, amdgpu_device_resume(dev, true); dev->switch_power_state = DRM_SWITCH_POWER_ON; - drm_kms_helper_poll_enable(dev); } else { pr_info("switched off\n"); - drm_kms_helper_poll_disable(dev); dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; amdgpu_device_suspend(dev, true); amdgpu_device_cache_pci_state(pdev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 80c46f648d5c..0b07cc15ebdd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1348,7 +1348,6 @@ static int amdgpu_pmops_runtime_suspend(struct device *dev) adev->in_runpm = true; if (amdgpu_device_supports_atpx(drm_dev)) drm_dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; - drm_kms_helper_poll_disable(drm_dev); ret = amdgpu_device_suspend(drm_dev, false); if (ret) @@ -1405,7 +1404,6 @@ static int amdgpu_pmops_runtime_resume(struct device *dev) amdgpu_device_baco_exit(drm_dev); } ret = amdgpu_device_resume(drm_dev, false); - drm_kms_helper_poll_enable(drm_dev); if (amdgpu_device_supports_atpx(drm_dev)) drm_dev->switch_power_state = DRM_SWITCH_POWER_ON; adev->in_runpm = false; -- cgit v1.2.3-70-g09d2 From ad887af9b6d0d5d7866a3953563fb0fee7556ea8 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 4 Feb 2021 09:29:08 -0500 Subject: drm/amdgpu: use runpm flag rather than fbcon for kfd runtime suspend (v2) the flag used by kfd is not actually related to fbcon, it just happens to align. Use the runpm flag instead so that we can decouple it from the fbcon flag. 
v2: fix resume as well Reviewed-by: Felix Kuehling Reviewed-by: Rajneesh Bhardwaj Acked-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0ee6514ee55c..b7ebd424bbc7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3734,7 +3734,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) r = amdgpu_device_ip_suspend_phase1(adev); - amdgpu_amdkfd_suspend(adev, !fbcon); + amdgpu_amdkfd_suspend(adev, adev->in_runpm); /* evict vram memory */ amdgpu_bo_evict_vram(adev); @@ -3818,7 +3818,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) } } } - r = amdgpu_amdkfd_resume(adev, !fbcon); + r = amdgpu_amdkfd_resume(adev, adev->in_runpm); if (r) return r; -- cgit v1.2.3-70-g09d2 From 27859ee3df9761a944535e45a96098027450808c Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Fri, 5 Feb 2021 14:22:23 +0800 Subject: drm/amdgpu: enable gpu recovery for dimgrey_cavefish As dimgrey_cavefish driver is stable enough, set gpu recovery as default in HW hang for dimgrey_cavefish. Signed-off-by: Tao Zhou Reviewed-by: Jiansong Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b7ebd424bbc7..7052dc35d278 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4221,6 +4221,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) case CHIP_NAVI12: case CHIP_SIENNA_CICHLID: case CHIP_NAVY_FLOUNDER: + case CHIP_DIMGREY_CAVEFISH: break; default: goto disabled; -- cgit v1.2.3-70-g09d2