From c9a6b82f45e261d247b980a7949aaa6a9bfffe01 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 29 Jul 2020 12:59:45 -0400 Subject: drm/amdgpu: Implement DPC recovery Add PCI Downstream Port Containment (DPC) with basic recovery functionality v2: remove pci_save_state to avoid breaking suspend/resume v3: Fix style comments v4: Improve description. Signed-off-by: Andrey Grodzovsky Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 4009d2e30727..29c0b4c87231 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -49,6 +49,8 @@ #include #include #include +#include +#include #include #include @@ -1260,6 +1262,12 @@ static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) { return void amdgpu_register_gpu_instance(struct amdgpu_device *adev); void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev); +pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, + pci_channel_state_t state); +pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev); +pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev); +void amdgpu_pci_resume(struct pci_dev *pdev); + #include "amdgpu_object.h" /* used by df_v3_6.c and amdgpu_pmu.c */ -- cgit v1.2.3-70-g09d2 From bf36b52e781d7412c3fce826f74ba6a73b9be4d0 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 29 Jul 2020 13:10:29 -0400 Subject: drm/amdgpu: Avoid accessing HW when suspending SW state At this point the ASIC is already post reset by the HW/PSP so the HW not in proper state to be configured for suspension, some blocks might be even gated and so best is to avoid touching it. v2: Rename in_dpc to more meaningful name Signed-off-by: Andrey Grodzovsky Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 38 ++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 6 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 6 +++++ drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 18 ++++++++------ drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 3 +++ 6 files changed, 65 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 29c0b4c87231..207eba012029 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -989,6 +989,7 @@ struct amdgpu_device { atomic_t throttling_logging_enabled; struct ratelimit_state throttling_logging_rs; uint32_t ras_features; + bool in_pci_err_recovery; }; static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 99c0e6e53e84..4d9a8734ea20 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -319,6 +319,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, { uint32_t ret; + if (adev->in_pci_err_recovery) + return 0; + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && down_read_trylock(&adev->reset_sem)) { ret = amdgpu_kiq_rreg(adev, reg); @@ -356,6 +359,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, * Returns the 8 bit value from the offset specified. */ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { + if (adev->in_pci_err_recovery) + return 0; + if (offset < adev->rmmio_size) return (readb(adev->rmmio + offset)); BUG(); @@ -377,6 +383,9 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { * Writes the value specified to the offset specified. */ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { + if (adev->in_pci_err_recovery) + return; + if (offset < adev->rmmio_size) writeb(value, adev->rmmio + offset); else @@ -387,6 +396,9 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { + if (adev->in_pci_err_recovery) + return; + trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); if ((reg * 4) < adev->rmmio_size) @@ -414,6 +426,9 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { + if (adev->in_pci_err_recovery) + return; + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && down_read_trylock(&adev->reset_sem)) { amdgpu_kiq_wreg(adev, reg, v); @@ -432,6 +447,9 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { + if (adev->in_pci_err_recovery) + return; + if (amdgpu_sriov_fullaccess(adev) && adev->gfx.rlc.funcs && adev->gfx.rlc.funcs->is_rlcg_access_range) { @@ -453,6 +471,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t */ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) { + if (adev->in_pci_err_recovery) + return 0; + if ((reg * 4) < adev->rio_mem_size) return ioread32(adev->rio_mem + (reg * 4)); else { @@ -472,6 +493,9 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) */ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) { + if (adev->in_pci_err_recovery) + return; + if ((reg * 4) < adev->rio_mem_size) iowrite32(v, adev->rio_mem + (reg * 4)); else { @@ -491,6 +515,9 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) */ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) { + if (adev->in_pci_err_recovery) + return 0; + if (index < adev->doorbell.num_doorbells) { return readl(adev->doorbell.ptr + index); } else { @@ -511,6 +538,9 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) { + if (adev->in_pci_err_recovery) + return; + if (index < adev->doorbell.num_doorbells) { writel(v, adev->doorbell.ptr + index); } else { @@ -529,6 +559,9 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) */ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) { + if (adev->in_pci_err_recovery) + return 0; + if (index < adev->doorbell.num_doorbells) { return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); } else { @@ -549,6 +582,9 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) { + if (adev->in_pci_err_recovery) + return; + if (index < adev->doorbell.num_doorbells) { atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); } else { @@ -4778,7 +4814,9 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) pci_restore_state(pdev); + adev->in_pci_err_recovery = true; r = amdgpu_device_ip_suspend(adev); + adev->in_pci_err_recovery = false; if (r) goto out; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index d6981425ec51..8c9bacfdbc30 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -693,6 +693,9 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) struct amdgpu_kiq *kiq = &adev->gfx.kiq; struct amdgpu_ring *ring = &kiq->ring; + if (adev->in_pci_err_recovery) + return 0; + BUG_ON(!ring->funcs->emit_rreg); spin_lock_irqsave(&kiq->ring_lock, flags); @@ -757,6 +760,9 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) BUG_ON(!ring->funcs->emit_wreg); + if (adev->in_pci_err_recovery) + return; + spin_lock_irqsave(&kiq->ring_lock, flags); amdgpu_ring_alloc(ring, 32); amdgpu_ring_emit_wreg(ring, reg, v); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index d6c38e24f130..a7771aaa1c51 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -219,6 +219,9 @@ int psp_wait_for(struct psp_context *psp, uint32_t reg_index, int i; struct amdgpu_device *adev = psp->adev; + if (psp->adev->in_pci_err_recovery) + return 0; + for (i = 0; i < adev->usec_timeout; i++) { val = RREG32(reg_index); if (check_changed) { @@ -245,6 +248,9 @@ psp_cmd_submit_buf(struct psp_context *psp, bool ras_intr = false; bool skip_unsupport = false; + if (psp->adev->in_pci_err_recovery) + return 0; + mutex_lock(&psp->mutex); memset(psp->cmd_buf_mem, 0, PSP_CMD_BUFFER_SIZE); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index d502e30f67d9..9801bb6daa01 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -6980,15 +6980,19 @@ static int gfx_v10_0_hw_fini(void *handle) amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); + + if (!adev->in_pci_err_recovery) { #ifndef BRING_UP_DEBUG - if (amdgpu_async_gfx_ring) { - r = gfx_v10_0_kiq_disable_kgq(adev); - if (r) - DRM_ERROR("KGQ disable failed\n"); - } + if (amdgpu_async_gfx_ring) { + r = gfx_v10_0_kiq_disable_kgq(adev); + if (r) + DRM_ERROR("KGQ disable failed\n"); + } #endif - if (amdgpu_gfx_disable_kcq(adev)) - DRM_ERROR("KCQ disable failed\n"); + if (amdgpu_gfx_disable_kcq(adev)) + DRM_ERROR("KCQ disable failed\n"); + } + if (amdgpu_sriov_vf(adev)) { gfx_v10_0_cp_gfx_enable(adev, false); /* Program KIQ position of RLC_CP_SCHEDULERS during destroy */ diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c index a58ea08cd115..97aa72a94631 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c @@ -112,6 +112,9 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, struct amdgpu_device *adev = smu->adev; int ret = 0, index = 0; + if (smu->adev->in_pci_err_recovery) + return 0; + index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG, msg); -- cgit v1.2.3-70-g09d2 From c1dd4aa624076cb6d4724fad2d9e9e71e46bbc9f Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Mon, 24 Aug 2020 12:30:47 -0400 Subject: drm/amdgpu: Fix consecutive DPC recovery failures. Cache the PCI state on boot and before each case where we might loose it. v2: Add pci_restore_state while caching the PCI state to avoid breaking PCI core logic for stuff like suspend/resume. v3: Extract pci_restore_state from amdgpu_device_cache_pci_state to avoid superflous restores during GPU resets and suspend/resumes. v4: Style fixes. Signed-off-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 5 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 62 ++++++++++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 +- drivers/gpu/drm/amd/amdgpu/nv.c | 4 +- drivers/gpu/drm/amd/amdgpu/soc15.c | 4 +- 5 files changed, 70 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 207eba012029..6125ba905faf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -989,7 +989,9 @@ struct amdgpu_device { atomic_t throttling_logging_enabled; struct ratelimit_state throttling_logging_rs; uint32_t ras_features; + bool in_pci_err_recovery; + struct pci_saved_state *pci_state; }; static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) @@ -1269,6 +1271,9 @@ pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev); pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev); void amdgpu_pci_resume(struct pci_dev *pdev); +bool amdgpu_device_cache_pci_state(struct pci_dev *pdev); +bool amdgpu_device_load_pci_state(struct pci_dev *pdev); + #include "amdgpu_object.h" /* used by df_v3_6.c and amdgpu_pmu.c */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 36a8f128fb6b..9be9355bb0e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1292,7 +1292,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; pci_set_power_state(dev->pdev, PCI_D0); - pci_restore_state(dev->pdev); + amdgpu_device_load_pci_state(dev->pdev); r = pci_enable_device(dev->pdev); if (r) DRM_WARN("pci_enable_device failed (%d)\n", r); @@ -1305,7 +1305,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, drm_kms_helper_poll_disable(dev); dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; amdgpu_device_suspend(dev, true); - pci_save_state(dev->pdev); + amdgpu_device_cache_pci_state(dev->pdev); /* Shut down the device */ pci_disable_device(dev->pdev); pci_set_power_state(dev->pdev, PCI_D3cold); @@ -3408,6 +3408,10 @@ fence_driver_init: if (r) dev_err(adev->dev, "amdgpu_pmu_init failed\n"); + /* Have stored pci confspace at hand for restore in sudden PCI error */ + if (amdgpu_device_cache_pci_state(adev->pdev)) + pci_restore_state(pdev); + return 0; failed: @@ -3432,6 +3436,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) flush_delayed_work(&adev->delayed_init_work); adev->shutdown = true; + kfree(adev->pci_state); + /* make sure IB test finished before entering exclusive mode * to avoid preemption on IB test * */ @@ -4852,7 +4858,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) /* wait for asic to come out of reset */ msleep(500); - pci_restore_state(pdev); + amdgpu_device_load_pci_state(pdev); /* confirm ASIC came out of reset */ for (i = 0; i < adev->usec_timeout; i++) { @@ -4932,6 +4938,9 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) out: if (!r) { + if (amdgpu_device_cache_pci_state(adev->pdev)) + pci_restore_state(adev->pdev); + DRM_INFO("PCIe error recovery succeeded\n"); } else { DRM_ERROR("PCIe error recovery failed, err:%d", r); @@ -4971,3 +4980,50 @@ void amdgpu_pci_resume(struct pci_dev *pdev) amdgpu_device_unlock_adev(adev); } + +bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int r; + + r = pci_save_state(pdev); + if (!r) { + kfree(adev->pci_state); + + adev->pci_state = pci_store_saved_state(pdev); + + if (!adev->pci_state) { + DRM_ERROR("Failed to store PCI saved state"); + return false; + } + } else { + DRM_WARN("Failed to save PCI state, err:%d\n", r); + return false; + } + + return true; +} + +bool amdgpu_device_load_pci_state(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int r; + + if (!adev->pci_state) + return false; + + r = pci_load_saved_state(pdev, adev->pci_state); + + if (!r) { + pci_restore_state(pdev); + } else { + DRM_WARN("Failed to load PCI state, err:%d\n", r); + return false; + } + + return true; +} + + diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index e9daab175d17..5fafb3dff1a0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1315,7 +1315,7 @@ static int amdgpu_pmops_runtime_suspend(struct device *dev) if (amdgpu_is_atpx_hybrid()) { pci_ignore_hotplug(pdev); } else { - pci_save_state(pdev); + amdgpu_device_cache_pci_state(pdev); pci_disable_device(pdev); pci_ignore_hotplug(pdev); pci_set_power_state(pdev, PCI_D3cold); @@ -1348,7 +1348,7 @@ static int amdgpu_pmops_runtime_resume(struct device *dev) pci_set_master(pdev); } else { pci_set_power_state(pdev, PCI_D0); - pci_restore_state(pdev); + amdgpu_device_load_pci_state(pdev); ret = pci_enable_device(pdev); if (ret) return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index 4d1402356262..0ec66030bd11 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -311,7 +311,7 @@ static int nv_asic_mode1_reset(struct amdgpu_device *adev) /* disable BM */ pci_clear_master(adev->pdev); - pci_save_state(adev->pdev); + amdgpu_device_cache_pci_state(adev->pdev); if (amdgpu_dpm_is_mode1_reset_supported(adev)) { dev_info(adev->dev, "GPU smu mode1 reset\n"); @@ -323,7 +323,7 @@ static int nv_asic_mode1_reset(struct amdgpu_device *adev) if (ret) dev_err(adev->dev, "GPU mode1 reset failed\n"); - pci_restore_state(adev->pdev); + amdgpu_device_load_pci_state(adev->pdev); /* wait for asic to come out of reset */ for (i = 0; i < adev->usec_timeout; i++) { diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 2f93c475d6d8..ddd55e3176c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -484,13 +484,13 @@ static int soc15_asic_mode1_reset(struct amdgpu_device *adev) /* disable BM */ pci_clear_master(adev->pdev); - pci_save_state(adev->pdev); + amdgpu_device_cache_pci_state(adev->pdev); ret = psp_gpu_reset(adev); if (ret) dev_err(adev->dev, "GPU mode1 reset failed\n"); - pci_restore_state(adev->pdev); + amdgpu_device_load_pci_state(adev->pdev); /* wait for asic to come out of reset */ for (i = 0; i < adev->usec_timeout; i++) { -- cgit v1.2.3-70-g09d2