diff options
author | Dave Airlie <airlied@redhat.com> | 2020-09-23 09:25:17 +1000 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2020-09-23 09:25:18 +1000 |
commit | fc88fef916e8971eefeacc62241b7408b7e7939d (patch) | |
tree | c2647e7388a752ffc1f9d12b5aaf9a83a69b5ba0 /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | |
parent | db29dc7d3346328b3fc61057d224a242f9928289 (diff) | |
parent | 911d5bd5e7b8531b39301c2c27e5b90d7bd71b88 (diff) |
Merge tag 'amd-drm-next-5.10-2020-09-18' of git://people.freedesktop.org/~agd5f/linux into drm-next
amd-drm-next-5.10-2020-09-18:
amdgpu:
- Support for PCIe DPC recovery
- Sienna Cichlid updates
- Navy Flounder updates
- RAS fixes
- Refactor DC interrupt handling
- Display fixes
- Fix issues with OLED panels
- Mclk fixes for navi1x
- Watermark fixes for renoir and raven2
- Misc code cleanups
- Misc bug fixes
amdkfd:
- Fix a memory leak
- Fix a crach in GPU reset
- Add process eviction counters
radeon:
- expose sclk via sysfs hwmon interface
- Revert bad PLL fix
scheduler:
- Kernel doc fixes
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexdeucher@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200918204322.3931-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 306 |
1 files changed, 294 insertions, 12 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f7307af76452..2ff43a3d52fc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -319,6 +319,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, { uint32_t ret; + if (adev->in_pci_err_recovery) + return 0; + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && down_read_trylock(&adev->reset_sem)) { ret = amdgpu_kiq_rreg(adev, reg); @@ -355,7 +358,11 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, * * Returns the 8 bit value from the offset specified. */ -uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { +uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) +{ + if (adev->in_pci_err_recovery) + return 0; + if (offset < adev->rmmio_size) return (readb(adev->rmmio + offset)); BUG(); @@ -376,7 +383,11 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { * * Writes the value specified to the offset specified. */ -void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { +void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) +{ + if (adev->in_pci_err_recovery) + return; + if (offset < adev->rmmio_size) writeb(value, adev->rmmio + offset); else @@ -387,6 +398,9 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { + if (adev->in_pci_err_recovery) + return; + trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); if ((reg * 4) < adev->rmmio_size) @@ -414,6 +428,9 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { + if (adev->in_pci_err_recovery) + return; + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && down_read_trylock(&adev->reset_sem)) { amdgpu_kiq_wreg(adev, reg, v); @@ -432,6 +449,9 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { + if (adev->in_pci_err_recovery) + return; + if (amdgpu_sriov_fullaccess(adev) && adev->gfx.rlc.funcs && adev->gfx.rlc.funcs->is_rlcg_access_range) { @@ -453,6 +473,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t */ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) { + if (adev->in_pci_err_recovery) + return 0; + if ((reg * 4) < adev->rio_mem_size) return ioread32(adev->rio_mem + (reg * 4)); else { @@ -472,6 +495,9 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) */ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) { + if (adev->in_pci_err_recovery) + return; + if ((reg * 4) < adev->rio_mem_size) iowrite32(v, adev->rio_mem + (reg * 4)); else { @@ -491,6 +517,9 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) */ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) { + if (adev->in_pci_err_recovery) + return 0; + if (index < adev->doorbell.num_doorbells) { return readl(adev->doorbell.ptr + index); } else { @@ -511,6 +540,9 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) { + if (adev->in_pci_err_recovery) + return; + if (index < adev->doorbell.num_doorbells) { writel(v, adev->doorbell.ptr + index); } else { @@ -529,6 +561,9 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) */ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) { + if (adev->in_pci_err_recovery) + return 0; + if (index < adev->doorbell.num_doorbells) { return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); } else { @@ -549,6 +584,9 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) { + if (adev->in_pci_err_recovery) + return; + if (index < adev->doorbell.num_doorbells) { atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); } else { @@ -1256,7 +1294,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; pci_set_power_state(dev->pdev, PCI_D0); - pci_restore_state(dev->pdev); + amdgpu_device_load_pci_state(dev->pdev); r = pci_enable_device(dev->pdev); if (r) DRM_WARN("pci_enable_device failed (%d)\n", r); @@ -1269,7 +1307,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, drm_kms_helper_poll_disable(dev); dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; amdgpu_device_suspend(dev, true); - pci_save_state(dev->pdev); + amdgpu_device_cache_pci_state(dev->pdev); /* Shut down the device */ pci_disable_device(dev->pdev); pci_set_power_state(dev->pdev, PCI_D3cold); @@ -2999,6 +3037,7 @@ static const struct attribute *amdgpu_dev_attributes[] = { NULL }; + /** * amdgpu_device_init - initialize the driver * @@ -3170,13 +3209,13 @@ int amdgpu_device_init(struct amdgpu_device *adev, r = amdgpu_device_get_job_timeout_settings(adev); if (r) { dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); - return r; + goto failed_unmap; } /* early init functions */ r = amdgpu_device_ip_early_init(adev); if (r) - return r; + goto failed_unmap; /* doorbell bar mapping and doorbell index init*/ amdgpu_device_doorbell_init(adev); @@ -3217,6 +3256,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, } } + pci_enable_pcie_error_reporting(adev->ddev.pdev); + /* Post card if necessary */ if (amdgpu_device_need_post(adev)) { if (!adev->bios) { @@ -3359,16 +3400,18 @@ fence_driver_init: flush_delayed_work(&adev->delayed_init_work); r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); - if (r) { + if (r) dev_err(adev->dev, "Could not create amdgpu device attr\n"); - return r; - } if (IS_ENABLED(CONFIG_PERF_EVENTS)) r = amdgpu_pmu_init(adev); if (r) dev_err(adev->dev, "amdgpu_pmu_init failed\n"); + /* Have stored pci confspace at hand for restore in sudden PCI error */ + if (amdgpu_device_cache_pci_state(adev->pdev)) + pci_restore_state(pdev); + return 0; failed: @@ -3376,6 +3419,10 @@ failed: if (boco) vga_switcheroo_fini_domain_pm_ops(adev->dev); +failed_unmap: + iounmap(adev->rmmio); + adev->rmmio = NULL; + return r; } @@ -3393,6 +3440,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) flush_delayed_work(&adev->delayed_init_work); adev->shutdown = true; + kfree(adev->pci_state); + /* make sure IB test finished before entering exclusive mode * to avoid preemption on IB test * */ @@ -4072,7 +4121,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, struct list_head *device_list_handle, - bool *need_full_reset_arg) + bool *need_full_reset_arg, + bool skip_hw_reset) { struct amdgpu_device *tmp_adev = NULL; bool need_full_reset = *need_full_reset_arg, vram_lost = false; @@ -4082,7 +4132,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, * ASIC reset has to be done on all HGMI hive nodes ASAP * to allow proper links negotiation in FW (within 1 sec) */ - if (need_full_reset) { + if (!skip_hw_reset && need_full_reset) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { /* For XGMI run all resets in parallel to speed up the process */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { @@ -4477,7 +4527,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ if (r) adev->asic_reset_res = r; } else { - r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); + r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); if (r && r == -EAGAIN) goto retry; } @@ -4705,3 +4755,235 @@ int amdgpu_device_baco_exit(struct drm_device *dev) return 0; } + +static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) +{ + int i; + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + cancel_delayed_work_sync(&ring->sched.work_tdr); + } +} + +/** + * amdgpu_pci_error_detected - Called when a PCI error is detected. + * @pdev: PCI device struct + * @state: PCI channel state + * + * Description: Called when a PCI error is detected. + * + * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. + */ +pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int i; + + DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); + + if (adev->gmc.xgmi.num_physical_nodes > 1) { + DRM_WARN("No support for XGMI hive yet..."); + return PCI_ERS_RESULT_DISCONNECT; + } + + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + /* Fatal error, prepare for slot reset */ + case pci_channel_io_frozen: + /* + * Cancel and wait for all TDRs in progress if failing to + * set adev->in_gpu_reset in amdgpu_device_lock_adev + * + * Locking adev->reset_sem will prevent any external access + * to GPU during PCI error recovery + */ + while (!amdgpu_device_lock_adev(adev, NULL)) + amdgpu_cancel_all_tdr(adev); + + /* + * Block any work scheduling as we do for regular GPU reset + * for the duration of the recovery + */ + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + drm_sched_stop(&ring->sched, NULL); + } + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + /* Permanent error, prepare for device removal */ + return PCI_ERS_RESULT_DISCONNECT; + } + + return PCI_ERS_RESULT_NEED_RESET; +} + +/** + * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers + * @pdev: pointer to PCI device + */ +pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) +{ + + DRM_INFO("PCI error: mmio enabled callback!!\n"); + + /* TODO - dump whatever for debugging purposes */ + + /* This called only if amdgpu_pci_error_detected returns + * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still + * works, no need to reset slot. + */ + + return PCI_ERS_RESULT_RECOVERED; +} + +/** + * amdgpu_pci_slot_reset - Called when PCI slot has been reset. + * @pdev: PCI device struct + * + * Description: This routine is called by the pci error recovery + * code after the PCI slot has been reset, just before we + * should resume normal operations. + */ +pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int r, i; + bool need_full_reset = true; + u32 memsize; + struct list_head device_list; + + DRM_INFO("PCI error: slot reset callback!!\n"); + + INIT_LIST_HEAD(&device_list); + list_add_tail(&adev->gmc.xgmi.head, &device_list); + + /* wait for asic to come out of reset */ + msleep(500); + + /* Restore PCI confspace */ + amdgpu_device_load_pci_state(pdev); + + /* confirm ASIC came out of reset */ + for (i = 0; i < adev->usec_timeout; i++) { + memsize = amdgpu_asic_get_config_memsize(adev); + + if (memsize != 0xffffffff) + break; + udelay(1); + } + if (memsize == 0xffffffff) { + r = -ETIME; + goto out; + } + + adev->in_pci_err_recovery = true; + r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); + adev->in_pci_err_recovery = false; + if (r) + goto out; + + r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); + +out: + if (!r) { + if (amdgpu_device_cache_pci_state(adev->pdev)) + pci_restore_state(adev->pdev); + + DRM_INFO("PCIe error recovery succeeded\n"); + } else { + DRM_ERROR("PCIe error recovery failed, err:%d", r); + amdgpu_device_unlock_adev(adev); + } + + return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; +} + +/** + * amdgpu_pci_resume() - resume normal ops after PCI reset + * @pdev: pointer to PCI device + * + * Called when the error recovery driver tells us that its + * OK to resume normal operation. Use completion to allow + * halted scsi ops to resume. + */ +void amdgpu_pci_resume(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int i; + + + DRM_INFO("PCI error: resume callback!!\n"); + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + + drm_sched_resubmit_jobs(&ring->sched); + drm_sched_start(&ring->sched, true); + } + + amdgpu_device_unlock_adev(adev); +} + +bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int r; + + r = pci_save_state(pdev); + if (!r) { + kfree(adev->pci_state); + + adev->pci_state = pci_store_saved_state(pdev); + + if (!adev->pci_state) { + DRM_ERROR("Failed to store PCI saved state"); + return false; + } + } else { + DRM_WARN("Failed to save PCI state, err:%d\n", r); + return false; + } + + return true; +} + +bool amdgpu_device_load_pci_state(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int r; + + if (!adev->pci_state) + return false; + + r = pci_load_saved_state(pdev, adev->pci_state); + + if (!r) { + pci_restore_state(pdev); + } else { + DRM_WARN("Failed to load PCI state, err:%d\n", r); + return false; + } + + return true; +} + + |