diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 450 |
1 files changed, 275 insertions, 175 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0f82c5d21237..7d3b54615147 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -65,6 +65,7 @@ #include "amdgpu_ras.h" #include "amdgpu_pmu.h" #include "amdgpu_fru_eeprom.h" +#include "amdgpu_reset.h" #include <linux/suspend.h> #include <drm/task_barrier.h> @@ -137,7 +138,7 @@ static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, struct amdgpu_device *adev = drm_to_adev(ddev); uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); - return snprintf(buf, PAGE_SIZE, "%llu\n", cnt); + return sysfs_emit(buf, "%llu\n", cnt); } static DEVICE_ATTR(pcie_replay_count, S_IRUGO, @@ -161,7 +162,7 @@ static ssize_t amdgpu_device_get_product_name(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(ddev); - return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name); + return sysfs_emit(buf, "%s\n", adev->product_name); } static DEVICE_ATTR(product_name, S_IRUGO, @@ -183,7 +184,7 @@ static ssize_t amdgpu_device_get_product_number(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(ddev); - return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number); + return sysfs_emit(buf, "%s\n", adev->product_number); } static DEVICE_ATTR(product_number, S_IRUGO, @@ -205,25 +206,25 @@ static ssize_t amdgpu_device_get_serial_number(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(ddev); - return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial); + return sysfs_emit(buf, "%s\n", adev->serial); } static DEVICE_ATTR(serial_number, S_IRUGO, amdgpu_device_get_serial_number, NULL); /** - * amdgpu_device_supports_atpx - Is the device a dGPU with HG/PX power control + * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control * * @dev: drm_device pointer * - * Returns true if the device is a dGPU with HG/PX power control, + * Returns true if the device is a dGPU with ATPX power control, * otherwise return false. */ -bool amdgpu_device_supports_atpx(struct drm_device *dev) +bool amdgpu_device_supports_px(struct drm_device *dev) { struct amdgpu_device *adev = drm_to_adev(dev); - if (adev->flags & AMD_IS_PX) + if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) return true; return false; } @@ -233,14 +234,15 @@ bool amdgpu_device_supports_atpx(struct drm_device *dev) * * @dev: drm_device pointer * - * Returns true if the device is a dGPU with HG/PX power control, + * Returns true if the device is a dGPU with ACPI power control, * otherwise return false. */ bool amdgpu_device_supports_boco(struct drm_device *dev) { struct amdgpu_device *adev = drm_to_adev(dev); - if (adev->has_pr3) + if (adev->has_pr3 || + ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) return true; return false; } @@ -326,6 +328,35 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, /* * register access helper functions. */ + +/* Check if hw access should be skipped because of hotplug or device error */ +bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) +{ + if (adev->in_pci_err_recovery) + return true; + +#ifdef CONFIG_LOCKDEP + /* + * This is a bit complicated to understand, so worth a comment. What we assert + * here is that the GPU reset is not running on another thread in parallel. + * + * For this we trylock the read side of the reset semaphore, if that succeeds + * we know that the reset is not running in paralell. + * + * If the trylock fails we assert that we are either already holding the read + * side of the lock or are the reset thread itself and hold the write side of + * the lock. + */ + if (in_task()) { + if (down_read_trylock(&adev->reset_sem)) + up_read(&adev->reset_sem); + else + lockdep_assert_held(&adev->reset_sem); + } +#endif + return false; +} + /** * amdgpu_device_rreg - read a memory mapped IO or indirect register * @@ -340,7 +371,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, { uint32_t ret; - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return 0; if ((reg * 4) < adev->rmmio_size) { @@ -377,7 +408,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, */ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return 0; if (offset < adev->rmmio_size) @@ -402,7 +433,7 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) */ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if (offset < adev->rmmio_size) @@ -425,7 +456,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if ((reg * 4) < adev->rmmio_size) { @@ -452,14 +483,14 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if (amdgpu_sriov_fullaccess(adev) && adev->gfx.rlc.funcs && adev->gfx.rlc.funcs->is_rlcg_access_range) { if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) - return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v); + return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0); } else { writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); } @@ -476,7 +507,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, */ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return 0; if (index < adev->doorbell.num_doorbells) { @@ -499,7 +530,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if (index < adev->doorbell.num_doorbells) { @@ -520,7 +551,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) */ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return 0; if (index < adev->doorbell.num_doorbells) { @@ -543,7 +574,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if (index < adev->doorbell.num_doorbells) { @@ -1391,7 +1422,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, struct drm_device *dev = pci_get_drvdata(pdev); int r; - if (amdgpu_device_supports_atpx(dev) && state == VGA_SWITCHEROO_OFF) + if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) return; if (state == VGA_SWITCHEROO_ON) { @@ -2049,6 +2080,11 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); return r; } + + /*get pf2vf msg info at it's earliest time*/ + if (amdgpu_sriov_vf(adev)) + amdgpu_virt_init_data_exchange(adev); + } } @@ -2331,8 +2367,8 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) * Returns 0 on success, negative error code on failure. */ -static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, - enum amd_clockgating_state state) +int amdgpu_device_set_cg_state(struct amdgpu_device *adev, + enum amd_clockgating_state state) { int i, j, r; @@ -2343,6 +2379,10 @@ static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; if (!adev->ip_blocks[i].status.late_initialized) continue; + /* skip CG for GFX on S0ix */ + if (adev->in_s0ix && + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) + continue; /* skip CG for VCE/UVD, it's handled specially */ if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && @@ -2363,7 +2403,8 @@ static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, return 0; } -static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) +int amdgpu_device_set_pg_state(struct amdgpu_device *adev, + enum amd_powergating_state state) { int i, j, r; @@ -2374,6 +2415,10 @@ static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_power i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; if (!adev->ip_blocks[i].status.late_initialized) continue; + /* skip PG for GFX on S0ix */ + if (adev->in_s0ix && + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) + continue; /* skip CG for VCE/UVD, it's handled specially */ if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && @@ -2655,11 +2700,8 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) { int i, r; - if (adev->in_poweroff_reboot_com || - !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) { - amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); - amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); - } + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); for (i = adev->num_ip_blocks - 1; i >= 0; i--) { if (!adev->ip_blocks[i].status.valid) @@ -2699,6 +2741,9 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) { int i, r; + if (adev->in_s0ix) + amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); + for (i = adev->num_ip_blocks - 1; i >= 0; i--) { if (!adev->ip_blocks[i].status.valid) continue; @@ -2721,6 +2766,17 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) adev->ip_blocks[i].status.hw = false; continue; } + + /* skip suspend of gfx and psp for S0ix + * gfx is in gfxoff state, so on resume it will exit gfxoff just + * like at runtime. PSP is also part of the always on hardware + * so no need to suspend it. + */ + if (adev->in_s0ix && + (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) + continue; + /* XXX handle errors */ r = adev->ip_blocks[i].version->funcs->suspend(adev); /* XXX handle errors */ @@ -3086,8 +3142,9 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (adev->asic_reset_res) goto fail; - if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) - adev->mmhub.funcs->reset_ras_error_count(adev); + if (adev->mmhub.ras_funcs && + adev->mmhub.ras_funcs->reset_ras_error_count) + adev->mmhub.ras_funcs->reset_ras_error_count(adev); } else { task_barrier_full(&hive->tb); @@ -3197,7 +3254,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, struct drm_device *ddev = adev_to_drm(adev); struct pci_dev *pdev = adev->pdev; int r, i; - bool atpx = false; + bool px = false; u32 max_MBps; adev->shutdown = false; @@ -3353,29 +3410,14 @@ int amdgpu_device_init(struct amdgpu_device *adev, /* doorbell bar mapping and doorbell index init*/ amdgpu_device_doorbell_init(adev); - /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ - /* this will fail for cards that aren't VGA class devices, just - * ignore it */ - if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) - vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); - - if (amdgpu_device_supports_atpx(ddev)) - atpx = true; - if (amdgpu_has_atpx() && - (amdgpu_is_atpx_hybrid() || - amdgpu_has_atpx_dgpu_power_cntl()) && - !pci_is_thunderbolt_attached(adev->pdev)) - vga_switcheroo_register_client(adev->pdev, - &amdgpu_switcheroo_ops, atpx); - if (atpx) - vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); - if (amdgpu_emu_mode == 1) { /* post the asic on emulation mode */ emu_soc_asic_init(adev); goto fence_driver_init; } + amdgpu_reset_init(adev); + /* detect if we are with an SRIOV vbios */ amdgpu_device_detect_sriov_bios(adev); @@ -3564,6 +3606,19 @@ fence_driver_init: if (amdgpu_device_cache_pci_state(adev->pdev)) pci_restore_state(pdev); + /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ + /* this will fail for cards that aren't VGA class devices, just + * ignore it */ + if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) + vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); + + if (amdgpu_device_supports_px(ddev)) { + px = true; + vga_switcheroo_register_client(adev->pdev, + &amdgpu_switcheroo_ops, px); + vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); + } + if (adev->gmc.xgmi.pending_reset) queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, msecs_to_jiffies(AMDGPU_RESUME_MS)); @@ -3575,8 +3630,6 @@ release_ras_con: failed: amdgpu_vf_error_trans_all(adev); - if (atpx) - vga_switcheroo_fini_domain_pm_ops(adev->dev); failed_unmap: iounmap(adev->rmmio); @@ -3626,6 +3679,9 @@ void amdgpu_device_fini(struct amdgpu_device *adev) release_firmware(adev->firmware.gpu_info_fw); adev->firmware.gpu_info_fw = NULL; adev->accel_working = false; + + amdgpu_reset_fini(adev); + /* free i2c buses */ if (!amdgpu_device_has_dc_support(adev)) amdgpu_i2c_fini(adev); @@ -3635,13 +3691,10 @@ void amdgpu_device_fini(struct amdgpu_device *adev) kfree(adev->bios); adev->bios = NULL; - if (amdgpu_has_atpx() && - (amdgpu_is_atpx_hybrid() || - amdgpu_has_atpx_dgpu_power_cntl()) && - !pci_is_thunderbolt_attached(adev->pdev)) + if (amdgpu_device_supports_px(adev_to_drm(adev))) { vga_switcheroo_unregister_client(adev->pdev); - if (amdgpu_device_supports_atpx(adev_to_drm(adev))) vga_switcheroo_fini_domain_pm_ops(adev->dev); + } if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) vga_client_register(adev->pdev, NULL, NULL, NULL); iounmap(adev->rmmio); @@ -3674,14 +3727,9 @@ void amdgpu_device_fini(struct amdgpu_device *adev) */ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) { - struct amdgpu_device *adev; - struct drm_crtc *crtc; - struct drm_connector *connector; - struct drm_connector_list_iter iter; + struct amdgpu_device *adev = drm_to_adev(dev); int r; - adev = drm_to_adev(dev); - if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; @@ -3693,61 +3741,19 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) cancel_delayed_work_sync(&adev->delayed_init_work); - if (!amdgpu_device_has_dc_support(adev)) { - /* turn off display hw */ - drm_modeset_lock_all(dev); - drm_connector_list_iter_begin(dev, &iter); - drm_for_each_connector_iter(connector, &iter) - drm_helper_connector_dpms(connector, - DRM_MODE_DPMS_OFF); - drm_connector_list_iter_end(&iter); - drm_modeset_unlock_all(dev); - /* unpin the front buffers and cursors */ - list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { - struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); - struct drm_framebuffer *fb = crtc->primary->fb; - struct amdgpu_bo *robj; - - if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { - struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); - r = amdgpu_bo_reserve(aobj, true); - if (r == 0) { - amdgpu_bo_unpin(aobj); - amdgpu_bo_unreserve(aobj); - } - } - - if (fb == NULL || fb->obj[0] == NULL) { - continue; - } - robj = gem_to_amdgpu_bo(fb->obj[0]); - /* don't unpin kernel fb objects */ - if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { - r = amdgpu_bo_reserve(robj, true); - if (r == 0) { - amdgpu_bo_unpin(robj); - amdgpu_bo_unreserve(robj); - } - } - } - } - amdgpu_ras_suspend(adev); r = amdgpu_device_ip_suspend_phase1(adev); - amdgpu_amdkfd_suspend(adev, adev->in_runpm); + if (!adev->in_s0ix) + amdgpu_amdkfd_suspend(adev, adev->in_runpm); /* evict vram memory */ amdgpu_bo_evict_vram(adev); amdgpu_fence_driver_suspend(adev); - if (adev->in_poweroff_reboot_com || - !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) - r = amdgpu_device_ip_suspend_phase2(adev); - else - amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); + r = amdgpu_device_ip_suspend_phase2(adev); /* evict remaining vram memory * This second call to evict vram is to evict the gart page table * using the CPU. @@ -3769,16 +3775,13 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) */ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) { - struct drm_connector *connector; - struct drm_connector_list_iter iter; struct amdgpu_device *adev = drm_to_adev(dev); - struct drm_crtc *crtc; int r = 0; if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; - if (amdgpu_acpi_is_s0ix_supported(adev)) + if (adev->in_s0ix) amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); /* post card */ @@ -3803,50 +3806,17 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) queue_delayed_work(system_wq, &adev->delayed_init_work, msecs_to_jiffies(AMDGPU_RESUME_MS)); - if (!amdgpu_device_has_dc_support(adev)) { - /* pin cursors */ - list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { - struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); - - if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { - struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); - r = amdgpu_bo_reserve(aobj, true); - if (r == 0) { - r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); - if (r != 0) - dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); - amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); - amdgpu_bo_unreserve(aobj); - } - } - } + if (!adev->in_s0ix) { + r = amdgpu_amdkfd_resume(adev, adev->in_runpm); + if (r) + return r; } - r = amdgpu_amdkfd_resume(adev, adev->in_runpm); - if (r) - return r; /* Make sure IB tests flushed */ flush_delayed_work(&adev->delayed_init_work); - /* blat the mode back in */ - if (fbcon) { - if (!amdgpu_device_has_dc_support(adev)) { - /* pre DCE11 */ - drm_helper_resume_force_mode(dev); - - /* turn on display hw */ - drm_modeset_lock_all(dev); - - drm_connector_list_iter_begin(dev, &iter); - drm_for_each_connector_iter(connector, &iter) - drm_helper_connector_dpms(connector, - DRM_MODE_DPMS_ON); - drm_connector_list_iter_end(&iter); - - drm_modeset_unlock_all(dev); - } + if (fbcon) amdgpu_fbdev_set_suspend(adev, 0); - } drm_kms_helper_poll_enable(dev); @@ -4144,11 +4114,11 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, amdgpu_amdkfd_post_reset(adev); error: - amdgpu_virt_release_full_gpu(adev, true); if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { amdgpu_inc_vram_lost(adev); r = amdgpu_device_recover_vram(adev); } + amdgpu_virt_release_full_gpu(adev, true); return r; } @@ -4225,6 +4195,8 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) case CHIP_SIENNA_CICHLID: case CHIP_NAVY_FLOUNDER: case CHIP_DIMGREY_CAVEFISH: + case CHIP_VANGOGH: + case CHIP_ALDEBARAN: break; default: goto disabled; @@ -4279,11 +4251,15 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) } int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, - struct amdgpu_job *job, - bool *need_full_reset_arg) + struct amdgpu_reset_context *reset_context) { int i, r = 0; - bool need_full_reset = *need_full_reset_arg; + struct amdgpu_job *job = NULL; + bool need_full_reset = + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + + if (reset_context->reset_req_dev == adev) + job = reset_context->job; /* no need to dump if device is not in good state during probe period */ if (!adev->gmc.xgmi.pending_reset) @@ -4308,6 +4284,13 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if(job) drm_sched_increase_karma(&job->base); + r = amdgpu_reset_prepare_hwcontext(adev, reset_context); + /* If reset handler not implemented, continue; otherwise return */ + if (r == -ENOSYS) + r = 0; + else + return r; + /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ if (!amdgpu_sriov_vf(adev)) { @@ -4326,22 +4309,38 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if (need_full_reset) r = amdgpu_device_ip_suspend(adev); - - *need_full_reset_arg = need_full_reset; + if (need_full_reset) + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + else + clear_bit(AMDGPU_NEED_FULL_RESET, + &reset_context->flags); } return r; } -int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, - struct list_head *device_list_handle, - bool *need_full_reset_arg, - bool skip_hw_reset) +int amdgpu_do_asic_reset(struct list_head *device_list_handle, + struct amdgpu_reset_context *reset_context) { struct amdgpu_device *tmp_adev = NULL; - bool need_full_reset = *need_full_reset_arg, vram_lost = false; + bool need_full_reset, skip_hw_reset, vram_lost = false; int r = 0; + /* Try reset handler method first */ + tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, + reset_list); + r = amdgpu_reset_perform_reset(tmp_adev, reset_context); + /* If reset handler not implemented, continue; otherwise return */ + if (r == -ENOSYS) + r = 0; + else + return r; + + /* Reset handler not implemented, use the default method */ + need_full_reset = + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); + /* * ASIC reset has to be done on all XGMI hive nodes ASAP * to allow proper links negotiation in FW (within 1 sec) @@ -4378,9 +4377,9 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, if (!r && amdgpu_ras_intr_triggered()) { list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - if (tmp_adev->mmhub.funcs && - tmp_adev->mmhub.funcs->reset_ras_error_count) - tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); + if (tmp_adev->mmhub.ras_funcs && + tmp_adev->mmhub.ras_funcs->reset_ras_error_count) + tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev); } amdgpu_ras_intr_cleared(); @@ -4425,7 +4424,8 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, */ amdgpu_register_gpu_instance(tmp_adev); - if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) + if (!reset_context->hive && + tmp_adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_add_device(tmp_adev); r = amdgpu_device_ip_late_init(tmp_adev); @@ -4453,8 +4453,10 @@ int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, } /* Update PSP FW topology after reset */ - if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) - r = amdgpu_xgmi_update_topology(hive, tmp_adev); + if (reset_context->hive && + tmp_adev->gmc.xgmi.num_physical_nodes > 1) + r = amdgpu_xgmi_update_topology( + reset_context->hive, tmp_adev); } } @@ -4478,7 +4480,10 @@ out: } end: - *need_full_reset_arg = need_full_reset; + if (need_full_reset) + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + else + clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); return r; } @@ -4615,6 +4620,74 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) return 0; } +void amdgpu_device_recheck_guilty_jobs( + struct amdgpu_device *adev, struct list_head *device_list_handle, + struct amdgpu_reset_context *reset_context) +{ + int i, r = 0; + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + int ret = 0; + struct drm_sched_job *s_job; + + if (!ring || !ring->sched.thread) + continue; + + s_job = list_first_entry_or_null(&ring->sched.pending_list, + struct drm_sched_job, list); + if (s_job == NULL) + continue; + + /* clear job's guilty and depend the folowing step to decide the real one */ + drm_sched_reset_karma(s_job); + drm_sched_resubmit_jobs_ext(&ring->sched, 1); + + ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); + if (ret == 0) { /* timeout */ + DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", + ring->sched.name, s_job->id); + + /* set guilty */ + drm_sched_increase_karma(s_job); +retry: + /* do hw reset */ + if (amdgpu_sriov_vf(adev)) { + amdgpu_virt_fini_data_exchange(adev); + r = amdgpu_device_reset_sriov(adev, false); + if (r) + adev->asic_reset_res = r; + } else { + clear_bit(AMDGPU_SKIP_HW_RESET, + &reset_context->flags); + r = amdgpu_do_asic_reset(device_list_handle, + reset_context); + if (r && r == -EAGAIN) + goto retry; + } + + /* + * add reset counter so that the following + * resubmitted job could flush vmid + */ + atomic_inc(&adev->gpu_reset_counter); + continue; + } + + /* got the hw fence, signal finished fence */ + atomic_dec(ring->sched.score); + dma_fence_get(&s_job->s_fence->finished); + dma_fence_signal(&s_job->s_fence->finished); + dma_fence_put(&s_job->s_fence->finished); + + /* remove node from list and free the job */ + spin_lock(&ring->sched.job_list_lock); + list_del_init(&s_job->list); + spin_unlock(&ring->sched.job_list_lock); + ring->sched.ops->free_job(s_job); + } +} + /** * amdgpu_device_gpu_recover - reset the asic and recover scheduler * @@ -4630,13 +4703,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job) { struct list_head device_list, *device_list_handle = NULL; - bool need_full_reset = false; bool job_signaled = false; struct amdgpu_hive_info *hive = NULL; struct amdgpu_device *tmp_adev = NULL; int i, r = 0; bool need_emergency_restart = false; bool audio_suspended = false; + int tmp_vram_lost_counter; + struct amdgpu_reset_context reset_context; + + memset(&reset_context, 0, sizeof(reset_context)); /* * Special case: RAS triggered and full reset isn't supported @@ -4677,6 +4753,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, mutex_lock(&hive->hive_lock); } + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + reset_context.job = job; + reset_context.hive = hive; + clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + /* * lock the device before we try to operate the linked list * if didn't get the device lock, don't touch the linked list since @@ -4777,9 +4859,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - r = amdgpu_device_pre_asic_reset(tmp_adev, - (tmp_adev == adev) ? job : NULL, - &need_full_reset); + r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); /*TODO Should we stop ?*/ if (r) { dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", @@ -4788,6 +4868,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ } } + tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); /* Actual ASIC resets if needed.*/ /* TODO Implement XGMI hive reset logic for SRIOV */ if (amdgpu_sriov_vf(adev)) { @@ -4795,7 +4876,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ if (r) adev->asic_reset_res = r; } else { - r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); + r = amdgpu_do_asic_reset(device_list_handle, &reset_context); if (r && r == -EAGAIN) goto retry; } @@ -4805,6 +4886,18 @@ skip_hw_reset: /* Post ASIC reset for all devs .*/ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + /* + * Sometimes a later bad compute job can block a good gfx job as gfx + * and compute ring share internal GC HW mutually. We add an additional + * guilty jobs recheck step to find the real guilty job, it synchronously + * submits and pends for the first job being signaled. If it gets timeout, + * we identify it as a real guilty job. + */ + if (amdgpu_gpu_recovery == 2 && + !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) + amdgpu_device_recheck_guilty_jobs( + tmp_adev, device_list_handle, &reset_context); + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; @@ -5148,12 +5241,14 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); int r, i; - bool need_full_reset = true; + struct amdgpu_reset_context reset_context; u32 memsize; struct list_head device_list; DRM_INFO("PCI error: slot reset callback!!\n"); + memset(&reset_context, 0, sizeof(reset_context)); + INIT_LIST_HEAD(&device_list); list_add_tail(&adev->reset_list, &device_list); @@ -5176,13 +5271,18 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) goto out; } + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); + adev->in_pci_err_recovery = true; - r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); + r = amdgpu_device_pre_asic_reset(adev, &reset_context); adev->in_pci_err_recovery = false; if (r) goto out; - r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); + r = amdgpu_do_asic_reset(&device_list, &reset_context); out: if (!r) { |