From dac6b80818ac2353631c5a33d140d8d5508e2957 Mon Sep 17 00:00:00 2001 From: Victor Zhao Date: Thu, 28 Jul 2022 10:39:23 +0800 Subject: drm/amdgpu: let mode2 reset fallback to default when failure - introduce AMDGPU_SKIP_MODE2_RESET flag - let mode2 reset fall back to the default reset method if it fails v2: move this part out from the asic specific part Signed-off-by: Victor Zhao Acked-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e8a0b19b7398..c1ec4b653ca4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5148,6 +5148,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, reset_context->job = job; reset_context->hive = hive; + /* * Build list of devices to reset. * In case we are in XGMI hive mode, resort the device list @@ -5267,8 +5268,11 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ amdgpu_ras_resume(adev); } else { r = amdgpu_do_asic_reset(device_list_handle, reset_context); - if (r && r == -EAGAIN) + if (r && r == -EAGAIN) { + set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags); + adev->asic_reset_res = 0; goto retry; + } } skip_hw_reset: @@ -5699,6 +5703,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) reset_context.reset_req_dev = adev; set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); + set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags); adev->no_hw_access = true; r = amdgpu_device_pre_asic_reset(adev, &reset_context); -- cgit v1.2.3-70-g09d2 From 72fadb13674f807f10a168fb7d020dde58ce6b0b Mon Sep 17 00:00:00 2001 From: Victor Zhao Date: Fri, 24 Jun 2022 11:59:21 +0800 Subject: drm/amdgpu: revert context to stop engine before mode2 reset For some hangs caused by slow tests, the engine cannot be stopped, which may cause a resume failure after reset.
In this case, force halt engine by reverting context addresses Signed-off-by: Victor Zhao Acked-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h | 1 + drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c | 36 +++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c | 2 ++ 4 files changed, 40 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c1ec4b653ca4..c84fdef0ac45 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5039,6 +5039,7 @@ static void amdgpu_device_recheck_guilty_jobs( /* set guilty */ drm_sched_increase_karma(s_job); + amdgpu_reset_prepare_hwcontext(adev, reset_context); retry: /* do hw reset */ if (amdgpu_sriov_vf(adev)) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h index f8036f2b100e..c7b44aeb671b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h @@ -37,6 +37,7 @@ struct amdgpu_gfxhub_funcs { void (*utcl2_harvest)(struct amdgpu_device *adev); void (*mode2_save_regs)(struct amdgpu_device *adev); void (*mode2_restore_regs)(struct amdgpu_device *adev); + void (*halt)(struct amdgpu_device *adev); }; struct amdgpu_gfxhub { diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c index 51cf8acd2d79..8cf53e039c11 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c @@ -646,6 +646,41 @@ static void gfxhub_v2_1_restore_regs(struct amdgpu_device *adev) WREG32_SOC15(GC, 0, mmGCMC_VM_MX_L1_TLB_CNTL, adev->gmc.MC_VM_MX_L1_TLB_CNTL); } +static void gfxhub_v2_1_halt(struct amdgpu_device *adev) +{ + struct amdgpu_vmhub *hub = &adev->vmhub[AMDGPU_GFXHUB_0]; + int i; + uint32_t tmp; + int time = 1000; + + gfxhub_v2_1_set_fault_enable_default(adev, false); + + for (i = 0; i <= 14; i++) { + WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, + i * hub->ctx_addr_distance, ~0); + WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, + i * hub->ctx_addr_distance, ~0); + WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_LO32, + i * hub->ctx_addr_distance, + 0); + WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_HI32, + i * hub->ctx_addr_distance, + 0); + } + tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2); + while ((tmp & (GRBM_STATUS2__EA_BUSY_MASK | + GRBM_STATUS2__EA_LINK_BUSY_MASK)) != 0 && + time) { + udelay(100); + time--; + tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2); + } + + if (!time) { + DRM_WARN("failed to wait for GRBM(EA) idle\n"); + } +} + const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = { .get_fb_location = gfxhub_v2_1_get_fb_location, .get_mc_fb_offset = gfxhub_v2_1_get_mc_fb_offset, @@ -658,4 +693,5 @@ const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = { .utcl2_harvest = gfxhub_v2_1_utcl2_harvest, .mode2_save_regs = gfxhub_v2_1_save_regs, .mode2_restore_regs = gfxhub_v2_1_restore_regs, + .halt = gfxhub_v2_1_halt, }; diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c index 2dbbda17848b..7aa570c1ce4a 100644 --- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c @@ -97,6 +97,8 @@ sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control 
*reset_ctl, if (!amdgpu_sriov_vf(adev)) { if (adev->gfxhub.funcs->mode2_save_regs) adev->gfxhub.funcs->mode2_save_regs(adev); + if (adev->gfxhub.funcs->halt) + adev->gfxhub.funcs->halt(adev); r = sienna_cichlid_mode2_suspend_ip(adev); } -- cgit v1.2.3-70-g09d2 From 0ad7347a64ac4baec1786810709eebedc5f823d5 Mon Sep 17 00:00:00 2001 From: André Almeida Date: Wed, 10 Aug 2022 20:28:55 -0300 Subject: drm/amd: Add detailed GFXOFF stats to debugfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add debugfs interface to log GFXOFF statistics: - Read amdgpu_gfxoff_count to get the total GFXOFF entry count at the time of query since system power-up - Write 1 to amdgpu_gfxoff_residency to start logging, and 0 to stop. Read it to get average GFXOFF residency % multiplied by 100 during the last logging interval. Both features are designed to keep the values persistent between suspends. Signed-off-by: André Almeida Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 168 ++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 + drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 39 ++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 6 + drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 45 +++++++ drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 3 + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 33 +++++ drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 22 ++++ drivers/gpu/drm/amd/pm/swsmu/smu_internal.h | 3 + 9 files changed, 321 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 4e54569fa223..6066aebf491c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1042,6 +1042,157 @@ err: return r; } +/** + * amdgpu_debugfs_gfxoff_residency_read - Read GFXOFF residency + * + * @f: open file handle + * @buf: User buffer to store read data in + * @size: Number of bytes to read + * @pos: Offset to seek to + * + * Read the last residency value logged. It doesn't auto update, one needs to + * stop logging before getting the current value. 
+ */ +static ssize_t amdgpu_debugfs_gfxoff_residency_read(struct file *f, char __user *buf, + size_t size, loff_t *pos) +{ + struct amdgpu_device *adev = file_inode(f)->i_private; + ssize_t result = 0; + int r; + + if (size & 0x3 || *pos & 0x3) + return -EINVAL; + + r = pm_runtime_get_sync(adev_to_drm(adev)->dev); + if (r < 0) { + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); + return r; + } + + while (size) { + uint32_t value; + + r = amdgpu_get_gfx_off_residency(adev, &value); + if (r) + goto out; + + r = put_user(value, (uint32_t *)buf); + if (r) + goto out; + + result += 4; + buf += 4; + *pos += 4; + size -= 4; + } + + r = result; +out: + pm_runtime_mark_last_busy(adev_to_drm(adev)->dev); + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); + + return r; +} + +/** + * amdgpu_debugfs_gfxoff_residency_write - Log GFXOFF Residency + * + * @f: open file handle + * @buf: User buffer to write data from + * @size: Number of bytes to write + * @pos: Offset to seek to + * + * Write a 32-bit non-zero to start logging; write a 32-bit zero to stop + */ +static ssize_t amdgpu_debugfs_gfxoff_residency_write(struct file *f, const char __user *buf, + size_t size, loff_t *pos) +{ + struct amdgpu_device *adev = file_inode(f)->i_private; + ssize_t result = 0; + int r; + + if (size & 0x3 || *pos & 0x3) + return -EINVAL; + + r = pm_runtime_get_sync(adev_to_drm(adev)->dev); + if (r < 0) { + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); + return r; + } + + while (size) { + u32 value; + + r = get_user(value, (uint32_t *)buf); + if (r) + goto out; + + amdgpu_set_gfx_off_residency(adev, value ? true : false); + + result += 4; + buf += 4; + *pos += 4; + size -= 4; + } + + r = result; +out: + pm_runtime_mark_last_busy(adev_to_drm(adev)->dev); + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); + + return r; +} + + +/** + * amdgpu_debugfs_gfxoff_count_read - Read GFXOFF entry count + * + * @f: open file handle + * @buf: User buffer to store read data in + * @size: Number of bytes to read + * @pos: Offset to seek to + */ +static ssize_t amdgpu_debugfs_gfxoff_count_read(struct file *f, char __user *buf, + size_t size, loff_t *pos) +{ + struct amdgpu_device *adev = file_inode(f)->i_private; + ssize_t result = 0; + int r; + + if (size & 0x3 || *pos & 0x3) + return -EINVAL; + + r = pm_runtime_get_sync(adev_to_drm(adev)->dev); + if (r < 0) { + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); + return r; + } + + while (size) { + u64 value = 0; + + r = amdgpu_get_gfx_off_entrycount(adev, &value); + if (r) + goto out; + + r = put_user(value, (u64 *)buf); + if (r) + goto out; + + result += 4; + buf += 4; + *pos += 4; + size -= 4; + } + + r = result; +out: + pm_runtime_mark_last_busy(adev_to_drm(adev)->dev); + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); + + return r; +} + /** * amdgpu_debugfs_gfxoff_write - Enable/disable GFXOFF * @@ -1249,6 +1400,19 @@ static const struct file_operations amdgpu_debugfs_gfxoff_status_fops = { .llseek = default_llseek }; +static const struct file_operations amdgpu_debugfs_gfxoff_count_fops = { + .owner = THIS_MODULE, + .read = amdgpu_debugfs_gfxoff_count_read, + .llseek = default_llseek +}; + +static const struct file_operations amdgpu_debugfs_gfxoff_residency_fops = { + .owner = THIS_MODULE, + .read = amdgpu_debugfs_gfxoff_residency_read, + .write = amdgpu_debugfs_gfxoff_residency_write, + .llseek = default_llseek +}; + static const struct file_operations *debugfs_regs[] = { &amdgpu_debugfs_regs_fops, &amdgpu_debugfs_regs2_fops, @@ -1261,6 +1425,8 @@ static const 
struct file_operations *debugfs_regs[] = { &amdgpu_debugfs_gpr_fops, &amdgpu_debugfs_gfxoff_fops, &amdgpu_debugfs_gfxoff_status_fops, + &amdgpu_debugfs_gfxoff_count_fops, + &amdgpu_debugfs_gfxoff_residency_fops, }; static const char *debugfs_regs_names[] = { @@ -1275,6 +1441,8 @@ static const char *debugfs_regs_names[] = { "amdgpu_gpr", "amdgpu_gfxoff", "amdgpu_gfxoff_status", + "amdgpu_gfxoff_count", + "amdgpu_gfxoff_residency", }; /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c84fdef0ac45..9cb6340c0247 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3577,6 +3577,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); adev->gfx.gfx_off_req_count = 1; + adev->gfx.gfx_off_residency = 0; + adev->gfx.gfx_off_entrycount = 0; adev->pm.ac_power = power_supply_is_system_supplied() > 0; atomic_set(&adev->throttling_logging_enabled, 1); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 454a78ba60d4..ceb91469958a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -610,6 +610,45 @@ unlock: mutex_unlock(&adev->gfx.gfx_off_mutex); } +int amdgpu_set_gfx_off_residency(struct amdgpu_device *adev, bool value) +{ + int r = 0; + + mutex_lock(&adev->gfx.gfx_off_mutex); + + r = amdgpu_dpm_set_residency_gfxoff(adev, value); + + mutex_unlock(&adev->gfx.gfx_off_mutex); + + return r; +} + +int amdgpu_get_gfx_off_residency(struct amdgpu_device *adev, u32 *value) +{ + int r = 0; + + mutex_lock(&adev->gfx.gfx_off_mutex); + + r = amdgpu_dpm_get_residency_gfxoff(adev, value); + + mutex_unlock(&adev->gfx.gfx_off_mutex); + + return r; +} + +int amdgpu_get_gfx_off_entrycount(struct amdgpu_device *adev, u64 *value) +{ + int r = 0; + + mutex_lock(&adev->gfx.gfx_off_mutex); + + r = amdgpu_dpm_get_entrycount_gfxoff(adev, value); + + mutex_unlock(&adev->gfx.gfx_off_mutex); + + return r; +} + int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 23a696d38390..1b8b4a5270c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -336,6 +336,8 @@ struct amdgpu_gfx { struct mutex gfx_off_mutex; uint32_t gfx_off_req_count; /* default 1, enable gfx off: dec 1, disable gfx off: add 1 */ struct delayed_work gfx_off_delay_work; + uint32_t gfx_off_residency; + uint64_t gfx_off_entrycount; /* pipe reservation */ struct mutex pipe_reserve_mutex; @@ -407,6 +409,10 @@ bool amdgpu_gfx_is_me_queue_enabled(struct amdgpu_device *adev, int me, void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable); int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value); int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); +void amdgpu_gfx_ras_fini(struct amdgpu_device *adev); +int amdgpu_get_gfx_off_entrycount(struct amdgpu_device *adev, u64 *value); +int amdgpu_get_gfx_off_residency(struct amdgpu_device *adev, u32 *residency); +int amdgpu_set_gfx_off_residency(struct amdgpu_device *adev, bool value); int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev, void *err_data, struct amdgpu_iv_entry *entry); diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c index 956b6ce81c84..1b300c569faf 100644 --- 
a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c @@ -668,6 +668,51 @@ int amdgpu_dpm_wait_for_event(struct amdgpu_device *adev, return ret; } +int amdgpu_dpm_set_residency_gfxoff(struct amdgpu_device *adev, bool value) +{ + struct smu_context *smu = adev->powerplay.pp_handle; + int ret = 0; + + if (!is_support_sw_smu(adev)) + return -EOPNOTSUPP; + + mutex_lock(&adev->pm.mutex); + ret = smu_set_residency_gfxoff(smu, value); + mutex_unlock(&adev->pm.mutex); + + return ret; +} + +int amdgpu_dpm_get_residency_gfxoff(struct amdgpu_device *adev, u32 *value) +{ + struct smu_context *smu = adev->powerplay.pp_handle; + int ret = 0; + + if (!is_support_sw_smu(adev)) + return -EOPNOTSUPP; + + mutex_lock(&adev->pm.mutex); + ret = smu_get_residency_gfxoff(smu, value); + mutex_unlock(&adev->pm.mutex); + + return ret; +} + +int amdgpu_dpm_get_entrycount_gfxoff(struct amdgpu_device *adev, u64 *value) +{ + struct smu_context *smu = adev->powerplay.pp_handle; + int ret = 0; + + if (!is_support_sw_smu(adev)) + return -EOPNOTSUPP; + + mutex_lock(&adev->pm.mutex); + ret = smu_get_entrycount_gfxoff(smu, value); + mutex_unlock(&adev->pm.mutex); + + return ret; +} + int amdgpu_dpm_get_status_gfxoff(struct amdgpu_device *adev, uint32_t *value) { struct smu_context *smu = adev->powerplay.pp_handle; diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h index 65624d091ed2..cb5b9df78b4d 100644 --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h @@ -435,6 +435,9 @@ int amdgpu_dpm_set_soft_freq_range(struct amdgpu_device *adev, int amdgpu_dpm_write_watermarks_table(struct amdgpu_device *adev); int amdgpu_dpm_wait_for_event(struct amdgpu_device *adev, enum smu_event_type event, uint64_t event_arg); +int amdgpu_dpm_get_residency_gfxoff(struct amdgpu_device *adev, u32 *value); +int amdgpu_dpm_set_residency_gfxoff(struct amdgpu_device *adev, bool value); +int amdgpu_dpm_get_entrycount_gfxoff(struct amdgpu_device *adev, u64 *value); int amdgpu_dpm_get_status_gfxoff(struct amdgpu_device *adev, uint32_t *value); uint64_t amdgpu_dpm_get_thermal_throttling_counter(struct amdgpu_device *adev); void amdgpu_dpm_gfx_state_change(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 7510d470b864..55b7910b4385 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -90,6 +90,30 @@ static int smu_sys_set_pp_feature_mask(void *handle, return smu_set_pp_feature_mask(smu, new_mask); } +int smu_set_residency_gfxoff(struct smu_context *smu, bool value) +{ + if (!smu->ppt_funcs->set_gfx_off_residency) + return -EINVAL; + + return smu_set_gfx_off_residency(smu, value); +} + +int smu_get_residency_gfxoff(struct smu_context *smu, u32 *value) +{ + if (!smu->ppt_funcs->get_gfx_off_residency) + return -EINVAL; + + return smu_get_gfx_off_residency(smu, value); +} + +int smu_get_entrycount_gfxoff(struct smu_context *smu, u64 *value) +{ + if (!smu->ppt_funcs->get_gfx_off_entrycount) + return -EINVAL; + + return smu_get_gfx_off_entrycount(smu, value); +} + int smu_get_status_gfxoff(struct smu_context *smu, uint32_t *value) { if (!smu->ppt_funcs->get_gfx_off_status) @@ -1576,6 +1600,7 @@ static int smu_suspend(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct smu_context *smu = adev->powerplay.pp_handle; int ret; + uint64_t count; if (amdgpu_sriov_vf(adev)&& 
!amdgpu_sriov_is_pp_one_vf(adev)) return 0; @@ -1593,6 +1618,14 @@ static int smu_suspend(void *handle) smu_set_gfx_cgpg(smu, false); + /* + * pwfw resets entrycount when device is suspended, so we save the + * last value to be used when we resume to keep it consistent + */ + ret = smu_get_entrycount_gfxoff(smu, &count); + if (!ret) + adev->gfx.gfx_off_entrycount = count; + return 0; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h index b81c657c7386..e2fa3b066b96 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h @@ -1111,6 +1111,22 @@ struct pptable_funcs { */ uint32_t (*get_gfx_off_status)(struct smu_context *smu); + /** + * @gfx_off_entrycount: total GFXOFF entry count at the time of + * query since system power-up + */ + u32 (*get_gfx_off_entrycount)(struct smu_context *smu, uint64_t *entrycount); + + /** + * @set_gfx_off_residency: set 1 to start logging, 0 to stop logging + */ + u32 (*set_gfx_off_residency)(struct smu_context *smu, bool start); + + /** + * @get_gfx_off_residency: Average GFXOFF residency % during the logging interval + */ + u32 (*get_gfx_off_residency)(struct smu_context *smu, uint32_t *residency); + /** * @register_irq_handler: Register interupt request handlers. */ @@ -1454,6 +1470,12 @@ int smu_set_ac_dc(struct smu_context *smu); int smu_allow_xgmi_power_down(struct smu_context *smu, bool en); +int smu_get_entrycount_gfxoff(struct smu_context *smu, u64 *value); + +int smu_get_residency_gfxoff(struct smu_context *smu, u32 *value); + +int smu_set_residency_gfxoff(struct smu_context *smu, bool value); + int smu_get_status_gfxoff(struct smu_context *smu, uint32_t *value); int smu_handle_passthrough_sbr(struct smu_context *smu, bool enable); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_internal.h b/drivers/gpu/drm/amd/pm/swsmu/smu_internal.h index 7469bbfce1fb..ceb13c838067 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_internal.h +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_internal.h @@ -47,6 +47,9 @@ #define smu_notify_memory_pool_location(smu) smu_ppt_funcs(notify_memory_pool_location, 0, smu) #define smu_gfx_off_control(smu, enable) smu_ppt_funcs(gfx_off_control, 0, smu, enable) #define smu_get_gfx_off_status(smu) smu_ppt_funcs(get_gfx_off_status, 0, smu) +#define smu_get_gfx_off_entrycount(smu, value) smu_ppt_funcs(get_gfx_off_entrycount, 0, smu, value) +#define smu_get_gfx_off_residency(smu, value) smu_ppt_funcs(get_gfx_off_residency, 0, smu, value) +#define smu_set_gfx_off_residency(smu, value) smu_ppt_funcs(set_gfx_off_residency, 0, smu, value) #define smu_set_last_dcef_min_deep_sleep_clk(smu) smu_ppt_funcs(set_last_dcef_min_deep_sleep_clk, 0, smu) #define smu_system_features_control(smu, en) smu_ppt_funcs(system_features_control, 0, smu, en) #define smu_init_max_sustainable_clocks(smu) smu_ppt_funcs(init_max_sustainable_clocks, 0, smu) -- cgit v1.2.3-70-g09d2 From 9dfa4860efb8cf20c12b9b65ec66cafa6e93f3a6 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Fri, 12 Aug 2022 14:34:35 +0800 Subject: drm/amdgpu: fix hive reference leak when adding xgmi device Only amdgpu_get_xgmi_hive but no amdgpu_put_xgmi_hive which will leak the hive reference. 
Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9cb6340c0247..4cd87dbb108c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2456,12 +2456,14 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (!hive->reset_domain || !amdgpu_reset_get_reset_domain(hive->reset_domain)) { r = -ENOENT; + amdgpu_put_xgmi_hive(hive); goto init_failed; } /* Drop the early temporary reset domain we created for device */ amdgpu_reset_put_reset_domain(adev->reset_domain); adev->reset_domain = hive->reset_domain; + amdgpu_put_xgmi_hive(hive); } } -- cgit v1.2.3-70-g09d2 From 947f63f17e1d91cee19f1bc071e825b28ce4da98 Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Thu, 18 Aug 2022 14:13:52 -0400 Subject: drm/amdgpu: Remove the additional kfd pre reset call for sriov The additional call is caused by a merge conflict. Reviewed-by: Felix Kuehling Signed-off-by: shaoyunl Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 4cd87dbb108c..d7eb23b8d692 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4417,8 +4417,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, retry: amdgpu_amdkfd_pre_reset(adev); - amdgpu_amdkfd_pre_reset(adev); - if (from_hypervisor) r = amdgpu_virt_request_full_gpu(adev, true); else -- cgit v1.2.3-70-g09d2 From d3ef9d57f24eba16cbce8f304c9190528e842227 Mon Sep 17 00:00:00 2001 From: Chengming Gui Date: Fri, 5 Aug 2022 11:00:56 +0800 Subject: drm/amd/amdgpu: avoid soft reset check when gpu recovery disabled Avoid the soft reset, and even the IP hang check (ring/IB test), when GPU recovery is disabled. 
v2: add missing "}" Signed-off-by: Chengming Gui Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d7eb23b8d692..ce7d117efdb5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4511,14 +4511,15 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev) */ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) { - if (!amdgpu_device_ip_check_soft_reset(adev)) { - dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); - return false; - } if (amdgpu_gpu_recovery == 0) goto disabled; + if (!amdgpu_device_ip_check_soft_reset(adev)) { + dev_info(adev->dev,"Timeout, but no hardware hang detected.\n"); + return false; + } + if (amdgpu_sriov_vf(adev)) return true; @@ -4643,7 +4644,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if (!need_full_reset) need_full_reset = amdgpu_device_ip_need_full_reset(adev); - if (!need_full_reset) { + if (!need_full_reset && amdgpu_gpu_recovery) { amdgpu_device_ip_pre_soft_reset(adev); r = amdgpu_device_ip_soft_reset(adev); amdgpu_device_ip_post_soft_reset(adev); -- cgit v1.2.3-70-g09d2 From ab23c5b9c74d682d81b0903247817b2bb20528bb Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Thu, 25 Aug 2022 15:42:08 -0500 Subject: drm/amdgpu: ensure no PCIe peer access for CPU XGMI iolinks [Why] Devices with CPU XGMI iolink do not support PCIe peer access. Signed-off-by: Alex Sierra Acked-by: Alex Deucher Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ce7d117efdb5..afaa1056e039 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5532,7 +5532,8 @@ bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); resource_size_t aper_limit = adev->gmc.aper_base + adev->gmc.aper_size - 1; - bool p2p_access = !(pci_p2pdma_distance_many(adev->pdev, + bool p2p_access = !adev->gmc.xgmi.connected_to_cpu && + !(pci_p2pdma_distance_many(adev->pdev, &peer_adev->dev, 1, true) < 0); return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && -- cgit v1.2.3-70-g09d2 From fac53471d0ea9693d314aa2df08d62b2e7e3a0f8 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Thu, 18 Aug 2022 16:46:59 +0800 Subject: drm/amdgpu: TA unload messages are not actually sent to psp when amdgpu is uninstalled V1: The psp_cmd_submit_buf function is called by psp_hw_fini to send TA unload messages to psp to terminate ras, asd and tmr. But when amdgpu is uninstalled, drm_dev_unplug is called earlier than psp_hw_fini in amdgpu_pci_remove, the calling order as follows: static void amdgpu_pci_remove(struct pci_dev *pdev) { drm_dev_unplug ...... amdgpu_driver_unload_kms->amdgpu_device_fini_hw->... ->.hw_fini->psp_hw_fini->... ->psp_ta_unload->psp_cmd_submit_buf ...... } The program will return when calling drm_dev_enter in psp_cmd_submit_buf. 
So the call to drm_dev_enter in psp_cmd_submit_buf should be removed, so that the TA unload messages can be sent to the psp when amdgpu is uninstalled. V2: 1. Restore psp_cmd_submit_buf to its original code. 2. Move drm_dev_unplug call after amdgpu_driver_unload_kms in amdgpu_pci_remove. 3. Since amdgpu_device_fini_hw is called by amdgpu_driver_unload_kms, remove the unplug check to release device mmio resource in amdgpu_device_fini_hw before calling drm_dev_unplug. Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index afaa1056e039..62b26f0e37b0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3969,8 +3969,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) amdgpu_gart_dummy_page_fini(adev); - if (drm_dev_is_unplugged(adev_to_drm(adev))) - amdgpu_device_unmap_mmio(adev); + amdgpu_device_unmap_mmio(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index de7144b06e93..728a0933ea6f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2181,8 +2181,6 @@ amdgpu_pci_remove(struct pci_dev *pdev) struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); - drm_dev_unplug(dev); - if (adev->pm.rpm_mode != AMDGPU_RUNPM_NONE) { pm_runtime_get_sync(dev->dev); pm_runtime_forbid(dev->dev); @@ -2190,6 +2188,8 @@ amdgpu_pci_remove(struct pci_dev *pdev) amdgpu_driver_unload_kms(dev); + drm_dev_unplug(dev); + /* * Flush any in flight DMA operations from device. * Clear the Bus Master Enable bit and then wait on the PCIe Device -- cgit v1.2.3-70-g09d2
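Usage note on the GFXOFF debugfs files added in the "drm/amd: Add detailed GFXOFF stats to debugfs" patch above: amdgpu_gfxoff_count and amdgpu_gfxoff_residency exchange raw binary words via get_user()/put_user() rather than text. A non-zero 32-bit write to the residency file starts logging and a zero write stops it; a subsequent read returns the average residency percentage multiplied by 100. The sketch below is illustrative only and not part of the patches: it assumes debugfs is mounted at /sys/kernel/debug, that the GPU is DRI instance 0 (the instance number varies per system), and that it runs as root.

/*
 * Illustrative userspace sketch (not from the patches above): start GFXOFF
 * residency logging, wait, stop logging, then read back the last residency
 * value and the total GFXOFF entry count. Paths are assumptions.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define GFXOFF_RESIDENCY "/sys/kernel/debug/dri/0/amdgpu_gfxoff_residency"
#define GFXOFF_COUNT     "/sys/kernel/debug/dri/0/amdgpu_gfxoff_count"

int main(void)
{
	uint32_t start = 1, stop = 0, residency = 0;
	uint64_t count = 0;
	int fd;

	fd = open(GFXOFF_RESIDENCY, O_RDWR);
	if (fd < 0)
		return 1;
	write(fd, &start, sizeof(start));        /* non-zero: start logging */
	sleep(5);                                /* measurement window */
	lseek(fd, 0, SEEK_SET);
	write(fd, &stop, sizeof(stop));          /* zero: stop logging */
	lseek(fd, 0, SEEK_SET);
	read(fd, &residency, sizeof(residency)); /* residency % * 100 */
	close(fd);

	fd = open(GFXOFF_COUNT, O_RDONLY);
	if (fd < 0)
		return 1;
	/*
	 * The count handler put_user()s a full 64-bit value but advances the
	 * buffer by 4 bytes per loop iteration, so request a single 4-byte
	 * word into an 8-byte variable to read the count exactly once.
	 */
	read(fd, &count, 4);
	close(fd);

	printf("GFXOFF residency: %u.%02u%%, entry count: %llu\n",
	       residency / 100, residency % 100,
	       (unsigned long long)count);
	return 0;
}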