From 6ff7fddbd12064dc9de03e0c1ad03e13f6ba7af8 Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Tue, 30 Nov 2021 11:06:40 +0800 Subject: drm/amdgpu: add support for SMU debug option SMU firmware expects the driver maintains error context and doesn't interact with SMU any more when SMU errors occurred. That will aid in debugging SMU firmware issues. Add SMU debug option support for this request, it can be enabled or disabled via amdgpu_smu_debug debugfs file. Use a 32-bit mask to indicate corresponding debug modes. Currently, only one mode(HALT_ON_ERROR) is supported. When enabled, it brings hardware to a kind of halt state so that no one can touch it any more in the envent of SMU errors. The dirver interacts with SMU via sending messages. And threre are three ways to sending messages to SMU in current implementation. Handle them respectively as following: 1, smu_cmn_send_smc_msg_with_param() for normal timeout cases Halt on any error. 2, smu_cmn_send_msg_without_waiting()/smu_cmn_wait_for_response() for longer timeout cases Halt on errors apart from ETIME. Otherwise this way won't work. Let the user handle ETIME error in such a case. 3, smu_cmn_send_msg_without_waiting() for no waiting cases Halt on errors apart from ETIME. Otherwise second way won't work. == Command Guide == 1, enable HALT_ON_ERROR mode # echo 0x1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug 2, disable HALT_ON_ERROR mode # echo 0x0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug v5: - Use bit mask to allow more debug features.(Evan) - Use WRAN() instead of BUG().(Evan) v4: - Set to halt state instead of a simple hang.(Christian) v3: - Use debugfs_create_bool().(Christian) - Put variable into smu_context struct. - Don't resend command when timeout. v2: - Resend command when timeout.(Lijo) - Use debugfs file instead of module parameter. Signed-off-by: Lang Yu Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 164d6a9e9fbb..9dfccb20fedd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1618,6 +1618,9 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev) if (!debugfs_initialized()) return 0; + debugfs_create_x32("amdgpu_smu_debug", 0600, root, + &adev->smu.smu_debug_mask); + ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev, &fops_ib_preempt); if (IS_ERR(ent)) { -- cgit v1.2.3-70-g09d2 From 7e31a8585b79a4d67e7fefdb6428054d18ddd339 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Mon, 13 Dec 2021 11:37:56 +0800 Subject: drm/amdgpu: move smu_debug_mask to a more proper place As the smu_context will be invisible from outside(of power). Also, the smu_debug_mask can be shared around all power code instead of some specific framework(swSMU) only. Signed-off-by: Evan Quan Reviewed-by: Lijo Lazar Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 +- drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 8 ++++++++ drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h | 8 -------- drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 16 +++++++++------- 4 files changed, 18 insertions(+), 16 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 9dfccb20fedd..25e2e5bf90eb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1619,7 +1619,7 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev) return 0; debugfs_create_x32("amdgpu_smu_debug", 0600, root, - &adev->smu.smu_debug_mask); + &adev->pm.smu_debug_mask); ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev, &fops_ib_preempt); diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h index 16e3f72d31b9..c464a045000d 100644 --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h @@ -423,6 +423,9 @@ enum ip_power_state { POWER_STATE_OFF, }; +/* Used to mask smu debug modes */ +#define SMU_DEBUG_HALT_ON_ERROR 0x1 + struct amdgpu_pm { struct mutex mutex; u32 current_sclk; @@ -460,6 +463,11 @@ struct amdgpu_pm { struct list_head pm_attr_list; atomic_t pwr_state[AMD_IP_BLOCK_TYPE_NUM]; + + /* + * 0 = disabled (default), otherwise enable corresponding debug mode + */ + uint32_t smu_debug_mask; }; #define R600_SSTU_DFLT 0 diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h index 12e67ad9a3b2..2b9b9a7ba97a 100644 --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h @@ -482,9 +482,6 @@ struct stb_context { #define WORKLOAD_POLICY_MAX 7 -/* Used to mask smu debug modes */ -#define SMU_DEBUG_HALT_ON_ERROR 0x1 - struct smu_context { struct amdgpu_device *adev; @@ -573,11 +570,6 @@ struct smu_context struct smu_user_dpm_profile user_dpm_profile; struct stb_context stb_context; - - /* - * 0 = disabled (default), otherwise enable corresponding debug mode - */ - uint32_t smu_debug_mask; }; struct i2c_adapter; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c index 43637d55fe29..735e1a1e365d 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c @@ -257,10 +257,11 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu, uint16_t msg_index, uint32_t param) { + struct amdgpu_device *adev = smu->adev; u32 reg; int res; - if (smu->adev->no_hw_access) + if (adev->no_hw_access) return 0; reg = __smu_cmn_poll_stat(smu); @@ -272,9 +273,9 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu, __smu_cmn_send_msg(smu, msg_index, param); res = 0; Out: - if (unlikely(smu->smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) && + if (unlikely(adev->pm.smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) && res && (res != -ETIME)) { - amdgpu_device_halt(smu->adev); + amdgpu_device_halt(adev); WARN_ON(1); } @@ -299,7 +300,7 @@ int smu_cmn_wait_for_response(struct smu_context *smu) reg = __smu_cmn_poll_stat(smu); res = __smu_cmn_reg2errno(smu, reg); - if (unlikely(smu->smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) && + if (unlikely(smu->adev->pm.smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) && res && (res != -ETIME)) { amdgpu_device_halt(smu->adev); WARN_ON(1); @@ -343,10 +344,11 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, uint32_t param, uint32_t *read_arg) { + struct amdgpu_device *adev = smu->adev; int res, index; u32 reg; - if (smu->adev->no_hw_access) + if (adev->no_hw_access) return 0; index = smu_cmn_to_asic_specific_index(smu, @@ -372,8 +374,8 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, if (read_arg) smu_cmn_read_arg(smu, read_arg); Out: - if (unlikely(smu->smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) && res) { - amdgpu_device_halt(smu->adev); + if (unlikely(adev->pm.smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) && res) { + amdgpu_device_halt(adev); WARN_ON(1); } -- cgit v1.2.3-70-g09d2