From 76ad30f51aa0d1bd99f12658d4775a86df6e4282 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 18 Apr 2024 15:46:00 +0800 Subject: drm/amdgpu: add MCA smu cache support v1: because SMU CE valid mca bank will be cleared after reading, this patch adds mca cache at the driver level to ensure that the mca bank is not lost. v2: refine amdgpu_mca_init/fini/reset() function name. v3: add mca_cache.lock support only add CE bank to mca bank cache. Signed-off-by: Yang Wang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1adc81a55734..0522533c9182 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3629,6 +3629,13 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) amdgpu_ras_set_aca_debug_mode(adev, false); } else { + if (amdgpu_in_reset(adev)) + r = amdgpu_mca_reset(adev); + else + r = amdgpu_mca_init(adev); + if (r) + return r; + amdgpu_ras_set_mca_debug_mode(adev, false); } @@ -3701,6 +3708,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) if (amdgpu_aca_is_enabled(adev)) amdgpu_aca_fini(adev); + else + amdgpu_mca_fini(adev); WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared"); -- cgit v1.2.3-70-g09d2 From 3ca73073f46a74dd53c3c1222c2486a593052e72 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Thu, 25 Apr 2024 15:53:03 +0800 Subject: drm/amdgpu: Remove redundant function call Remove redundant function call. Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0522533c9182..a037e8fba29f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2804,8 +2804,8 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) mutex_unlock(&con->umc_ecc_log.lock); } -static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev, - enum amdgpu_ras_block ras_block, uint32_t timeout_ms) +static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, + uint32_t timeout_ms) { int ret = 0; struct ras_ecc_log_info *ecc_log; @@ -2814,7 +2814,7 @@ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev, struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); memset(&info, 0, sizeof(info)); - info.head.block = ras_block; + info.head.block = AMDGPU_RAS_BLOCK__UMC; ecc_log = &ras->umc_ecc_log; ecc_log->de_updated = false; @@ -2822,7 +2822,7 @@ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev, ret = amdgpu_ras_query_error_status(adev, &info); if (ret) { dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret); - return ret; + return; } if (timeout && !ecc_log->de_updated) { @@ -2833,21 +2833,11 @@ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev, if (timeout_ms && !timeout) { dev_warn(adev->dev, "Can't find deferred error\n"); - return -ETIMEDOUT; + return; } - return 0; -} - -static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, - uint32_t timeout) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - int ret; - - ret = amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout); if (!ret) - schedule_delayed_work(&con->page_retirement_dwork, 0); + schedule_delayed_work(&ras->page_retirement_dwork, 0); } static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, -- cgit v1.2.3-70-g09d2 From 1dbd59f3f4d3fd75287aa16ff0976f25213e4c03 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Wed, 17 Apr 2024 03:00:18 +0800 Subject: drm/amdgpu: Add psp v13_0_14 ip block Add psp v13_0_14 ip block support. Signed-off-by: Hawking Zhang Reviewed-by: Le Ma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 13 ++++++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 ++ drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 12 +++++++++--- drivers/gpu/drm/amd/amdgpu/soc15.c | 3 ++- 5 files changed, 24 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index 911ac69b1ae4..f4fefda7e7d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -1851,6 +1851,7 @@ static int amdgpu_discovery_set_psp_ip_blocks(struct amdgpu_device *adev) case IP_VERSION(13, 0, 8): case IP_VERSION(13, 0, 10): case IP_VERSION(13, 0, 11): + case IP_VERSION(13, 0, 14): case IP_VERSION(14, 0, 0): case IP_VERSION(14, 0, 1): amdgpu_device_ip_block_add(adev, &psp_v13_0_ip_block); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index a551c5b67fdd..37820dd03cab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -145,6 +145,7 @@ static int psp_init_sriov_microcode(struct psp_context *psp) adev->virt.autoload_ucode_id = 0; break; case IP_VERSION(13, 0, 6): + case IP_VERSION(13, 0, 14): ret = psp_init_cap_microcode(psp, ucode_prefix); ret &= psp_init_ta_microcode(psp, ucode_prefix); break; @@ -207,6 +208,7 @@ static int psp_early_init(void *handle) psp->boot_time_tmr = false; fallthrough; case IP_VERSION(13, 0, 6): + case IP_VERSION(13, 0, 14): psp_v13_0_set_psp_funcs(psp); psp->autoload_supported = false; break; @@ -355,7 +357,8 @@ static bool psp_get_runtime_db_entry(struct amdgpu_device *adev, bool ret = false; int i; - if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) + if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) || + amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14)) return false; db_header_pos = adev->gmc.mc_vram_size - PSP_RUNTIME_DB_OFFSET; @@ -847,6 +850,7 @@ static bool psp_skip_tmr(struct psp_context *psp) case IP_VERSION(13, 0, 2): case IP_VERSION(13, 0, 6): case IP_VERSION(13, 0, 10): + case IP_VERSION(13, 0, 14): return true; default: return false; @@ -1450,7 +1454,9 @@ int psp_xgmi_get_topology_info(struct psp_context *psp, (psp->xgmi_context.supports_extended_data && get_extended_data) || amdgpu_ip_version(psp->adev, MP0_HWIP, 0) == - IP_VERSION(13, 0, 6); + IP_VERSION(13, 0, 6) || + amdgpu_ip_version(psp->adev, MP0_HWIP, 0) == + IP_VERSION(13, 0, 14); bool ta_port_num_support = amdgpu_sriov_vf(psp->adev) ? 0 : psp->xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG; @@ -2636,7 +2642,8 @@ static int psp_load_p2s_table(struct psp_context *psp) (adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO))) return 0; - if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) { + if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) || + amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14)) { uint32_t supp_vers = adev->flags & AMD_IS_APU ? 0x0036013D : 0x0036003C; if (psp->sos.fw_version < supp_vers) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a037e8fba29f..7b30f448eab6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3053,6 +3053,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { case IP_VERSION(13, 0, 2): case IP_VERSION(13, 0, 6): + case IP_VERSION(13, 0, 14): return true; default: return false; @@ -3064,6 +3065,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) case IP_VERSION(13, 0, 0): case IP_VERSION(13, 0, 6): case IP_VERSION(13, 0, 10): + case IP_VERSION(13, 0, 14): return true; default: return false; diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index 0da50ea46eaf..b52e15e2dcc7 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -51,6 +51,8 @@ MODULE_FIRMWARE("amdgpu/psp_13_0_11_toc.bin"); MODULE_FIRMWARE("amdgpu/psp_13_0_11_ta.bin"); MODULE_FIRMWARE("amdgpu/psp_13_0_6_sos.bin"); MODULE_FIRMWARE("amdgpu/psp_13_0_6_ta.bin"); +MODULE_FIRMWARE("amdgpu/psp_13_0_14_sos.bin"); +MODULE_FIRMWARE("amdgpu/psp_13_0_14_ta.bin"); MODULE_FIRMWARE("amdgpu/psp_14_0_0_toc.bin"); MODULE_FIRMWARE("amdgpu/psp_14_0_0_ta.bin"); MODULE_FIRMWARE("amdgpu/psp_14_0_1_toc.bin"); @@ -115,6 +117,7 @@ static int psp_v13_0_init_microcode(struct psp_context *psp) case IP_VERSION(13, 0, 6): case IP_VERSION(13, 0, 7): case IP_VERSION(13, 0, 10): + case IP_VERSION(13, 0, 14): err = psp_init_sos_microcode(psp, ucode_prefix); if (err) return err; @@ -168,7 +171,8 @@ static int psp_v13_0_wait_for_bootloader(struct psp_context *psp) int retry_loop, retry_cnt, ret; retry_cnt = - (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) ? + ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) || + amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14))) ? PSP_VMBX_POLLING_LIMIT : 10; /* Wait for bootloader to signify that it is ready having bit 31 of @@ -193,7 +197,8 @@ static int psp_v13_0_wait_for_bootloader_steady_state(struct psp_context *psp) struct amdgpu_device *adev = psp->adev; int ret; - if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) { + if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) || + amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14)) { ret = psp_v13_0_wait_for_vmbx_ready(psp); if (ret) amdgpu_ras_query_boot_status(adev, 4); @@ -787,7 +792,8 @@ static bool psp_v13_0_get_ras_capability(struct psp_context *psp) if (!con) return false; - if ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) && + if ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) || + amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14)) && (!(adev->flags & AMD_IS_APU))) { reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_127); adev->ras_hw_enabled = (reg_data & GENMASK_ULL(23, 0)); diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 170f02e96717..55ee5ac82879 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1458,7 +1458,8 @@ static void soc15_common_get_clockgating_state(void *handle, u64 *flags) adev->hdp.funcs->get_clock_gating_state(adev, flags); if ((amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(13, 0, 2)) && - (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(13, 0, 6))) { + (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(13, 0, 6)) && + (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(13, 0, 14))) { /* AMD_CG_SUPPORT_DRM_MGCG */ data = RREG32(SOC15_REG_OFFSET(MP0, 0, mmMP0_MISC_CGTT_CTRL0)); if (!(data & 0x01000000)) -- cgit v1.2.3-70-g09d2 From 329cec8f18fc8bed7570b68b18936724af2f5593 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 7 May 2024 10:28:04 +0800 Subject: drm/amdgpu: fix RAS unload driver issue in SRIOV Fix null pointer issue when unload driver in SRIOV mode. Adjust the function position to ensure that the amdgpu_mca/aca_xxx_init() related functions can be initialized properly. Signed-off-by: Yang Wang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 7b30f448eab6..2c5ad9530299 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3605,10 +3605,6 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) struct amdgpu_ras_block_object *obj; int r; - /* Guest side doesn't need init ras feature */ - if (amdgpu_sriov_vf(adev)) - return 0; - amdgpu_ras_event_mgr_init(adev); if (amdgpu_aca_is_enabled(adev)) { @@ -3619,7 +3615,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) if (r) return r; - amdgpu_ras_set_aca_debug_mode(adev, false); + if (!amdgpu_sriov_vf(adev)) + amdgpu_ras_set_aca_debug_mode(adev, false); } else { if (amdgpu_in_reset(adev)) r = amdgpu_mca_reset(adev); @@ -3628,9 +3625,14 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) if (r) return r; - amdgpu_ras_set_mca_debug_mode(adev, false); + if (!amdgpu_sriov_vf(adev)) + amdgpu_ras_set_mca_debug_mode(adev, false); } + /* Guest side doesn't need init ras feature */ + if (amdgpu_sriov_vf(adev)) + return 0; + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { obj = node->ras_obj; if (!obj) { -- cgit v1.2.3-70-g09d2 From 2b3b9d2150c02c340f2ad50bae1a20f6913587ce Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Tue, 7 May 2024 16:19:09 +0800 Subject: drm/amdgpu: change log level Change log level. Signed-off-by: YiPeng Chai Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2c5ad9530299..1dd13ed3b7b5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2886,7 +2886,7 @@ static int amdgpu_ras_page_retirement_thread(void *param) ras_block = poison_msg.block; - dev_info(adev->dev, "Start processing ras block %s(%d)\n", + dev_dbg(adev->dev, "Start processing ras block %s(%d)\n", ras_block_str(ras_block), ras_block); if (ras_block == AMDGPU_RAS_BLOCK__UMC) { diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 0f1a276bc628..6d6350f220b0 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -71,7 +71,7 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device *adev) bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status) { - dev_info(adev->dev, + dev_dbg(adev->dev, "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, PCC:%llu, UC:%llu, TCC:%llu\n", mc_umc_status, REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val), @@ -504,7 +504,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, err_addr = REG_GET_FIELD(addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); - dev_info(adev->dev, + dev_dbg(adev->dev, "UMC:IPID:0x%llx, socket:%llu, aid:%llu, inst:%llu, ch:%llu, err_addr:0x%llx\n", ipid, MCA_IPID_2_SOCKET_ID(ipid), -- cgit v1.2.3-70-g09d2 From 01b32973367bbcd12ef38f7a9fbed3a0f1603b6e Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Mon, 13 May 2024 13:42:20 +0800 Subject: drm/amdgpu: Remove dead code in amdgpu_ras_add_mca_err_addr Remove dead code in amdgpu_ras_add_mca_err_addr Signed-off-by: Ma Jun Reviewed-by: YiPeng Chai Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1dd13ed3b7b5..925ec65ac5ed 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -4287,21 +4287,8 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *err_addr) { - struct ras_err_addr *mca_err_addr; - /* This function will be retired. */ return; - mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL); - if (!mca_err_addr) - return; - - INIT_LIST_HEAD(&mca_err_addr->node); - - mca_err_addr->err_status = err_addr->err_status; - mca_err_addr->err_ipid = err_addr->err_ipid; - mca_err_addr->err_addr = err_addr->err_addr; - - list_add_tail(&mca_err_addr->node, &err_info->err_addr_list); } void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *mca_err_addr) -- cgit v1.2.3-70-g09d2 From 4c11d30c95576937c6c35e6f29884761f2dddb43 Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Sat, 11 May 2024 15:48:02 +0800 Subject: drm/amdgpu: Fix the null pointer dereference to ras_manager Check ras_manager before using it Signed-off-by: Ma Jun Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 925ec65ac5ed..2bcf5c3b5d70 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2172,12 +2172,15 @@ static void amdgpu_ras_interrupt_process_handler(struct work_struct *work) int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, struct ras_dispatch_if *info) { - struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); - struct ras_ih_data *data = &obj->ih_data; + struct ras_manager *obj; + struct ras_ih_data *data; + obj = amdgpu_ras_find_obj(adev, &info->head); if (!obj) return -EINVAL; + data = &obj->ih_data; + if (data->inuse == 0) return 0; -- cgit v1.2.3-70-g09d2 From b712d7c20133b67f13aa134e7534369f19e1214f Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 14 May 2024 07:48:19 +0800 Subject: drm/amdgpu: fix compiler 'side-effect' check issue for RAS_EVENT_LOG() create a new helper function to avoid compiler 'side-effect' check about RAS_EVENT_LOG() macro. Signed-off-by: Yang Wang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 18 ++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 13 ++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2bcf5c3b5d70..51f037cc807b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -4494,3 +4494,21 @@ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn) return ret; } + +void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + if (amdgpu_ras_event_id_is_valid(adev, event_id)) + dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, &vaf); + else + dev_printk(KERN_INFO, adev->dev, "%pV", &vaf); + + va_end(args); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index c8980d5f6540..6a8c7b1609df 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -67,13 +67,8 @@ struct amdgpu_iv_entry; /* The high three bits indicates socketid */ #define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK) -#define RAS_EVENT_LOG(_adev, _id, _fmt, ...) \ -do { \ - if (amdgpu_ras_event_id_is_valid((_adev), (_id))) \ - dev_info((_adev)->dev, "{%llu}" _fmt, (_id), ##__VA_ARGS__); \ - else \ - dev_info((_adev)->dev, _fmt, ##__VA_ARGS__); \ -} while (0) +#define RAS_EVENT_LOG(adev, id, fmt, ...) \ + amdgpu_ras_event_log_print((adev), (id), (fmt), ##__VA_ARGS__); enum amdgpu_ras_block { AMDGPU_RAS_BLOCK__UMC = 0, @@ -956,4 +951,8 @@ int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, enum amdgpu_ras_block block, uint16_t pasid, pasid_notify pasid_fn, void *data, uint32_t reset); +__printf(3, 4) +void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, + const char *fmt, ...); + #endif -- cgit v1.2.3-70-g09d2 From 062a7ce676e092faf03daa6c579224bb3d904ae1 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Fri, 17 May 2024 09:18:38 +0800 Subject: drm/amdgpu: fix ACA no query result after gpu reset fix ACA no query result after gpu reset. Signed-off-by: Yang Wang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 7 ------- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 ++++----- 3 files changed, 4 insertions(+), 13 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index 7260a6ce015a..01d50ad603d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -712,13 +712,6 @@ void amdgpu_aca_fini(struct amdgpu_device *adev) atomic_set(&aca->ue_update_flag, 0); } -int amdgpu_aca_reset(struct amdgpu_device *adev) -{ - amdgpu_aca_fini(adev); - - return amdgpu_aca_init(adev); -} - void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs *smu_funcs) { struct amdgpu_aca *aca = &adev->aca; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h index ba724c2a997d..4327ce1ceacf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h @@ -192,7 +192,6 @@ struct aca_info { int amdgpu_aca_init(struct amdgpu_device *adev); void amdgpu_aca_fini(struct amdgpu_device *adev); -int amdgpu_aca_reset(struct amdgpu_device *adev); void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs *smu_funcs); bool amdgpu_aca_is_enabled(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 51f037cc807b..06247818635a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3611,12 +3611,11 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) amdgpu_ras_event_mgr_init(adev); if (amdgpu_aca_is_enabled(adev)) { - if (amdgpu_in_reset(adev)) - r = amdgpu_aca_reset(adev); - else + if (!amdgpu_in_reset(adev)) { r = amdgpu_aca_init(adev); - if (r) - return r; + if (r) + return r; + } if (!amdgpu_sriov_vf(adev)) amdgpu_ras_set_aca_debug_mode(adev, false); -- cgit v1.2.3-70-g09d2 From 9262f411dc2e765d8a1d52d33c84d2ebb0580cec Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 24 Apr 2024 10:47:35 +0800 Subject: drm/amdgpu: skip to create ras xxx_err_count node when ACA is enabled skip to create 'xxx_err_count' node when ACA is enabled. Signed-off-by: Yang Wang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 06247818635a..8073716bc5ac 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1759,6 +1759,9 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, { struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); + if (amdgpu_aca_is_enabled(adev)) + return 0; + if (!obj || obj->attr_inuse) return -EINVAL; @@ -1793,6 +1796,9 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, { struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); + if (amdgpu_aca_is_enabled(adev)) + return 0; + if (!obj || !obj->attr_inuse) return -EINVAL; -- cgit v1.2.3-70-g09d2 From 3c603b1fa8b44de94c97dd0baf9b40a74586b55d Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 27 May 2024 15:40:38 +0800 Subject: drm/amdgpu: fix typo in amdgpu_ras_aca_sysfs_read() function fix typo "info.ue_count" in amdgpu_ras_aca_sysfs_read() function. Fixes: 865d3397630b ("drm/amdgpu: add aca deferred error type support") Signed-off-by: Yang Wang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 8073716bc5ac..db4a811cc0f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1299,7 +1299,7 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a return -EINVAL; return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count, - "ce", info.ce_count, "de", info.ue_count); + "ce", info.ce_count, "de", info.de_count); } static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, -- cgit v1.2.3-70-g09d2 From 473af28d3e63b9b679c7878df33616c7ca6ea947 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Tue, 28 May 2024 13:52:46 +0800 Subject: drm/amdgpu: Estimate RAS reservation when report capacity v2 Add estimate of how much vram we need to reserve for RAS when caculating the total available vram. v2: apply the change to MP0 v13_0_2 and v13_0_14 Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 9 +++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 ++++ 3 files changed, 31 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 3af00b57cd8a..11672bfe4fad 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -172,6 +172,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, { uint64_t reserved_for_pt = ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0); size_t system_mem_needed, ttm_mem_needed, vram_needed; int ret = 0; uint64_t vram_size = 0; @@ -220,7 +222,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, (kfd_mem_limit.ttm_mem_used + ttm_mem_needed > kfd_mem_limit.max_ttm_mem_limit) || (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed > - vram_size - reserved_for_pt - atomic64_read(&adev->vram_pin_size))) { + vram_size - reserved_for_pt - reserved_for_ras - atomic64_read(&adev->vram_pin_size))) { ret = -ENOMEM; goto release; } @@ -1673,6 +1675,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev, { uint64_t reserved_for_pt = ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0); ssize_t available; uint64_t vram_available, system_mem_available, ttm_mem_available; @@ -1680,7 +1684,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev, vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id) - adev->kfd.vram_used_aligned[xcp_id] - atomic64_read(&adev->vram_pin_size) - - reserved_for_pt; + - reserved_for_pt + - reserved_for_ras; if (adev->flags & AMD_IS_APU) { system_mem_available = no_system_mem_limit ? diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index db4a811cc0f5..45b696524541 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3298,6 +3298,24 @@ static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) amdgpu_put_xgmi_hive(hive); } +static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (!con || (adev->flags & AMD_IS_APU)) + return; + + switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { + case IP_VERSION(13, 0, 2): + case IP_VERSION(13, 0, 6): + case IP_VERSION(13, 0, 14): + con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE; + break; + default: + break; + } +} + int amdgpu_ras_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -3403,6 +3421,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev) /* Get RAS schema for particular SOC */ con->schema = amdgpu_get_ras_schema(adev); + amdgpu_ras_init_reserved_vram_size(adev); + if (amdgpu_ras_fs_init(adev)) { r = -EINVAL; goto release_con; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index d06c01b978cd..36a4de89c27d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -64,6 +64,9 @@ struct amdgpu_iv_entry; #define AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29 #define AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe0000000 +/* Reserve 8 physical dram row for possible retirement. + * In worst cases, it will lose 8 * 2MB memory in vram domain */ +#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20) /* The high three bits indicates socketid */ #define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK) @@ -541,6 +544,7 @@ struct amdgpu_ras { struct ras_event_manager __event_mgr; struct ras_event_manager *event_mgr; + uint64_t reserved_pages_in_bytes; }; struct ras_fs_data { -- cgit v1.2.3-70-g09d2 From a474161e84fc0b15534a80f8dfcbaf5e48fd8249 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 30 May 2024 16:02:12 +0800 Subject: drm/amdgpu: Update programming for boot error reporting AMDGPU_RAS_GPU_ERR_BOOT_STATUS field is no longer valid. The polling sequence is also simplifed according to the latest firmware change. Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 99 +++++++++++++++------------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 +- 2 files changed, 46 insertions(+), 57 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 45b696524541..8dbfdb767f94 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -4400,64 +4400,74 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, #define mmMP0_SMN_C2PMSG_92 0x1609C #define mmMP0_SMN_C2PMSG_126 0x160BE static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, - u32 instance, u32 boot_error) + u32 instance) { u32 socket_id, aid_id, hbm_id; - u32 reg_data; + u32 fw_status; + u32 boot_error; u64 reg_addr; - socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); - aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); - hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); - /* The pattern for smn addressing in other SOC could be different from * the one for aqua_vanjaram. We should revisit the code if the pattern * is changed. In such case, replace the aqua_vanjaram implementation * with more common helper */ reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + aqua_vanjaram_encode_ext_smn_addressing(instance); + fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr); + + reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + + aqua_vanjaram_encode_ext_smn_addressing(instance); + boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr); - reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); - dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n", - socket_id, aid_id, reg_data); + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n", - socket_id, aid_id, hbm_id); + dev_info(adev->dev, + "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, memory training failed\n", + socket_id, aid_id, hbm_id, fw_status); if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n", - socket_id, aid_id); + dev_info(adev->dev, + "socket: %d, aid: %d, fw_status: 0x%x, firmware load failed at boot time\n", + socket_id, aid_id, fw_status); if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n", - socket_id, aid_id); + dev_info(adev->dev, + "socket: %d, aid: %d, fw_status: 0x%x, wafl link training failed\n", + socket_id, aid_id, fw_status); if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n", - socket_id, aid_id); + dev_info(adev->dev, + "socket: %d, aid: %d, fw_status: 0x%x, xgmi link training failed\n", + socket_id, aid_id, fw_status); if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n", - socket_id, aid_id); + dev_info(adev->dev, + "socket: %d, aid: %d, fw_status: 0x%x, usr cp link training failed\n", + socket_id, aid_id, fw_status); if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training failed\n", - socket_id, aid_id); + dev_info(adev->dev, + "socket: %d, aid: %d, fw_status: 0x%x, usr dp link training failed\n", + socket_id, aid_id, fw_status); if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n", - socket_id, aid_id, hbm_id); + dev_info(adev->dev, + "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm memory test failed\n", + socket_id, aid_id, hbm_id, fw_status); if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) - dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n", - socket_id, aid_id, hbm_id); + dev_info(adev->dev, + "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm bist test failed\n", + socket_id, aid_id, hbm_id, fw_status); } -static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, - u32 instance, u32 *boot_error) +static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev, + u32 instance) { - u32 reg_addr; + u64 reg_addr; u32 reg_data; int retry_loop; @@ -4466,41 +4476,22 @@ static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); - if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS) { - *boot_error = AMDGPU_RAS_BOOT_SUCEESS; - return 0; - } - msleep(1); - } - - /* The pattern for smn addressing in other SOC could be different from - * the one for aqua_vanjaram. We should revisit the code if the pattern - * is changed. In such case, replace the aqua_vanjaram implementation - * with more common helper */ - reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + - aqua_vanjaram_encode_ext_smn_addressing(instance); - - for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { - reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); - if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) { - *boot_error = reg_data; - return 0; - } - msleep(1); + if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS) + return false; + else + msleep(1); } - *boot_error = reg_data; - return -ETIME; + return true; } void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances) { - u32 boot_error = 0; u32 i; for (i = 0; i < num_instances; i++) { - if (amdgpu_ras_wait_for_boot_complete(adev, i, &boot_error)) - amdgpu_ras_boot_time_error_reporting(adev, i, boot_error); + if (amdgpu_ras_boot_error_detected(adev, i)) + amdgpu_ras_boot_time_error_reporting(adev, i); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 36a4de89c27d..56b9bf63b67f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -47,12 +47,10 @@ struct amdgpu_iv_entry; #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x) AMDGPU_GET_REG_FIELD(x, 10, 8) #define AMDGPU_RAS_GPU_ERR_AID_ID(x) AMDGPU_GET_REG_FIELD(x, 12, 11) #define AMDGPU_RAS_GPU_ERR_HBM_ID(x) AMDGPU_GET_REG_FIELD(x, 14, 13) -#define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x) AMDGPU_GET_REG_FIELD(x, 31, 31) -#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 1000 +#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 100 #define AMDGPU_RAS_BOOT_STEADY_STATUS 0xBA #define AMDGPU_RAS_BOOT_STATUS_MASK 0xFF -#define AMDGPU_RAS_BOOT_SUCEESS 0x80000000 #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) /* position of instance value in sub_block_index of -- cgit v1.2.3-70-g09d2 From b95fa494d6b74c30eeb4a50481aa1041c631754e Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 23 May 2024 11:23:20 +0800 Subject: drm/amdgpu: add RAS is_rma flag Set the flag to true if bad page number reaches threshold. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 ++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 ++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 +-- 4 files changed, 12 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 8dbfdb767f94..b3d11703df04 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2926,7 +2926,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; u32 max_eeprom_records_count = 0; - bool exc_err_limit = false; int ret; if (!con || amdgpu_sriov_vf(adev)) @@ -2963,12 +2962,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) */ if (adev->gmc.xgmi.pending_reset) return 0; - ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit); + ret = amdgpu_ras_eeprom_init(&con->eeprom_control); /* - * This calling fails when exc_err_limit is true or + * This calling fails when is_rma is true or * ret != 0. */ - if (exc_err_limit || ret) + if (con->is_rma || ret) goto free; if (con->eeprom_control.ras_num_recs) { @@ -3016,7 +3015,7 @@ out: * Except error threshold exceeding case, other failure cases in this * function would not fail amdgpu driver init. */ - if (!exc_err_limit) + if (!con->is_rma) ret = 0; else ret = -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 56b9bf63b67f..e70c45712ddb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -522,6 +522,7 @@ struct amdgpu_ras { bool update_channel_flag; /* Record status of smu mca debug mode */ bool is_aca_debug_mode; + bool is_rma; /* Record special requirements of gpu reset caller */ uint32_t gpu_reset_flags; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 9b789dcc2bd1..eae0a555df3c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) control->tbl_rai.health_percent = 0; } + if (amdgpu_bad_page_threshold != -1) + ras->is_rma = true; + /* ignore the -ENOTSUPP return value */ amdgpu_dpm_send_rma_reason(adev); } @@ -1321,8 +1324,7 @@ Out: return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res; } -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, - bool *exceed_err_limit) +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) { struct amdgpu_device *adev = to_amdgpu_device(control); unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; @@ -1330,7 +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); int res; - *exceed_err_limit = false; + ras->is_rma = false; if (!__is_ras_eeprom_supported(adev)) return 0; @@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1."); res = 0; } else { - *exceed_err_limit = true; + ras->is_rma = true; dev_err(adev->dev, "RAS records:%d exceed threshold:%d, " "GPU will not be initialized. Replace this GPU or increase the threshold", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 6dfd667f3013..b9ebda577797 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -129,8 +129,7 @@ struct eeprom_table_record { unsigned char mcumc_id; } __packed; -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, - bool *exceed_err_limit); +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control); int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control); -- cgit v1.2.3-70-g09d2