summary | refs | log | tree | commit | diff
path: root/drivers/gpu/drm/amd/amdgpu
diff options
context:
space:
mode:
author: YiPeng Chai <YiPeng.Chai@amd.com> 2024-04-22 17:37:36 +0800
committer: Alex Deucher <alexander.deucher@amd.com> 2024-04-26 17:22:41 -0400
commit: f493dd64ee6680dc5bb46d7c800346eadb18049a (patch)
tree: 121afe9b853713ae9c33759a35d05f90835b2bbd /drivers/gpu/drm/amd/amdgpu
parent: 98b5bc878d4b522c035309c8f6d3247d54050369 (diff)
drm/amdgpu: prepare for logging ecc errors
Prepare for logging ecc errors.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 32
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 23
2 files changed, 55 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6641f27cb35c..fb2f88005cf3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2737,6 +2737,35 @@ static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev,
}
#endif
+static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) /* Put @ecc_log into its initial state: lock, hash key, empty tree, flag cleared. */
+{
+ mutex_init(&ecc_log->lock); /* amdgpu_ras_ecc_log_fini() takes this lock while emptying de_page_tree */
+
+ /* Fill the siphash key with a fixed 0xad byte pattern; any value is acceptable as the key here */
+ memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key));
+
+ INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); /* tree starts empty; GFP_KERNEL is the mask recorded in the root */
+ ecc_log->de_updated = false;
+}
+
+static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) /* Free every entry left in de_page_tree, then destroy the lock and clear the flag. */
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+ struct ras_ecc_err *ecc_err;
+
+ mutex_lock(&ecc_log->lock); /* the whole tree teardown below runs under the log lock */
+ radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &iter, 0) { /* iterate the tree's slots starting from index 0 */
+ ecc_err = radix_tree_deref_slot(slot); /* tree entries are treated as struct ras_ecc_err pointers */
+ kfree(ecc_err->err_pages.pfn); /* free the pfn array first: it is reached through ecc_err */
+ kfree(ecc_err);
+ radix_tree_iter_delete(&ecc_log->de_page_tree, &iter, slot); /* remove the slot whose entry was just freed */
+ }
+ mutex_unlock(&ecc_log->lock);
+
+ mutex_destroy(&ecc_log->lock); /* NOTE(review): assumes no other user of the lock remains at this point - confirm against callers */
+ ecc_log->de_updated = false;
+}
static int amdgpu_ras_page_retirement_thread(void *param)
{
struct amdgpu_device *adev = (struct amdgpu_device *)param;
@@ -2838,6 +2867,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n");
}
+ amdgpu_ras_ecc_log_init(&con->umc_ecc_log);
#ifdef CONFIG_X86_MCE_AMD
if ((adev->asic_type == CHIP_ALDEBARAN) &&
(adev->gmc.xgmi.connected_to_cpu))
@@ -2882,6 +2912,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
cancel_work_sync(&con->recovery_work);
+ amdgpu_ras_ecc_log_fini(&con->umc_ecc_log);
+
mutex_lock(&con->recovery_lock);
con->eh_data = NULL;
kfree(data->bps);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 2b15996f1ede..634654cf2634 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -27,6 +27,8 @@
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/kfifo.h>
+#include <linux/radix-tree.h>
+#include <linux/siphash.h>
#include "ta_ras_if.h"
#include "amdgpu_ras_eeprom.h"
#include "amdgpu_smuio.h"
@@ -454,6 +456,26 @@ struct ras_poison_msg {
void *data;
};
+struct ras_err_pages { /* pfn array carried by a struct ras_ecc_err */
+ uint32_t count; /* presumably the number of valid entries in pfn - TODO confirm against users */
+ uint64_t *pfn; /* separately allocated; released with kfree() in amdgpu_ras_ecc_log_fini() */
+};
+
+struct ras_ecc_err { /* one entry of ras_ecc_log_info.de_page_tree; freed with kfree() in amdgpu_ras_ecc_log_fini() */
+ u64 hash_index; /* presumably the siphash-derived tree index of this entry - TODO confirm, no user in this patch */
+ uint64_t status; /* NOTE(review): status/ipid/addr look like raw error-register values; not shown in this patch */
+ uint64_t ipid;
+ uint64_t addr;
+ struct ras_err_pages err_pages; /* err_pages.pfn is freed together with this entry */
+};
+
+struct ras_ecc_log_info { /* ECC error log state; embedded in struct amdgpu_ras as umc_ecc_log */
+ struct mutex lock; /* held by amdgpu_ras_ecc_log_fini() while de_page_tree is emptied */
+ siphash_key_t ecc_key; /* siphash key; amdgpu_ras_ecc_log_init() fills it with 0xad bytes */
+ struct radix_tree_root de_page_tree; /* holds struct ras_ecc_err entries */
+ bool de_updated; /* cleared by both init and fini; nothing in this patch sets it to true */
+};
+
struct amdgpu_ras {
/* ras infrastructure */
/* for ras itself. */
@@ -514,6 +536,7 @@ struct amdgpu_ras {
atomic_t page_retirement_req_cnt;
struct mutex page_rsv_lock;
DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
+ struct ras_ecc_log_info umc_ecc_log;
/* Fatal error detected flag */
atomic_t fed;