drm/amdgpu: Set EEPROM ras info

Set EEPROM RAS info: RMA status, health percent and bad page threshold.

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
author: Stanley.Yang <Stanley.Yang@amd.com> 2023-06-01 20:56:42 +0800
committer: Alex Deucher <alexander.deucher@amd.com> 2023-06-09 12:44:40 -0400
commit: 0bc3137b2157115f328859477b463c912d605c3a (patch)
tree: 096b78ea7ab5b6a74823ef12ce54cf4ac633b792 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
parent: 7c2551fa1dfdb06a9dd3a6c629086fe2c348e00a (diff)
1 files changed, 24 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 9eceb3bc1058..c2e8f6491ac6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -406,6 +406,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
 {
 	struct amdgpu_device *adev = to_amdgpu_device(control);
 	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+	struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai;
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	u8 csum;
 	int res;
@@ -423,6 +424,14 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
 		hdr->first_rec_offset = RAS_RECORD_START_V2_1;
 		hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
 				RAS_TABLE_V2_1_INFO_SIZE;
+		rai->rma_status = GPU_HEALTH_USABLE;
+		/**
+		 * GPU health represented as a percentage.
+		 * 0 means worst health, 100 means fully healthy.
+		 */
+		rai->health_percent = 100;
+		/* ecc_page_threshold = 0 means bad page retirement is disabled */
+		rai->ecc_page_threshold = con->bad_page_cnt_threshold;
 	} else {
 		hdr->first_rec_offset = RAS_RECORD_START;
 		hdr->tbl_size = RAS_TABLE_HEADER_SIZE;
@@ -712,6 +721,10 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
 			"Saved bad pages %d reaches threshold value %d\n",
 			control->ras_num_recs, ras->bad_page_cnt_threshold);
 		control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
+		if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
+			control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
+			control->tbl_rai.health_percent = 0;
+		}
 	}
 
 	if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
@@ -749,6 +762,17 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
 		goto Out;
 	}
 
+	/**
+	 * bad page records have been stored in eeprom,
+	 * now calculate gpu health percent
+	 */
+	if (amdgpu_bad_page_threshold != 0 &&
+	    control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
+	    control->ras_num_recs < ras->bad_page_cnt_threshold)
+		control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
+						   control->ras_num_recs) * 100) /
+						   ras->bad_page_cnt_threshold;
+
 	/* Recalc the checksum.
 	 */
 	csum = 0;
author	Stanley.Yang <Stanley.Yang@amd.com>	2023-06-01 20:56:42 +0800
committer	Alex Deucher <alexander.deucher@amd.com>	2023-06-09 12:44:40 -0400
commit	0bc3137b2157115f328859477b463c912d605c3a (patch)
tree	096b78ea7ab5b6a74823ef12ce54cf4ac633b792 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
parent	7c2551fa1dfdb06a9dd3a6c629086fe2c348e00a (diff)