Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--	drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c	130
1 file changed, 103 insertions, 27 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index cef94e2169fe..ab379b44679c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -31,6 +31,7 @@
 #include "amdgpu.h"
 #include "amdgpu_ras.h"
 #include "amdgpu_atomfirmware.h"
+#include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 
 const char *ras_error_string[] = {
@@ -280,6 +281,11 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 	struct ras_debug_if data;
 	int ret = 0;
 
+	if (amdgpu_ras_intr_triggered()) {
+		DRM_WARN("RAS WARN: error injection currently inaccessible\n");
+		return size;
+	}
+
 	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
 	if (ret)
 		return -EINVAL;
@@ -393,6 +399,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
 		.head = obj->head,
 	};
 
+	if (amdgpu_ras_intr_triggered())
+		return snprintf(buf, PAGE_SIZE,
+				"Query currently inaccessible\n");
+
 	if (amdgpu_ras_error_query(obj->adev, &info))
 		return -EINVAL;
 
@@ -720,6 +730,9 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
 		if (adev->nbio.funcs->query_ras_error_count)
 			adev->nbio.funcs->query_ras_error_count(adev, &err_data);
 		break;
+	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
+		amdgpu_xgmi_query_ras_error_count(adev, &err_data);
+		break;
 	default:
 		break;
 	}
@@ -742,20 +755,6 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
 	return 0;
 }
 
-uint64_t get_xgmi_relative_phy_addr(struct amdgpu_device *adev, uint64_t addr)
-{
-	uint32_t df_inst_id;
-
-	if ((!adev->df.funcs)                 ||
-	    (!adev->df.funcs->get_df_inst_id) ||
-	    (!adev->df.funcs->get_dram_base_addr))
-		return addr;
-
-	df_inst_id = adev->df.funcs->get_df_inst_id(adev);
-
-	return addr + adev->df.funcs->get_dram_base_addr(adev, df_inst_id);
-}
-
 /* wrapper of psp_ras_trigger_error */
 int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 		struct ras_inject_if *info)
@@ -775,8 +774,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 
 	/* Calculate XGMI relative offset */
 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
-		block_info.address = get_xgmi_relative_phy_addr(adev,
-								block_info.address);
+		block_info.address =
+			amdgpu_xgmi_get_relative_phy_addr(adev,
+							  block_info.address);
 	}
 
 	switch (info->head.block) {
@@ -1122,6 +1122,32 @@ void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
 				       &amdgpu_ras_debugfs_ops);
 }
 
+void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_manager *obj;
+	struct ras_fs_if fs_info;
+
+	/*
+	 * this is not called in the resume path, so there is no
+	 * need to check suspend and gpu-reset status
+	 */
+	if (!con)
+		return;
+
+	amdgpu_ras_debugfs_create_ctrl_node(adev);
+
+	list_for_each_entry(obj, &con->head, node) {
+		if (amdgpu_ras_is_supported(adev, obj->head.block) &&
+			(obj->attr_inuse == 1)) {
+			sprintf(fs_info.debugfs_name, "%s_err_inject",
+					ras_block_str(obj->head.block));
+			fs_info.head = obj->head;
+			amdgpu_ras_debugfs_create(adev, &fs_info);
+		}
+	}
+}
+
 void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
 		struct ras_common_if *head)
 {
@@ -1154,7 +1180,6 @@ static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
 static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
 {
 	amdgpu_ras_sysfs_create_feature_node(adev);
-	amdgpu_ras_debugfs_create_ctrl_node(adev);
 
 	return 0;
 }
@@ -1319,6 +1344,33 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
 }
 /* ih end */
 
+/* traverse all IPs except NBIO to query error counters */
+static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_manager *obj;
+
+	if (!con)
+		return;
+
+	list_for_each_entry(obj, &con->head, node) {
+		struct ras_query_if info = {
+			.head = obj->head,
+		};
+
+		/*
+		 * PCIE_BIF IP has its own isr, driven by the ras
+		 * controller interrupt; the block-specific ras counter
+		 * query is done in that isr, so skip this block in
+		 * the common sync flood interrupt isr.
+		 */
+		if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
+			continue;
+
+		amdgpu_ras_error_query(adev, &info);
+	}
+}
+
 /* recovery begin */
 
 /* return 0 on success.
@@ -1372,6 +1424,22 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 {
 	struct amdgpu_ras *ras =
 		container_of(work, struct amdgpu_ras, recovery_work);
+	struct amdgpu_device *remote_adev = NULL;
+	struct amdgpu_device *adev = ras->adev;
+	struct list_head device_list = LIST_HEAD_INIT(device_list), *device_list_handle = NULL;
+	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
+
+	/* Build list of devices to query RAS related errors */
+	if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
+		device_list_handle = &hive->device_list;
+	} else {
+		list_add_tail(&adev->gmc.xgmi.head, &device_list);
+		device_list_handle = &device_list;
+	}
+
+	list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) {
+		amdgpu_ras_log_on_err_counter(remote_adev);
+	}
 
 	if (amdgpu_device_should_recover_gpu(ras->adev))
 		amdgpu_device_gpu_recover(ras->adev, 0);
@@ -1713,18 +1781,30 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
 	*hw_supported = 0;
 	*supported = 0;
 
-	if (amdgpu_sriov_vf(adev) ||
+	if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
 	    (adev->asic_type != CHIP_VEGA20 &&
 	     adev->asic_type != CHIP_ARCTURUS))
 		return;
 
-	if (adev->is_atom_fw &&
-			(amdgpu_atomfirmware_mem_ecc_supported(adev) ||
-			 amdgpu_atomfirmware_sram_ecc_supported(adev)))
-		*hw_supported = AMDGPU_RAS_BLOCK_MASK;
+	if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+		DRM_INFO("HBM ECC is active.\n");
+		*hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
+				1 << AMDGPU_RAS_BLOCK__DF);
+	} else
+		DRM_INFO("HBM ECC is not present.\n");
+
+	if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+		DRM_INFO("SRAM ECC is active.\n");
+		*hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+				1 << AMDGPU_RAS_BLOCK__DF);
+	} else
+		DRM_INFO("SRAM ECC is not present.\n");
+
+	/* hw_supported needs to be aligned with RAS block mask. */
+	*hw_supported &= AMDGPU_RAS_BLOCK_MASK;
 
 	*supported = amdgpu_ras_enable == 0 ?
-				0 : *hw_supported & amdgpu_ras_mask;
+			0 : *hw_supported & amdgpu_ras_mask;
 }
 
 int amdgpu_ras_init(struct amdgpu_device *adev)
@@ -1825,8 +1905,6 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 			goto interrupt;
 	}
 
-	amdgpu_ras_debugfs_create(adev, fs_info);
-
 	r = amdgpu_ras_sysfs_create(adev, fs_info);
 	if (r)
 		goto sysfs;
@@ -1835,7 +1913,6 @@
 cleanup:
 	amdgpu_ras_sysfs_remove(adev, ras_block);
 sysfs:
-	amdgpu_ras_debugfs_remove(adev, ras_block);
 	if (ih_info->cb)
 		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
 interrupt:
@@ -1852,7 +1929,6 @@ void amdgpu_ras_late_fini(struct amdgpu_device *adev,
 		return;
 
 	amdgpu_ras_sysfs_remove(adev, ras_block);
-	amdgpu_ras_debugfs_remove(adev, ras_block);
 	if (ih_info->cb)
 		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
 	amdgpu_ras_feature_enable(adev, ras_block, 0);
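Note: the following is a minimal userspace sketch, not part of the patch, showing one way to exercise the ras_ctrl debugfs node gated above. The node path (/sys/kernel/debug/dri/0/ras/ras_ctrl) and the injection command follow the usage documented in amdgpu_ras.c, but the DRM minor number and the command arguments are assumptions that vary by system and ASIC. Once amdgpu_ras_intr_triggered() reports a fatal RAS interrupt, the write below is still accepted (the handler returns size) but the injection is skipped and the kernel logs "RAS WARN: error injection currently inaccessible".

/* sketch only: node path and command string are assumptions, see note above */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *node = "/sys/kernel/debug/dri/0/ras/ras_ctrl";
	const char *cmd = "inject umc ue 0x0 0x0 0x0\n";
	ssize_t len = (ssize_t)strlen(cmd);
	int fd = open(node, O_WRONLY);	/* needs root; debugfs must be mounted */

	if (fd < 0) {
		perror("open ras_ctrl");
		return 1;
	}
	/* the ctrl write returns the full size even when injection is gated */
	if (write(fd, cmd, len) != len)
		perror("write ras_ctrl");
	close(fd);
	return 0;
}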
