diff options
author | Dave Airlie <airlied@redhat.com> | 2019-10-26 05:56:57 +1000 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2019-10-26 05:56:57 +1000 |
commit | 3275a71e76fac5bc276f0d60e027b18c2e8d7a5b (patch) | |
tree | f275ab1c98be91f5e0fda869819e09c05d0918ab /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | |
parent | 2e79e22e092acd55da0b2db066e4826d7d152c41 (diff) | |
parent | 1cd4d9eead73c004d08a58536dc726bd172eaaec (diff) |
Merge tag 'drm-next-5.5-2019-10-09' of git://people.freedesktop.org/~agd5f/linux into drm-next
drm-next-5.5-2019-10-09:
amdgpu:
- Additional RAS enablement for vega20
- RAS page retirement and bad page storage in EEPROM
- No GPU reset with unrecoverable RAS errors
- Reserve vram for page tables rather than trying to evict
- Fix issues with GPU reset and xgmi hives
- DC i2c over aux fixes
- Direct submission for clears, PTE/PDE updates
- Improvements to help support recoverable GPU page faults
- Silence harmless SAD block messages
- Clean up code for creating a bo at a fixed location
- Initial DC HDCP support
- Lots of documentation fixes
- GPU reset for renoir
- Add IH clockgating support for soc15 asics
- Powerplay improvements
- DC MST cleanups
- Add support for MSI-X
- Misc cleanups and bug fixes
amdkfd:
- Query KFD device info by asic type rather than pci ids
- Add navi14 support
- Add renoir support
- Add navi12 support
- gfx10 trap handler improvements
- pasid cleanups
- Check against device cgroup
ttm:
- Return -EBUSY with pipelining with no_gpu_wait
radeon:
- Silence harmless SAD block messages
device_cgroup:
- Export devcgroup_check_permission
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexdeucher@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20191010041713.3412-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 43 |
1 files changed, 32 insertions, 11 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 6c76bb2a6843..f80fd3428c98 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -317,8 +317,6 @@ struct amdgpu_ras { struct list_head head; /* debugfs */ struct dentry *dir; - /* debugfs ctrl */ - struct dentry *ent; /* sysfs */ struct device_attribute features_attr; struct bin_attribute badpages_attr; @@ -334,7 +332,7 @@ struct amdgpu_ras { struct mutex recovery_lock; uint32_t flags; - + bool reboot; struct amdgpu_ras_eeprom_control eeprom_control; }; @@ -347,15 +345,14 @@ struct ras_err_data { unsigned long ue_count; unsigned long ce_count; unsigned long err_addr_cnt; - uint64_t *err_addr; + struct eeprom_table_record *err_addr; }; struct ras_err_handler_data { - /* point to bad pages array */ - struct { - unsigned long bp; - struct amdgpu_bo *bo; - } *bps; + /* point to bad page records array */ + struct eeprom_table_record *bps; + /* point to reserved bo array */ + struct amdgpu_bo **bps_bo; /* the count of entries */ int count; /* the space can place new entries */ @@ -365,7 +362,7 @@ struct ras_err_handler_data { }; typedef int (*ras_ih_cb)(struct amdgpu_device *adev, - struct ras_err_data *err_data, + void *err_data, struct amdgpu_iv_entry *entry); struct ras_ih_data { @@ -481,6 +478,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev, return ras && (ras->supported & (1 << block)); } +int amdgpu_ras_recovery_init(struct amdgpu_device *adev); int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, unsigned int block); @@ -492,7 +490,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, /* error handling functions */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, - unsigned long *bps, int pages); + struct eeprom_table_record *bps, int pages); int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev); @@ -501,6 +499,12 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + /* save bad page to eeprom before gpu reset, + * i2c may be unstable in gpu reset + */ + if (in_task()) + amdgpu_ras_reserve_bad_pages(adev); + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) schedule_work(&ras->recovery_work); return 0; @@ -566,6 +570,13 @@ amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) { int amdgpu_ras_init(struct amdgpu_device *adev); int amdgpu_ras_fini(struct amdgpu_device *adev); int amdgpu_ras_pre_fini(struct amdgpu_device *adev); +int amdgpu_ras_late_init(struct amdgpu_device *adev, + struct ras_common_if *ras_block, + struct ras_fs_if *fs_info, + struct ras_ih_if *ih_info); +void amdgpu_ras_late_fini(struct amdgpu_device *adev, + struct ras_common_if *ras_block, + struct ras_ih_if *ih_info); int amdgpu_ras_feature_enable(struct amdgpu_device *adev, struct ras_common_if *head, bool enable); @@ -599,4 +610,14 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, struct ras_dispatch_if *info); + +extern atomic_t amdgpu_ras_in_intr; + +static inline bool amdgpu_ras_intr_triggered(void) +{ + return !!atomic_read(&amdgpu_ras_in_intr); +} + +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); + #endif |