From 5c1e6fa49e8d8dbdd8bb457492b2bc52718df244 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Thu, 16 Dec 2021 13:35:27 -0500 Subject: drm/amdgpu: introduce new amdgpu_fence object to indicate the job embedded fence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The job embedded fence donesn't initialize the flags at dma_fence_init(). Then we will go a wrong way in amdgpu_fence_get_timeline_name callback and trigger a null pointer panic once we enabled the trace event here. So introduce new amdgpu_fence object to indicate the job embedded fence. [ 156.131790] BUG: kernel NULL pointer dereference, address: 00000000000002a0 [ 156.131804] #PF: supervisor read access in kernel mode [ 156.131811] #PF: error_code(0x0000) - not-present page [ 156.131817] PGD 0 P4D 0 [ 156.131824] Oops: 0000 [#1] PREEMPT SMP PTI [ 156.131832] CPU: 6 PID: 1404 Comm: sdma0 Tainted: G OE 5.16.0-rc1-custom #1 [ 156.131842] Hardware name: Gigabyte Technology Co., Ltd. Z170XP-SLI/Z170XP-SLI-CF, BIOS F20 11/04/2016 [ 156.131848] RIP: 0010:strlen+0x0/0x20 [ 156.131859] Code: 89 c0 c3 0f 1f 80 00 00 00 00 48 01 fe eb 0f 0f b6 07 38 d0 74 10 48 83 c7 01 84 c0 74 05 48 39 f7 75 ec 31 c0 c3 48 89 f8 c3 <80> 3f 00 74 10 48 89 f8 48 83 c0 01 80 38 00 75 f7 48 29 f8 c3 31 [ 156.131872] RSP: 0018:ffff9bd0018dbcf8 EFLAGS: 00010206 [ 156.131880] RAX: 00000000000002a0 RBX: ffff8d0305ef01b0 RCX: 000000000000000b [ 156.131888] RDX: ffff8d03772ab924 RSI: ffff8d0305ef01b0 RDI: 00000000000002a0 [ 156.131895] RBP: ffff9bd0018dbd60 R08: ffff8d03002094d0 R09: 0000000000000000 [ 156.131901] R10: 000000000000005e R11: 0000000000000065 R12: ffff8d03002094d0 [ 156.131907] R13: 000000000000001f R14: 0000000000070018 R15: 0000000000000007 [ 156.131914] FS: 0000000000000000(0000) GS:ffff8d062ed80000(0000) knlGS:0000000000000000 [ 156.131923] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 156.131929] CR2: 00000000000002a0 CR3: 000000001120a005 CR4: 00000000003706e0 [ 156.131937] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 156.131942] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 156.131949] Call Trace: [ 156.131953] [ 156.131957] ? trace_event_raw_event_dma_fence+0xcc/0x200 [ 156.131973] ? ring_buffer_unlock_commit+0x23/0x130 [ 156.131982] dma_fence_init+0x92/0xb0 [ 156.131993] amdgpu_fence_emit+0x10d/0x2b0 [amdgpu] [ 156.132302] amdgpu_ib_schedule+0x2f9/0x580 [amdgpu] [ 156.132586] amdgpu_job_run+0xed/0x220 [amdgpu] v2: fix mismatch warning between the prototype and function name (Ray, kernel test robot) Signed-off-by: Huang Rui Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a24b4c374188..bd4232691697 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4505,7 +4505,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, struct amdgpu_reset_context *reset_context) { - int i, j, r = 0; + int i, r = 0; struct amdgpu_job *job = NULL; bool need_full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); @@ -4527,15 +4527,8 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, /*clear job fence from fence drv to avoid force_completion *leave NULL and vm flush fence in fence drv */ - for (j = 0; j <= ring->fence_drv.num_fences_mask; j++) { - struct dma_fence *old, **ptr; + amdgpu_fence_driver_clear_job_fences(ring); - ptr = &ring->fence_drv.fences[j]; - old = rcu_dereference_protected(*ptr, 1); - if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) { - RCU_INIT_POINTER(*ptr, NULL); - } - } /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); } -- cgit v1.2.3-70-g09d2 From 892deb48269c65376f3eeb5b4c032ff2c2979bd7 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Thu, 16 Dec 2021 17:01:45 +0000 Subject: drm/amdgpu: Separate vf2pf work item init from virt data exchange We want to be able to call virt data exchange conditionally after gmc sw init to reserve bad pages as early as possible. Since this is a conditional call, we will need to call it again unconditionally later in the init sequence. Refactor the data exchange function so it can be called multiple times without re-initializing the work item. v2: Cleaned up the code. Kept the original call to init_exchange_data() inside early init to initialize the work item, afterwards call exchange_data() when needed. Signed-off-by: Victor Skvortsov Reviewed By: Shaoyun.liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++++- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 36 ++++++++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 + 3 files changed, 30 insertions(+), 13 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index bd4232691697..0cd2404ca63c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2317,6 +2317,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) /* need to do gmc hw init early so we can allocate gpu mem */ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { + /* Try to reserve bad pages early */ + if (amdgpu_sriov_vf(adev)) + amdgpu_virt_exchange_data(adev); + r = amdgpu_device_vram_scratch_init(adev); if (r) { DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); @@ -2348,7 +2352,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) } if (amdgpu_sriov_vf(adev)) - amdgpu_virt_init_data_exchange(adev); + amdgpu_virt_exchange_data(adev); r = amdgpu_ib_pool_init(adev); if (r) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 3fc49823f527..f8e574cc0e22 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -622,17 +622,35 @@ void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev) void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev) { - uint64_t bp_block_offset = 0; - uint32_t bp_block_size = 0; - struct amd_sriov_msg_pf2vf_info *pf2vf_v2 = NULL; - adev->virt.fw_reserve.p_pf2vf = NULL; adev->virt.fw_reserve.p_vf2pf = NULL; adev->virt.vf2pf_update_interval_ms = 0; - if (adev->mman.fw_vram_usage_va != NULL) { + if (adev->bios != NULL) { adev->virt.vf2pf_update_interval_ms = 2000; + adev->virt.fw_reserve.p_pf2vf = + (struct amd_sriov_msg_pf2vf_info_header *) + (adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB << 10)); + + amdgpu_virt_read_pf2vf_data(adev); + } + + if (adev->virt.vf2pf_update_interval_ms != 0) { + INIT_DELAYED_WORK(&adev->virt.vf2pf_work, amdgpu_virt_update_vf2pf_work_item); + schedule_delayed_work(&(adev->virt.vf2pf_work), msecs_to_jiffies(adev->virt.vf2pf_update_interval_ms)); + } +} + + +void amdgpu_virt_exchange_data(struct amdgpu_device *adev) +{ + uint64_t bp_block_offset = 0; + uint32_t bp_block_size = 0; + struct amd_sriov_msg_pf2vf_info *pf2vf_v2 = NULL; + + if (adev->mman.fw_vram_usage_va != NULL) { + adev->virt.fw_reserve.p_pf2vf = (struct amd_sriov_msg_pf2vf_info_header *) (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB << 10)); @@ -663,16 +681,10 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev) (adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB << 10)); amdgpu_virt_read_pf2vf_data(adev); - - return; - } - - if (adev->virt.vf2pf_update_interval_ms != 0) { - INIT_DELAYED_WORK(&adev->virt.vf2pf_work, amdgpu_virt_update_vf2pf_work_item); - schedule_delayed_work(&(adev->virt.vf2pf_work), adev->virt.vf2pf_update_interval_ms); } } + void amdgpu_detect_virtualization(struct amdgpu_device *adev) { uint32_t reg; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index 8d4c20bb71c5..9adfb8d63280 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -308,6 +308,7 @@ int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev); void amdgpu_virt_free_mm_table(struct amdgpu_device *adev); void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev); void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev); +void amdgpu_virt_exchange_data(struct amdgpu_device *adev); void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev); void amdgpu_detect_virtualization(struct amdgpu_device *adev); -- cgit v1.2.3-70-g09d2 From 4a0165f0603f333c6b36a420b4e348b67ddf6fc8 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Wed, 15 Dec 2021 18:42:44 +0000 Subject: drm/amdgpu: get xgmi info before ip_init Driver needs to call get_xgmi_info() before ip_init to determine whether it needs to handle a pending hive reset. Signed-off-by: Victor Skvortsov Reviewed-by: David Nieto Reviewed by: shaoyun.liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++++++ drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 ------ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 6 ------ 3 files changed, 7 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0cd2404ca63c..ce820ee2e2e6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3577,6 +3577,13 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (r) return r; + /* Need to get xgmi info early to decide the reset behavior*/ + if (adev->gmc.xgmi.supported) { + r = adev->gfxhub.funcs->get_xgmi_info(adev); + if (r) + return r; + } + /* enable PCIE atomic ops */ if (amdgpu_sriov_vf(adev)) adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index ae46eb35b3d7..3d5d47a799e3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -914,12 +914,6 @@ static int gmc_v10_0_sw_init(void *handle) return r; } - if (adev->gmc.xgmi.supported) { - r = adev->gfxhub.funcs->get_xgmi_info(adev); - if (r) - return r; - } - r = gmc_v10_0_mc_init(adev); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 2b86c63b032a..57f2729a7bd0 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1628,12 +1628,6 @@ static int gmc_v9_0_sw_init(void *handle) } adev->need_swiotlb = drm_need_swiotlb(44); - if (adev->gmc.xgmi.supported) { - r = adev->gfxhub.funcs->get_xgmi_info(adev); - if (r) - return r; - } - r = gmc_v9_0_mc_init(adev); if (r) return r; -- cgit v1.2.3-70-g09d2 From 4da8b63944a4f4482303c9ad6efb18aa547d4630 Mon Sep 17 00:00:00 2001 From: sashank saye Date: Fri, 17 Dec 2021 07:50:09 -0500 Subject: drm/amdgpu: Send Message to SMU on aldebaran passthrough for sbr handling For Aldebaran chip passthrough case we need to intimate SMU about special handling for SBR.On older chips we send LightSBR to SMU, enabling the same for Aldebaran. Slight difference, compared to previous chips, is on Aldebaran, SMU would do a heavy reset on SBR. Hence, the word Heavy instead of Light SBR is used for SMU to differentiate. Reviewed by: Shaoyun.liu Signed-off-by: sashank saye Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 ++++----- drivers/gpu/drm/amd/pm/inc/aldebaran_ppsmc.h | 4 +++- drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h | 6 +++--- drivers/gpu/drm/amd/pm/inc/smu_types.h | 3 ++- drivers/gpu/drm/amd/pm/inc/smu_v11_0.h | 2 +- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 6 +++--- drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c | 2 +- drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 2 +- drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 10 ++++++++++ 9 files changed, 28 insertions(+), 16 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ce820ee2e2e6..5250c5bbbd76 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2619,11 +2619,10 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) if (r) DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); - /* For XGMI + passthrough configuration on arcturus, enable light SBR */ - if (adev->asic_type == CHIP_ARCTURUS && - amdgpu_passthrough(adev) && - adev->gmc.xgmi.num_physical_nodes > 1) - smu_set_light_sbr(&adev->smu, true); + /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ + if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1)|| + adev->asic_type == CHIP_ALDEBARAN )) + smu_handle_passthrough_sbr(&adev->smu, true); if (adev->gmc.xgmi.num_physical_nodes > 1) { mutex_lock(&mgpu_info.mutex); diff --git a/drivers/gpu/drm/amd/pm/inc/aldebaran_ppsmc.h b/drivers/gpu/drm/amd/pm/inc/aldebaran_ppsmc.h index 35fa0d8e92dd..ab66a4b9e438 100644 --- a/drivers/gpu/drm/amd/pm/inc/aldebaran_ppsmc.h +++ b/drivers/gpu/drm/amd/pm/inc/aldebaran_ppsmc.h @@ -102,7 +102,9 @@ #define PPSMC_MSG_GfxDriverResetRecovery 0x42 #define PPSMC_MSG_BoardPowerCalibration 0x43 -#define PPSMC_Message_Count 0x44 +#define PPSMC_MSG_HeavySBR 0x45 +#define PPSMC_Message_Count 0x46 + //PPSMC Reset Types #define PPSMC_RESET_TYPE_WARM_RESET 0x00 diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h index 2b9b9a7ba97a..ba7565bc8104 100644 --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h @@ -1257,9 +1257,9 @@ struct pptable_funcs { int (*set_fine_grain_gfx_freq_parameters)(struct smu_context *smu); /** - * @set_light_sbr: Set light sbr mode for the SMU. + * @smu_handle_passthrough_sbr: Send message to SMU about special handling for SBR. */ - int (*set_light_sbr)(struct smu_context *smu, bool enable); + int (*smu_handle_passthrough_sbr)(struct smu_context *smu, bool enable); /** * @wait_for_event: Wait for events from SMU. @@ -1415,7 +1415,7 @@ int smu_allow_xgmi_power_down(struct smu_context *smu, bool en); int smu_get_status_gfxoff(struct amdgpu_device *adev, uint32_t *value); -int smu_set_light_sbr(struct smu_context *smu, bool enable); +int smu_handle_passthrough_sbr(struct smu_context *smu, bool enable); int smu_wait_for_event(struct amdgpu_device *adev, enum smu_event_type event, uint64_t event_arg); diff --git a/drivers/gpu/drm/amd/pm/inc/smu_types.h b/drivers/gpu/drm/amd/pm/inc/smu_types.h index 18b862a90fbe..ff8a0bcbd290 100644 --- a/drivers/gpu/drm/amd/pm/inc/smu_types.h +++ b/drivers/gpu/drm/amd/pm/inc/smu_types.h @@ -229,7 +229,8 @@ __SMU_DUMMY_MAP(BoardPowerCalibration), \ __SMU_DUMMY_MAP(RequestGfxclk), \ __SMU_DUMMY_MAP(ForceGfxVid), \ - __SMU_DUMMY_MAP(UnforceGfxVid), + __SMU_DUMMY_MAP(UnforceGfxVid), \ + __SMU_DUMMY_MAP(HeavySBR), #undef __SMU_DUMMY_MAP #define __SMU_DUMMY_MAP(type) SMU_MSG_##type diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h b/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h index 2d422e6a9feb..acb3be292096 100644 --- a/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h +++ b/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h @@ -312,7 +312,7 @@ int smu_v11_0_deep_sleep_control(struct smu_context *smu, void smu_v11_0_interrupt_work(struct smu_context *smu); -int smu_v11_0_set_light_sbr(struct smu_context *smu, bool enable); +int smu_v11_0_handle_passthrough_sbr(struct smu_context *smu, bool enable); int smu_v11_0_restore_user_od_settings(struct smu_context *smu); diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index af98fa140d83..76f95e8ada4c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -3058,13 +3058,13 @@ static int smu_gfx_state_change_set(void *handle, return ret; } -int smu_set_light_sbr(struct smu_context *smu, bool enable) +int smu_handle_passthrough_sbr(struct smu_context *smu, bool enable) { int ret = 0; mutex_lock(&smu->mutex); - if (smu->ppt_funcs->set_light_sbr) - ret = smu->ppt_funcs->set_light_sbr(smu, enable); + if (smu->ppt_funcs->smu_handle_passthrough_sbr) + ret = smu->ppt_funcs->smu_handle_passthrough_sbr(smu, enable); mutex_unlock(&smu->mutex); return ret; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c index 58bc387fb279..505d2fb94fd9 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c @@ -2472,7 +2472,7 @@ static const struct pptable_funcs arcturus_ppt_funcs = { .deep_sleep_control = smu_v11_0_deep_sleep_control, .get_fan_parameters = arcturus_get_fan_parameters, .interrupt_work = smu_v11_0_interrupt_work, - .set_light_sbr = smu_v11_0_set_light_sbr, + .smu_handle_passthrough_sbr = smu_v11_0_handle_passthrough_sbr, .set_mp1_state = smu_cmn_set_mp1_state, }; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c index 28b7c0562b99..4e9e2cf39859 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c @@ -1724,7 +1724,7 @@ int smu_v11_0_mode1_reset(struct smu_context *smu) return ret; } -int smu_v11_0_set_light_sbr(struct smu_context *smu, bool enable) +int smu_v11_0_handle_passthrough_sbr(struct smu_context *smu, bool enable) { int ret = 0; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c index 7433a051e795..0e60d63ba94f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c @@ -141,6 +141,7 @@ static const struct cmn2asic_msg_mapping aldebaran_message_map[SMU_MSG_MAX_COUNT MSG_MAP(SetUclkDpmMode, PPSMC_MSG_SetUclkDpmMode, 0), MSG_MAP(GfxDriverResetRecovery, PPSMC_MSG_GfxDriverResetRecovery, 0), MSG_MAP(BoardPowerCalibration, PPSMC_MSG_BoardPowerCalibration, 0), + MSG_MAP(HeavySBR, PPSMC_MSG_HeavySBR, 0), }; static const struct cmn2asic_mapping aldebaran_clk_map[SMU_CLK_COUNT] = { @@ -1912,6 +1913,14 @@ out: return ret; } +static int aldebaran_smu_handle_passthrough_sbr(struct smu_context *smu, bool enable) +{ + int ret = 0; + ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_HeavySBR, enable ? 1 : 0, NULL); + + return ret; +} + static bool aldebaran_is_mode1_reset_supported(struct smu_context *smu) { #if 0 @@ -2021,6 +2030,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = { .get_gpu_metrics = aldebaran_get_gpu_metrics, .mode1_reset_is_support = aldebaran_is_mode1_reset_supported, .mode2_reset_is_support = aldebaran_is_mode2_reset_supported, + .smu_handle_passthrough_sbr = aldebaran_smu_handle_passthrough_sbr, .mode1_reset = aldebaran_mode1_reset, .set_mp1_state = aldebaran_set_mp1_state, .mode2_reset = aldebaran_mode2_reset, -- cgit v1.2.3-70-g09d2 From 87172e89dcc7b09b32a4eb5f21e35d310e3cb024 Mon Sep 17 00:00:00 2001 From: Leslie Shi Date: Thu, 16 Dec 2021 14:03:41 +0800 Subject: drm/amdgpu: Call amdgpu_device_unmap_mmio() if device is unplugged to prevent crash in GPU initialization failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Why] In amdgpu_driver_load_kms, when amdgpu_device_init returns error during driver modprobe, it will start the error handle path immediately and call into amdgpu_device_unmap_mmio as well to release mapped VRAM. However, in the following release callback, driver stills visits the unmapped memory like vcn.inst[i].fw_shared_cpu_addr in vcn_v3_0_sw_fini. So a kernel crash occurs. [How] call amdgpu_device_unmap_mmio() if device is unplugged to prevent invalid memory address in vcn_v3_0_sw_fini() when GPU initialization failure. Signed-off-by: Leslie Shi Reviewed-by: Andrey Grodzovsky Reviewed-by: Guchun Chen Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5250c5bbbd76..43e9149e129a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3899,7 +3899,9 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) amdgpu_gart_dummy_page_fini(adev); - amdgpu_device_unmap_mmio(adev); + if (drm_dev_is_unplugged(adev_to_drm(adev))) + amdgpu_device_unmap_mmio(adev); + } void amdgpu_device_fini_sw(struct amdgpu_device *adev) -- cgit v1.2.3-70-g09d2 From b6fd6e0f5eb8c6d10575d08a2c6df8ed83877e07 Mon Sep 17 00:00:00 2001 From: Surbhi Kakarya Date: Fri, 17 Dec 2021 12:01:09 -0500 Subject: drm/amdgpu: Check the memory can be accesssed by ttm_device_clear_dma_mappings. If the event guard is enabled and VF doesn't receive an ack from PF for full access, the guest driver load crashes. This is caused due to the call to ttm_device_clear_dma_mappings with non-initialized mman during driver tear down. This patch adds the necessary condition to check if the mman initialization passed or not and takes the path based on the condition output. Signed-off-by: Surbhi Kakarya Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 43e9149e129a..dadfa03570d1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3895,7 +3895,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) amdgpu_irq_fini_hw(adev); - ttm_device_clear_dma_mappings(&adev->mman.bdev); + if (adev->mman.initialized) + ttm_device_clear_dma_mappings(&adev->mman.bdev); amdgpu_gart_dummy_page_fini(adev); -- cgit v1.2.3-70-g09d2 From 0637d41786a3a9551f33ad8e15bdb40416362028 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 23 Dec 2021 14:13:02 -0500 Subject: drm/amdgpu: no DC support for headless chips Chips with no display hardware should return false for DC support. v2: drop Arcturus and Aldebaran Fixes: f7f12b25823c0d ("drm/amdgpu: default to true in amdgpu_device_asic_has_dc_support") Reviewed-by: Evan Quan Reviewed-by: Guchun Chen Reported-by: Tareque Md.Hanif Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index dadfa03570d1..c2a16b20de9a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3185,6 +3185,12 @@ static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) { switch (asic_type) { +#ifdef CONFIG_DRM_AMDGPU_SI + case CHIP_HAINAN: +#endif + case CHIP_TOPAZ: + /* chips with no display hardware */ + return false; #if defined(CONFIG_DRM_AMD_DC) case CHIP_TAHITI: case CHIP_PITCAIRN: -- cgit v1.2.3-70-g09d2