Merge tag 'drm-next-2021-02-19' of git://anongit.freedesktop.org/drm/drm

Pull drm updates from Dave Airlie: "A pretty normal tree, lots of refactoring across the board, ttm, i915, nouveau, and bunch of features in various drivers. docs: - lots of updated docs core: - require crtc to have unique primary plane - fourcc macro fix - PCI bar quirk for bar resizing - don't sent hotplug on error - move vm code to legacy - nuke hose only used on old oboslete alpha dma-buf: - kernel doc updates - improved lock tracking dp/hdmi: - DP-HDMI2.1 protocol converter support ttm: - bo size handling cleanup - release a pinned bo warning - cleanup lru handler - avoid using pages with drm_prime_sg_to_page_addr_arrays cma-helper: - prime/mmap fixes bridge: - add DP support gma500: - remove gma3600 support i915: - try eDP fast/narrow link again with fallback - Intel eDP backlight control - replace display register read/write macros - refactor intel_display.c - display power improvements - HPD code cleanup - Rocketlake display fixes - Power/backlight/RPM fixes - DG1 display fix - IVB/BYT clear residuals security fix again - make i915 mitigations options via parameter - HSW GT1 GPU hangs fixes - DG1 workaround hang fixes - TGL DMAR hang avoidance - Lots of GT fixes - follow on fixes for residuals clear - gen7 per-engine-reset support - HDCP2.2 + HDCP1.4 GEN12 DP MST support - TGL clear color support - backlight refactoring - VRR/Adaptive sync enabling on DP/EDP for TGL+ - async flips for all ilk+ amdgpu: - rework IH ring handling (Vega/Navi) - rework HDP handling (Vega/Navi) - swSMU updates for renoir/vangogh - Sienna Cichild overdrive support - FP16 on DCE8-11 support - GPU reset on navy flounder/vangogh - SMU profile fixes for APU - SR-IOV fixes - Vangogh SMU fixes - fan speed control fixes amdkfd: - config handling fix - buffer free fix - recursive lock warnings fix nouveau: - Turing MMU fault recovery fixes - mDP connectors reporting fix - audio locking fixes - rework engines/instances code to support new scheme tegra: - VIC newer firmware support - display/gr2d fixes for older tegra - pm reference leak fix mediatek: - SOC MT8183 support - decouple sub driver + share mtk mutex driver radeon: - PCI resource fix for some platforms ingenic: - pm support - 8-bit delta RGB panels vmwgfx: - managed driver helpers vc4: - BCM2711 DSI1 support - converted to atomic helpers - enable 10/12 bpc outputs - gem prime mmap helpers - CEC fix omap: - use degamma table - CTM support - rework DSI support imx: - stack usage fixes - drm managed support - imx-tve clock provider leak fix - rcar-du: - default mode fixes - conversion to managed API hisilicon: - use simple encoder vkms: - writeback connector support d3: - BT2020 support" * tag 'drm-next-2021-02-19' of git://anongit.freedesktop.org/drm/drm: (1459 commits) drm/amdgpu: Set reference clock to 100Mhz on Renoir (v2) drm/radeon: OLAND boards don't have VCE drm/amdkfd: Fix recursive lock warnings drm/amd/display: Add FPU wrappers to dcn21_validate_bandwidth() drm/amd/display: Fix potential integer overflow drm/amdgpu/display: remove hdcp_srm sysfs on device removal drm/amdgpu: fix CGTS_TCC_DISABLE register offset on gfx10.3 drm/i915/gt: Correct surface base address for renderclear drm/i915: Disallow plane x+w>stride on ilk+ with X-tiling drm/nouveau/top/ga100: initial support drm/nouveau/top: add ioctrl/nvjpg drm/nouveau/privring: rename from ibus drm/nouveau/nvkm: remove nvkm_subdev.index drm/nouveau/nvkm: determine subdev id/order from layout drm/nouveau/vic: switch to instanced constructor drm/nouveau/sw: switch to instanced constructor drm/nouveau/sec2: switch to instanced constructor drm/nouveau/sec: switch to instanced constructor drm/nouveau/pm: switch to instanced constructor drm/nouveau/nvenc: switch to instanced constructor ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2021-02-21 14:44:44 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2021-02-21 14:44:44 -0800
commit: d99676af540c2dc829999928fb81c58c80a1dce4 (patch)
tree: a78602eb6fa5d46d867c00ee187179ced6c18766 /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent: 10e2ec8edece2566b40f69bae035a555ece71ab4 (diff)
parent: f730f39eb981af249d57336b47cfe3925632a7fd (diff)
1 files changed, 111 insertions, 33 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index cab1ebaf6d62..7052dc35d278 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -929,6 +929,18 @@ void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 }
 
+/**
+ * amdgpu_device_pci_reset - reset the GPU using generic PCI means
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
+ */
+int amdgpu_device_pci_reset(struct amdgpu_device *adev)
+{
+	return pci_reset_function(adev->pdev);
+}
+
 /*
  * GPU doorbell aperture helpers function.
  */
@@ -1105,8 +1117,7 @@ void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
  */
 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
 {
-	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
-	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
+	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
 	struct pci_bus *root;
 	struct resource *res;
 	unsigned i;
@@ -1137,6 +1148,10 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
 	if (!res)
 		return 0;
 
+	/* Limit the BAR size to what is available */
+	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
+			rbar_size);
+
 	/* Disable memory decoding while we change the BAR addresses and size */
 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
 	pci_write_config_word(adev->pdev, PCI_COMMAND,
@@ -1422,24 +1437,22 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
 		/* don't suspend or resume card normally */
 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
 
-		pci_set_power_state(dev->pdev, PCI_D0);
-		amdgpu_device_load_pci_state(dev->pdev);
-		r = pci_enable_device(dev->pdev);
+		pci_set_power_state(pdev, PCI_D0);
+		amdgpu_device_load_pci_state(pdev);
+		r = pci_enable_device(pdev);
 		if (r)
 			DRM_WARN("pci_enable_device failed (%d)\n", r);
 		amdgpu_device_resume(dev, true);
 
 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
-		drm_kms_helper_poll_enable(dev);
 	} else {
 		pr_info("switched off\n");
-		drm_kms_helper_poll_disable(dev);
 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
 		amdgpu_device_suspend(dev, true);
-		amdgpu_device_cache_pci_state(dev->pdev);
+		amdgpu_device_cache_pci_state(pdev);
 		/* Shut down the device */
-		pci_disable_device(dev->pdev);
-		pci_set_power_state(dev->pdev, PCI_D3cold);
+		pci_disable_device(pdev);
+		pci_set_power_state(pdev, PCI_D3cold);
 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
 	}
 }
@@ -1702,8 +1715,7 @@ static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
 	adev->enable_virtual_display = false;
 
 	if (amdgpu_virtual_display) {
-		struct drm_device *ddev = adev_to_drm(adev);
-		const char *pci_address_name = pci_name(ddev->pdev);
+		const char *pci_address_name = pci_name(adev->pdev);
 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
 
 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
@@ -3116,7 +3128,10 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
 	 */
 	adev->gfx_timeout = msecs_to_jiffies(10000);
 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
-	if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
+	if (amdgpu_sriov_vf(adev))
+		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
+					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
+	else if (amdgpu_passthrough(adev))
 		adev->compute_timeout =  msecs_to_jiffies(60000);
 	else
 		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
@@ -3396,7 +3411,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 		}
 	}
 
-	pci_enable_pcie_error_reporting(adev->ddev.pdev);
+	pci_enable_pcie_error_reporting(adev->pdev);
 
 	/* Post card if necessary */
 	if (amdgpu_device_need_post(adev)) {
@@ -3719,7 +3734,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 
 	r = amdgpu_device_ip_suspend_phase1(adev);
 
-	amdgpu_amdkfd_suspend(adev, !fbcon);
+	amdgpu_amdkfd_suspend(adev, adev->in_runpm);
 
 	/* evict vram memory */
 	amdgpu_bo_evict_vram(adev);
@@ -3803,7 +3818,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
 			}
 		}
 	}
-	r = amdgpu_amdkfd_resume(adev, !fbcon);
+	r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
 	if (r)
 		return r;
 
@@ -4154,8 +4169,8 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
 			continue;
 
 		spin_lock(&ring->sched.job_list_lock);
-		job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
-				struct drm_sched_job, node);
+		job = list_first_entry_or_null(&ring->sched.pending_list,
+					       struct drm_sched_job, list);
 		spin_unlock(&ring->sched.job_list_lock);
 		if (job)
 			return true;
@@ -4205,6 +4220,8 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
 		case CHIP_NAVI14:
 		case CHIP_NAVI12:
 		case CHIP_SIENNA_CICHLID:
+		case CHIP_NAVY_FLOUNDER:
+		case CHIP_DIMGREY_CAVEFISH:
 			break;
 		default:
 			goto disabled;
@@ -4454,6 +4471,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 	up_write(&adev->reset_sem);
 }
 
+/*
+ * to lockup a list of amdgpu devices in a hive safely, if not a hive
+ * with multiple nodes, it will be similar as amdgpu_device_lock_adev.
+ *
+ * unlock won't require roll back.
+ */
+static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
+{
+	struct amdgpu_device *tmp_adev = NULL;
+
+	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+		if (!hive) {
+			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
+			return -ENODEV;
+		}
+		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+			if (!amdgpu_device_lock_adev(tmp_adev, hive))
+				goto roll_back;
+		}
+	} else if (!amdgpu_device_lock_adev(adev, hive))
+		return -EAGAIN;
+
+	return 0;
+roll_back:
+	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
+		/*
+		 * if the lockup iteration break in the middle of a hive,
+		 * it may means there may has a race issue,
+		 * or a hive device locked up independently.
+		 * we may be in trouble and may not, so will try to roll back
+		 * the lock and give out a warnning.
+		 */
+		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
+		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+			amdgpu_device_unlock_adev(tmp_adev);
+		}
+	}
+	return -EAGAIN;
+}
+
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
 {
 	struct pci_dev *p = NULL;
@@ -4567,20 +4624,36 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
 				job ? job->base.id : -1, hive->hive_id);
 			amdgpu_put_xgmi_hive(hive);
+			if (job)
+				drm_sched_increase_karma(&job->base);
 			return 0;
 		}
 		mutex_lock(&hive->hive_lock);
 	}
 
 	/*
+	 * lock the device before we try to operate the linked list
+	 * if didn't get the device lock, don't touch the linked list since
+	 * others may iterating it.
+	 */
+	r = amdgpu_device_lock_hive_adev(adev, hive);
+	if (r) {
+		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
+					job ? job->base.id : -1);
+
+		/* even we skipped this reset, still need to set the job to guilty */
+		if (job)
+			drm_sched_increase_karma(&job->base);
+		goto skip_recovery;
+	}
+
+	/*
 	 * Build list of devices to reset.
 	 * In case we are in XGMI hive mode, resort the device list
 	 * to put adev in the 1st position.
 	 */
 	INIT_LIST_HEAD(&device_list);
 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
-		if (!hive)
-			return -ENODEV;
 		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
 			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
 		device_list_handle = &hive->device_list;
@@ -4591,13 +4664,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 	/* block all schedulers and reset given job's ring */
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
-			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
-				  job ? job->base.id : -1);
-			r = 0;
-			goto skip_recovery;
-		}
-
 		/*
 		 * Try to put the audio codec into suspend state
 		 * before gpu reset started.
@@ -4735,7 +4801,7 @@ skip_recovery:
 		amdgpu_put_xgmi_hive(hive);
 	}
 
-	if (r)
+	if (r && r != -EAGAIN)
 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
 	return r;
 }
@@ -4785,7 +4851,13 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
 		} else {
-			if (speed_cap == PCIE_SPEED_16_0GT)
+			if (speed_cap == PCIE_SPEED_32_0GT)
+				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
+							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
+							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
+							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
+							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
+			else if (speed_cap == PCIE_SPEED_16_0GT)
 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
@@ -4805,7 +4877,13 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
 		} else {
-			if (platform_speed_cap == PCIE_SPEED_16_0GT)
+			if (platform_speed_cap == PCIE_SPEED_32_0GT)
+				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
+							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
+							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
+							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
+							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
+			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
@@ -4949,8 +5027,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
 	case pci_channel_io_normal:
 		return PCI_ERS_RESULT_CAN_RECOVER;
 	/* Fatal error, prepare for slot reset */
-	case pci_channel_io_frozen:		
-		/*		
+	case pci_channel_io_frozen:
+		/*
 		 * Cancel and wait for all TDRs in progress if failing to
 		 * set  adev->in_gpu_reset in amdgpu_device_lock_adev
 		 *
@@ -5041,7 +5119,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
 		goto out;
 	}
 
-	adev->in_pci_err_recovery = true;	
+	adev->in_pci_err_recovery = true;
 	r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
 	adev->in_pci_err_recovery = false;
 	if (r)
author	Linus Torvalds <torvalds@linux-foundation.org>	2021-02-21 14:44:44 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2021-02-21 14:44:44 -0800
commit	d99676af540c2dc829999928fb81c58c80a1dce4 (patch)
tree	a78602eb6fa5d46d867c00ee187179ced6c18766 /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent	10e2ec8edece2566b40f69bae035a555ece71ab4 (diff)
parent	f730f39eb981af249d57336b47cfe3925632a7fd (diff)