From 3affaa5a7ca3ca114e01049bc45e3ed4a3955505 Mon Sep 17 00:00:00 2001
From: Brian Starkey <brian.starkey@arm.com>
Date: Mon, 3 Dec 2018 11:31:57 +0000
Subject: drm/afbc: Add AFBC modifier usage documentation

AFBC is a flexible, proprietary, lossless compression protocol and
format, with a number of defined DRM format modifiers. To facilitate
consistency and compatibility between different AFBC producers and
consumers, document the expectations for usage of the AFBC DRM format
modifiers in a new .rst chapter.

Signed-off-by: Brian Starkey <brian.starkey@arm.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
[Updated MAINTAINERS entry to show that "Mali DP Maintainers" is
 actually a mailing list and added an SPDX-License-Identifier to
 the documentation]
Signed-off-by: Liviu Dudau <liviu.dudau@arm.com>
---
 include/uapi/drm/drm_fourcc.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h
index 0b44260a5ee9..df014ede4e57 100644
--- a/include/uapi/drm/drm_fourcc.h
+++ b/include/uapi/drm/drm_fourcc.h
@@ -572,6 +572,9 @@ extern "C" {
  * AFBC has several features which may be supported and/or used, which are
  * represented using bits in the modifier. Not all combinations are valid,
  * and different devices or use-cases may support different combinations.
+ *
+ * Further information on the use of AFBC modifiers can be found in
+ * Documentation/gpu/afbc.rst
  */
 #define DRM_FORMAT_MOD_ARM_AFBC(__afbc_mode)	fourcc_mod_code(ARM, __afbc_mode)
 
-- 
cgit v1.3.1


From 03ca3cf8e9aa7549e6c398462af0f68bdd43e7fe Mon Sep 17 00:00:00 2001
From: Rodrigo Vivi <rodrigo.vivi@intel.com>
Date: Thu, 17 Jan 2019 21:59:43 -0800
Subject: drm/i915/icl: Adding few more device IDs for Ice Lake
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We just got aware that there was more IDs available
at spec, so let's add them already.

Cc: James Ausmus <james.ausmus@intel.com>
Cc: José Roberto de Souza <jose.souza@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190118055943.10252-1-rodrigo.vivi@intel.com
---
 include/drm/i915_pciids.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h
index 192667144693..df72be7e8b88 100644
--- a/include/drm/i915_pciids.h
+++ b/include/drm/i915_pciids.h
@@ -457,9 +457,13 @@
 	INTEL_VGA_DEVICE(0x8A51, info), \
 	INTEL_VGA_DEVICE(0x8A5C, info), \
 	INTEL_VGA_DEVICE(0x8A5D, info), \
+	INTEL_VGA_DEVICE(0x8A59, info),	\
+	INTEL_VGA_DEVICE(0x8A58, info),	\
 	INTEL_VGA_DEVICE(0x8A52, info), \
 	INTEL_VGA_DEVICE(0x8A5A, info), \
 	INTEL_VGA_DEVICE(0x8A5B, info), \
+	INTEL_VGA_DEVICE(0x8A57, info), \
+	INTEL_VGA_DEVICE(0x8A56, info), \
 	INTEL_VGA_DEVICE(0x8A71, info), \
 	INTEL_VGA_DEVICE(0x8A70, info)
 
-- 
cgit v1.3.1


From 3c8861d84a4d2c6cd7221d18e49bf9201c6c6115 Mon Sep 17 00:00:00 2001
From: Matt Roper <matthew.d.roper@intel.com>
Date: Mon, 17 Dec 2018 14:44:14 -0800
Subject: drm: Add color management LUT validation helper (v4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some hardware may place additional restrictions on the gamma/degamma
curves described by our LUT properties.  E.g., that a gamma curve never
decreases or that the red/green/blue channels of a LUT's entries must be
equal.  Let's add a helper function that drivers can use to test that a
userspace-provided LUT is valid and doesn't violate hardware
requirements.

v2:
 - Combine into a single helper that just takes a bitmask of the tests
   to apply.  (Brian Starkey)
 - Add additional check (always performed) that LUT property blob size
   is always a multiple of the LUT entry size.  (stolen from ARM driver)

v3:
 - Drop the LUT size check again since
   drm_atomic_replace_property_blob_from_id() already covers this for
   us.  (Alexandru Gheorghe)

v4:
 - Use an enum to describe possible test values rather than #define's;
   this is cleaner to provide kerneldoc for.  (Daniel Vetter)
 - s/DRM_COLOR_LUT_INCREASING/DRM_COLOR_LUT_NON_DECREASING/.  (Ville)

Cc: Uma Shankar <uma.shankar@intel.com>
Cc: Swati Sharma <swati2.sharma@intel.com>
Cc: Brian Starkey <Brian.Starkey@arm.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Matt Roper <matthew.d.roper@intel.com>
Reviewed-by: Brian Starkey <brian.starkey@arm.com>
Reviewed-by: Alexandru Gheorghe <alexandru-cosmin.gheorghe@arm.com>
Reviewed-by: Uma Shankar <uma.shankar@intel.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20181217224415.12848-1-matthew.d.roper@intel.com
---
 drivers/gpu/drm/drm_color_mgmt.c | 44 ++++++++++++++++++++++++++++++++++++++++
 include/drm/drm_color_mgmt.h     | 29 ++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_color_mgmt.c b/drivers/gpu/drm/drm_color_mgmt.c
index 07dcf47daafe..968ca7c91ad8 100644
--- a/drivers/gpu/drm/drm_color_mgmt.c
+++ b/drivers/gpu/drm/drm_color_mgmt.c
@@ -462,3 +462,47 @@ int drm_plane_create_color_properties(struct drm_plane *plane,
 	return 0;
 }
 EXPORT_SYMBOL(drm_plane_create_color_properties);
+
+/**
+ * drm_color_lut_check - check validity of lookup table
+ * @lut: property blob containing LUT to check
+ * @tests: bitmask of tests to run
+ *
+ * Helper to check whether a userspace-provided lookup table is valid and
+ * satisfies hardware requirements.  Drivers pass a bitmask indicating which of
+ * the tests in &drm_color_lut_tests should be performed.
+ *
+ * Returns 0 on success, -EINVAL on failure.
+ */
+int drm_color_lut_check(struct drm_property_blob *lut,
+			uint32_t tests)
+{
+	struct drm_color_lut *entry;
+	int i;
+
+	if (!lut || !tests)
+		return 0;
+
+	entry = lut->data;
+	for (i = 0; i < drm_color_lut_size(lut); i++) {
+		if (tests & DRM_COLOR_LUT_EQUAL_CHANNELS) {
+			if (entry[i].red != entry[i].blue ||
+			    entry[i].red != entry[i].green) {
+				DRM_DEBUG_KMS("All LUT entries must have equal r/g/b\n");
+				return -EINVAL;
+			}
+		}
+
+		if (i > 0 && tests & DRM_COLOR_LUT_NON_DECREASING) {
+			if (entry[i].red < entry[i - 1].red ||
+			    entry[i].green < entry[i - 1].green ||
+			    entry[i].blue < entry[i - 1].blue) {
+				DRM_DEBUG_KMS("LUT entries must never decrease.\n");
+				return -EINVAL;
+			}
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_color_lut_check);
diff --git a/include/drm/drm_color_mgmt.h b/include/drm/drm_color_mgmt.h
index 90ef9996d9a4..6affbda6d9cb 100644
--- a/include/drm/drm_color_mgmt.h
+++ b/include/drm/drm_color_mgmt.h
@@ -69,4 +69,33 @@ int drm_plane_create_color_properties(struct drm_plane *plane,
 				      u32 supported_ranges,
 				      enum drm_color_encoding default_encoding,
 				      enum drm_color_range default_range);
+
+/**
+ * enum drm_color_lut_tests - hw-specific LUT tests to perform
+ *
+ * The drm_color_lut_check() function takes a bitmask of the values here to
+ * determine which tests to apply to a userspace-provided LUT.
+ */
+enum drm_color_lut_tests {
+	/**
+	 * @DRM_COLOR_LUT_EQUAL_CHANNELS:
+	 *
+	 * Checks whether the entries of a LUT all have equal values for the
+	 * red, green, and blue channels.  Intended for hardware that only
+	 * accepts a single value per LUT entry and assumes that value applies
+	 * to all three color components.
+	 */
+	DRM_COLOR_LUT_EQUAL_CHANNELS = BIT(0),
+
+	/**
+	 * @DRM_COLOR_LUT_NON_DECREASING:
+	 *
+	 * Checks whether the entries of a LUT are always flat or increasing
+	 * (never decreasing).
+	 */
+	DRM_COLOR_LUT_NON_DECREASING = BIT(1),
+};
+
+int drm_color_lut_check(struct drm_property_blob *lut,
+			uint32_t tests);
 #endif
-- 
cgit v1.3.1


From ae6d343541bb75958e9535d056adaf4ff6a66d6a Mon Sep 17 00:00:00 2001
From: Chunming Zhou <david1.zhou@amd.com>
Date: Thu, 10 Jan 2019 17:56:39 +0800
Subject: drm/ttm: add lru notify to bo driver v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

allow driver do somethings when lru changed.
v2:
address Michel's comments.

Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c    | 11 +++++++----
 include/drm/ttm/ttm_bo_driver.h |  9 +++++++++
 2 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 0ec08394e17a..de088c8070fb 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -198,19 +198,22 @@ static void ttm_bo_ref_bug(struct kref *list_kref)
 
 void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
 {
+	struct ttm_bo_device *bdev = bo->bdev;
+	bool notify = false;
+
 	if (!list_empty(&bo->swap)) {
 		list_del_init(&bo->swap);
 		kref_put(&bo->list_kref, ttm_bo_ref_bug);
+		notify = true;
 	}
 	if (!list_empty(&bo->lru)) {
 		list_del_init(&bo->lru);
 		kref_put(&bo->list_kref, ttm_bo_ref_bug);
+		notify = true;
 	}
 
-	/*
-	 * TODO: Add a driver hook to delete from
-	 * driver-specific LRU's here.
-	 */
+	if (notify && bdev->driver->del_from_lru_notify)
+		bdev->driver->del_from_lru_notify(bo);
 }
 
 void ttm_bo_del_sub_from_lru(struct ttm_buffer_object *bo)
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index 1021106438b2..15829b24277c 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -381,6 +381,15 @@ struct ttm_bo_driver {
 	 */
 	int (*access_memory)(struct ttm_buffer_object *bo, unsigned long offset,
 			     void *buf, int len, int write);
+
+	/**
+	 * struct ttm_bo_driver member del_from_lru_notify
+	 *
+	 * @bo: the buffer object deleted from lru
+	 *
+	 * notify driver that a BO was deleted from LRU.
+	 */
+	void (*del_from_lru_notify)(struct ttm_buffer_object *bo);
 };
 
 /**
-- 
cgit v1.3.1


From 222b5f044159877504dbac9bc1910f89a74136e2 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Date: Tue, 4 Dec 2018 16:56:14 -0500
Subject: drm/sched: Refactor ring mirror list handling.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Decauple sched threads stop and start and ring mirror
list handling from the policy of what to do about the
guilty jobs.
When stoppping the sched thread and detaching sched fences
from non signaled HW fenes wait for all signaled HW fences
to complete before rerunning the jobs.

v2: Fix resubmission of guilty job into HW after refactoring.

v4:
Full restart for all the jobs, not only from guilty ring.
Extract karma increase into standalone function.

v5:
Rework waiting for signaled jobs without relying on the job
struct itself as those might already be freed for non 'guilty'
job's schedulers.
Expose karma increase to drivers.

v6:
Use list_for_each_entry_safe_continue and drm_sched_process_job
in case fence already signaled.
Call drm_sched_increase_karma only once for amdgpu and add documentation.

v7:
Wait only for the latest job's fence.

Suggested-by: Christian Koenig <Christian.Koenig@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  20 ++--
 drivers/gpu/drm/etnaviv/etnaviv_sched.c    |  11 +-
 drivers/gpu/drm/scheduler/sched_main.c     | 170 ++++++++++++++++++-----------
 drivers/gpu/drm/v3d/v3d_sched.c            |  13 ++-
 include/drm/gpu_scheduler.h                |   7 +-
 5 files changed, 131 insertions(+), 90 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e20dce438d37..d7dddb936f84 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3313,17 +3313,15 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 		if (!ring || !ring->sched.thread)
 			continue;
 
-		kthread_park(ring->sched.thread);
-
-		if (job && job->base.sched != &ring->sched)
-			continue;
-
-		drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL);
+		drm_sched_stop(&ring->sched);
 
 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
 		amdgpu_fence_driver_force_completion(ring);
 	}
 
+	if(job)
+		drm_sched_increase_karma(&job->base);
+
 
 
 	if (!amdgpu_sriov_vf(adev)) {
@@ -3469,14 +3467,10 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
 		if (!ring || !ring->sched.thread)
 			continue;
 
-		/* only need recovery sched of the given job's ring
-		 * or all rings (in the case @job is NULL)
-		 * after above amdgpu_reset accomplished
-		 */
-		if ((!job || job->base.sched == &ring->sched) && !adev->asic_reset_res)
-			drm_sched_job_recovery(&ring->sched);
+		if (!adev->asic_reset_res)
+			drm_sched_resubmit_jobs(&ring->sched);
 
-		kthread_unpark(ring->sched.thread);
+		drm_sched_start(&ring->sched, !adev->asic_reset_res);
 	}
 
 	if (!amdgpu_device_has_dc_support(adev)) {
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index 49a6763693f1..67ae26602024 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -109,16 +109,19 @@ static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
 	}
 
 	/* block scheduler */
-	kthread_park(gpu->sched.thread);
-	drm_sched_hw_job_reset(&gpu->sched, sched_job);
+	drm_sched_stop(&gpu->sched);
+
+	if(sched_job)
+		drm_sched_increase_karma(sched_job);
 
 	/* get the GPU back into the init state */
 	etnaviv_core_dump(gpu);
 	etnaviv_gpu_recover_hang(gpu);
 
+	drm_sched_resubmit_jobs(&gpu->sched);
+
 	/* restart scheduler after GPU is usable again */
-	drm_sched_job_recovery(&gpu->sched);
-	kthread_unpark(gpu->sched.thread);
+	drm_sched_start(&gpu->sched, true);
 }
 
 static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index dbb69063b3d5..4bb18511ac75 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -60,8 +60,6 @@
 
 static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
 
-static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job);
-
 /**
  * drm_sched_rq_init - initialize a given run queue struct
  *
@@ -335,6 +333,51 @@ static void drm_sched_job_timedout(struct work_struct *work)
 	spin_unlock_irqrestore(&sched->job_list_lock, flags);
 }
 
+ /**
+  * drm_sched_increase_karma - Update sched_entity guilty flag
+  *
+  * @bad: The job guilty of time out
+  *
+  * Increment on every hang caused by the 'bad' job. If this exceeds the hang
+  * limit of the scheduler then the respective sched entity is marked guilty and
+  * jobs from it will not be scheduled further
+  */
+void drm_sched_increase_karma(struct drm_sched_job *bad)
+{
+	int i;
+	struct drm_sched_entity *tmp;
+	struct drm_sched_entity *entity;
+	struct drm_gpu_scheduler *sched = bad->sched;
+
+	/* don't increase @bad's karma if it's from KERNEL RQ,
+	 * because sometimes GPU hang would cause kernel jobs (like VM updating jobs)
+	 * corrupt but keep in mind that kernel jobs always considered good.
+	 */
+	if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
+		atomic_inc(&bad->karma);
+		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
+		     i++) {
+			struct drm_sched_rq *rq = &sched->sched_rq[i];
+
+			spin_lock(&rq->lock);
+			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
+				if (bad->s_fence->scheduled.context ==
+				    entity->fence_context) {
+					if (atomic_read(&bad->karma) >
+					    bad->sched->hang_limit)
+						if (entity->guilty)
+							atomic_set(entity->guilty, 1);
+					break;
+				}
+			}
+			spin_unlock(&rq->lock);
+			if (&entity->list != &rq->entities)
+				break;
+		}
+	}
+}
+EXPORT_SYMBOL(drm_sched_increase_karma);
+
 /**
  * drm_sched_hw_job_reset - stop the scheduler if it contains the bad job
  *
@@ -342,13 +385,20 @@ static void drm_sched_job_timedout(struct work_struct *work)
  * @bad: bad scheduler job
  *
  */
-void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
+void drm_sched_stop(struct drm_gpu_scheduler *sched)
 {
 	struct drm_sched_job *s_job;
-	struct drm_sched_entity *entity, *tmp;
 	unsigned long flags;
-	int i;
+	struct dma_fence *last_fence =  NULL;
 
+	kthread_park(sched->thread);
+
+	/*
+	 * Verify all the signaled jobs in mirror list are removed from the ring
+	 * by waiting for the latest job to enter the list. This should insure that
+	 * also all the previous jobs that were in flight also already singaled
+	 * and removed from the list.
+	 */
 	spin_lock_irqsave(&sched->job_list_lock, flags);
 	list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
 		if (s_job->s_fence->parent &&
@@ -357,35 +407,20 @@ void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_jo
 			dma_fence_put(s_job->s_fence->parent);
 			s_job->s_fence->parent = NULL;
 			atomic_dec(&sched->hw_rq_count);
+		} else {
+			 last_fence = dma_fence_get(&s_job->s_fence->finished);
+			 break;
 		}
 	}
 	spin_unlock_irqrestore(&sched->job_list_lock, flags);
 
-	if (bad && bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
-		atomic_inc(&bad->karma);
-		/* don't increase @bad's karma if it's from KERNEL RQ,
-		 * becuase sometimes GPU hang would cause kernel jobs (like VM updating jobs)
-		 * corrupt but keep in mind that kernel jobs always considered good.
-		 */
-		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL; i++ ) {
-			struct drm_sched_rq *rq = &sched->sched_rq[i];
-
-			spin_lock(&rq->lock);
-			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
-				if (bad->s_fence->scheduled.context == entity->fence_context) {
-				    if (atomic_read(&bad->karma) > bad->sched->hang_limit)
-						if (entity->guilty)
-							atomic_set(entity->guilty, 1);
-					break;
-				}
-			}
-			spin_unlock(&rq->lock);
-			if (&entity->list != &rq->entities)
-				break;
-		}
+	if (last_fence) {
+		dma_fence_wait(last_fence, false);
+		dma_fence_put(last_fence);
 	}
 }
-EXPORT_SYMBOL(drm_sched_hw_job_reset);
+
+EXPORT_SYMBOL(drm_sched_stop);
 
 /**
  * drm_sched_job_recovery - recover jobs after a reset
@@ -393,33 +428,21 @@ EXPORT_SYMBOL(drm_sched_hw_job_reset);
  * @sched: scheduler instance
  *
  */
-void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
+void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 {
 	struct drm_sched_job *s_job, *tmp;
-	bool found_guilty = false;
 	unsigned long flags;
 	int r;
 
+	if (!full_recovery)
+		goto unpark;
+
 	spin_lock_irqsave(&sched->job_list_lock, flags);
 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
 		struct drm_sched_fence *s_fence = s_job->s_fence;
-		struct dma_fence *fence;
-		uint64_t guilty_context;
-
-		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
-			found_guilty = true;
-			guilty_context = s_job->s_fence->scheduled.context;
-		}
-
-		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
-			dma_fence_set_error(&s_fence->finished, -ECANCELED);
-
-		spin_unlock_irqrestore(&sched->job_list_lock, flags);
-		fence = sched->ops->run_job(s_job);
-		atomic_inc(&sched->hw_rq_count);
+		struct dma_fence *fence = s_job->s_fence->parent;
 
 		if (fence) {
-			s_fence->parent = dma_fence_get(fence);
 			r = dma_fence_add_callback(fence, &s_fence->cb,
 						   drm_sched_process_job);
 			if (r == -ENOENT)
@@ -427,18 +450,47 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
 			else if (r)
 				DRM_ERROR("fence add callback failed (%d)\n",
 					  r);
-			dma_fence_put(fence);
-		} else {
-			if (s_fence->finished.error < 0)
-				drm_sched_expel_job_unlocked(s_job);
+		} else
 			drm_sched_process_job(NULL, &s_fence->cb);
-		}
-		spin_lock_irqsave(&sched->job_list_lock, flags);
 	}
+
 	drm_sched_start_timeout(sched);
 	spin_unlock_irqrestore(&sched->job_list_lock, flags);
+
+unpark:
+	kthread_unpark(sched->thread);
 }
-EXPORT_SYMBOL(drm_sched_job_recovery);
+EXPORT_SYMBOL(drm_sched_start);
+
+/**
+ * drm_sched_resubmit_jobs - helper to relunch job from mirror ring list
+ *
+ * @sched: scheduler instance
+ *
+ */
+void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
+{
+	struct drm_sched_job *s_job, *tmp;
+	uint64_t guilty_context;
+	bool found_guilty = false;
+
+	/*TODO DO we need spinlock here ? */
+	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
+		struct drm_sched_fence *s_fence = s_job->s_fence;
+
+		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
+			found_guilty = true;
+			guilty_context = s_job->s_fence->scheduled.context;
+		}
+
+		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
+			dma_fence_set_error(&s_fence->finished, -ECANCELED);
+
+		s_job->s_fence->parent = sched->ops->run_job(s_job);
+		atomic_inc(&sched->hw_rq_count);
+	}
+}
+EXPORT_SYMBOL(drm_sched_resubmit_jobs);
 
 /**
  * drm_sched_job_init - init a scheduler job
@@ -634,26 +686,14 @@ static int drm_sched_main(void *param)
 				DRM_ERROR("fence add callback failed (%d)\n",
 					  r);
 			dma_fence_put(fence);
-		} else {
-			if (s_fence->finished.error < 0)
-				drm_sched_expel_job_unlocked(sched_job);
+		} else
 			drm_sched_process_job(NULL, &s_fence->cb);
-		}
 
 		wake_up(&sched->job_scheduled);
 	}
 	return 0;
 }
 
-static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job)
-{
-	struct drm_gpu_scheduler *sched = s_job->sched;
-
-	spin_lock(&sched->job_list_lock);
-	list_del_init(&s_job->node);
-	spin_unlock(&sched->job_list_lock);
-}
-
 /**
  * drm_sched_init - Init a gpu scheduler instance
  *
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index f7508e907536..4704b2df3688 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -234,18 +234,21 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
 	for (q = 0; q < V3D_MAX_QUEUES; q++) {
 		struct drm_gpu_scheduler *sched = &v3d->queue[q].sched;
 
-		kthread_park(sched->thread);
-		drm_sched_hw_job_reset(sched, (sched_job->sched == sched ?
-					       sched_job : NULL));
+		drm_sched_stop(sched);
+
+		if(sched_job)
+			drm_sched_increase_karma(sched_job);
 	}
 
 	/* get the GPU back into the init state */
 	v3d_reset(v3d);
 
+	for (q = 0; q < V3D_MAX_QUEUES; q++)
+		drm_sched_resubmit_jobs(sched_job->sched);
+
 	/* Unblock schedulers and restart their jobs. */
 	for (q = 0; q < V3D_MAX_QUEUES; q++) {
-		drm_sched_job_recovery(&v3d->queue[q].sched);
-		kthread_unpark(v3d->queue[q].sched.thread);
+		drm_sched_start(&v3d->queue[q].sched, true);
 	}
 
 	mutex_unlock(&v3d->reset_lock);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 47e19796c450..c567bba91ba0 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -298,9 +298,10 @@ int drm_sched_job_init(struct drm_sched_job *job,
 		       void *owner);
 void drm_sched_job_cleanup(struct drm_sched_job *job);
 void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
-void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched,
-			    struct drm_sched_job *job);
-void drm_sched_job_recovery(struct drm_gpu_scheduler *sched);
+void drm_sched_stop(struct drm_gpu_scheduler *sched);
+void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
+void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
+void drm_sched_increase_karma(struct drm_sched_job *bad);
 bool drm_sched_dependency_optimized(struct dma_fence* fence,
 				    struct drm_sched_entity *entity);
 void drm_sched_fault(struct drm_gpu_scheduler *sched);
-- 
cgit v1.3.1


From 3741540e04137256df82105bcd720a5e27423c34 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Date: Wed, 5 Dec 2018 14:21:28 -0500
Subject: drm/sched: Rework HW fence processing.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expedite job deletion from ring mirror list to the HW fence signal
callback instead from finish_work, together with waiting for all
such fences to signal in drm_sched_stop we garantee that
already signaled job will not be processed twice.
Remove the sched finish fence callback and just submit finish_work
directly from the HW fence callback.

v2: Fix comments.
v3: Attach  hw fence cb to sched_job
v5: Rebase

Suggested-by: Christian Koenig <Christian.Koenig@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 55 +++++++++++++++++-----------------
 include/drm/gpu_scheduler.h            |  6 ++--
 2 files changed, 29 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 4bb18511ac75..19fc601c9eeb 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -284,8 +284,6 @@ static void drm_sched_job_finish(struct work_struct *work)
 	cancel_delayed_work_sync(&sched->work_tdr);
 
 	spin_lock_irqsave(&sched->job_list_lock, flags);
-	/* remove job from ring_mirror_list */
-	list_del_init(&s_job->node);
 	/* queue TDR for next job */
 	drm_sched_start_timeout(sched);
 	spin_unlock_irqrestore(&sched->job_list_lock, flags);
@@ -293,22 +291,11 @@ static void drm_sched_job_finish(struct work_struct *work)
 	sched->ops->free_job(s_job);
 }
 
-static void drm_sched_job_finish_cb(struct dma_fence *f,
-				    struct dma_fence_cb *cb)
-{
-	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
-						 finish_cb);
-	schedule_work(&job->finish_work);
-}
-
 static void drm_sched_job_begin(struct drm_sched_job *s_job)
 {
 	struct drm_gpu_scheduler *sched = s_job->sched;
 	unsigned long flags;
 
-	dma_fence_add_callback(&s_job->s_fence->finished, &s_job->finish_cb,
-			       drm_sched_job_finish_cb);
-
 	spin_lock_irqsave(&sched->job_list_lock, flags);
 	list_add_tail(&s_job->node, &sched->ring_mirror_list);
 	drm_sched_start_timeout(sched);
@@ -403,7 +390,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched)
 	list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
 		if (s_job->s_fence->parent &&
 		    dma_fence_remove_callback(s_job->s_fence->parent,
-					      &s_job->s_fence->cb)) {
+					      &s_job->cb)) {
 			dma_fence_put(s_job->s_fence->parent);
 			s_job->s_fence->parent = NULL;
 			atomic_dec(&sched->hw_rq_count);
@@ -431,31 +418,34 @@ EXPORT_SYMBOL(drm_sched_stop);
 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 {
 	struct drm_sched_job *s_job, *tmp;
-	unsigned long flags;
 	int r;
 
 	if (!full_recovery)
 		goto unpark;
 
-	spin_lock_irqsave(&sched->job_list_lock, flags);
+	/*
+	 * Locking the list is not required here as the sched thread is parked
+	 * so no new jobs are being pushed in to HW and in drm_sched_stop we
+	 * flushed all the jobs who were still in mirror list but who already
+	 * signaled and removed them self from the list. Also concurrent
+	 * GPU recovers can't run in parallel.
+	 */
 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
-		struct drm_sched_fence *s_fence = s_job->s_fence;
 		struct dma_fence *fence = s_job->s_fence->parent;
 
 		if (fence) {
-			r = dma_fence_add_callback(fence, &s_fence->cb,
+			r = dma_fence_add_callback(fence, &s_job->cb,
 						   drm_sched_process_job);
 			if (r == -ENOENT)
-				drm_sched_process_job(fence, &s_fence->cb);
+				drm_sched_process_job(fence, &s_job->cb);
 			else if (r)
 				DRM_ERROR("fence add callback failed (%d)\n",
 					  r);
 		} else
-			drm_sched_process_job(NULL, &s_fence->cb);
+			drm_sched_process_job(NULL, &s_job->cb);
 	}
 
 	drm_sched_start_timeout(sched);
-	spin_unlock_irqrestore(&sched->job_list_lock, flags);
 
 unpark:
 	kthread_unpark(sched->thread);
@@ -604,18 +594,27 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
  */
 static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb)
 {
-	struct drm_sched_fence *s_fence =
-		container_of(cb, struct drm_sched_fence, cb);
+	struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
+	struct drm_sched_fence *s_fence = s_job->s_fence;
 	struct drm_gpu_scheduler *sched = s_fence->sched;
+	unsigned long flags;
+
+	cancel_delayed_work(&sched->work_tdr);
 
-	dma_fence_get(&s_fence->finished);
 	atomic_dec(&sched->hw_rq_count);
 	atomic_dec(&sched->num_jobs);
+
+	spin_lock_irqsave(&sched->job_list_lock, flags);
+	/* remove job from ring_mirror_list */
+	list_del_init(&s_job->node);
+	spin_unlock_irqrestore(&sched->job_list_lock, flags);
+
 	drm_sched_fence_finished(s_fence);
 
 	trace_drm_sched_process_job(s_fence);
-	dma_fence_put(&s_fence->finished);
 	wake_up_interruptible(&sched->wake_up_worker);
+
+	schedule_work(&s_job->finish_work);
 }
 
 /**
@@ -678,16 +677,16 @@ static int drm_sched_main(void *param)
 
 		if (fence) {
 			s_fence->parent = dma_fence_get(fence);
-			r = dma_fence_add_callback(fence, &s_fence->cb,
+			r = dma_fence_add_callback(fence, &sched_job->cb,
 						   drm_sched_process_job);
 			if (r == -ENOENT)
-				drm_sched_process_job(fence, &s_fence->cb);
+				drm_sched_process_job(fence, &sched_job->cb);
 			else if (r)
 				DRM_ERROR("fence add callback failed (%d)\n",
 					  r);
 			dma_fence_put(fence);
 		} else
-			drm_sched_process_job(NULL, &s_fence->cb);
+			drm_sched_process_job(NULL, &sched_job->cb);
 
 		wake_up(&sched->job_scheduled);
 	}
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index c567bba91ba0..0daca4d8dad9 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -137,10 +137,6 @@ struct drm_sched_fence {
          */
 	struct dma_fence		finished;
 
-        /**
-         * @cb: the callback for the parent fence below.
-         */
-	struct dma_fence_cb		cb;
         /**
          * @parent: the fence returned by &drm_sched_backend_ops.run_job
          * when scheduling the job on hardware. We signal the
@@ -181,6 +177,7 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
  *         be scheduled further.
  * @s_priority: the priority of the job.
  * @entity: the entity to which this job belongs.
+ * @cb: the callback for the parent fence in s_fence.
  *
  * A job is created by the driver using drm_sched_job_init(), and
  * should call drm_sched_entity_push_job() once it wants the scheduler
@@ -197,6 +194,7 @@ struct drm_sched_job {
 	atomic_t			karma;
 	enum drm_sched_priority		s_priority;
 	struct drm_sched_entity  *entity;
+	struct dma_fence_cb		cb;
 };
 
 static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
-- 
cgit v1.3.1


From cbce5f0a9f306f890d257a92fcca6cb341af76ed Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Fri, 25 Jan 2019 12:02:11 +0100
Subject: drm/ttm: Remove ttm_bo_reference and ttm_bo_unref
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both functions are obsolete and all calls have been replaced by
ttm_bo_get and ttm_bo_put.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Huang Rui <ray.huang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c |  9 ---------
 include/drm/ttm/ttm_bo_api.h | 28 ----------------------------
 2 files changed, 37 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index de088c8070fb..3f56647cdb35 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -679,15 +679,6 @@ void ttm_bo_put(struct ttm_buffer_object *bo)
 }
 EXPORT_SYMBOL(ttm_bo_put);
 
-void ttm_bo_unref(struct ttm_buffer_object **p_bo)
-{
-	struct ttm_buffer_object *bo = *p_bo;
-
-	*p_bo = NULL;
-	ttm_bo_put(bo);
-}
-EXPORT_SYMBOL(ttm_bo_unref);
-
 int ttm_bo_lock_delayed_workqueue(struct ttm_bo_device *bdev)
 {
 	return cancel_delayed_work_sync(&bdev->wq);
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 3fc4854dce49..49d9cdfc58f2 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -295,23 +295,6 @@ static inline void ttm_bo_get(struct ttm_buffer_object *bo)
 	kref_get(&bo->kref);
 }
 
-/**
- * ttm_bo_reference - reference a struct ttm_buffer_object
- *
- * @bo: The buffer object.
- *
- * Returns a refcounted pointer to a buffer object.
- *
- * This function is deprecated. Use @ttm_bo_get instead.
- */
-
-static inline struct ttm_buffer_object *
-ttm_bo_reference(struct ttm_buffer_object *bo)
-{
-	ttm_bo_get(bo);
-	return bo;
-}
-
 /**
  * ttm_bo_get_unless_zero - reference a struct ttm_buffer_object unless
  * its refcount has already reached zero.
@@ -386,17 +369,6 @@ int ttm_bo_validate(struct ttm_buffer_object *bo,
  */
 void ttm_bo_put(struct ttm_buffer_object *bo);
 
-/**
- * ttm_bo_unref
- *
- * @bo: The buffer object.
- *
- * Unreference and clear a pointer to a buffer object.
- *
- * This function is deprecated. Use @ttm_bo_put instead.
- */
-void ttm_bo_unref(struct ttm_buffer_object **bo);
-
 /**
  * ttm_bo_add_to_lru
  *
-- 
cgit v1.3.1


From 5a3db6f08a8eae15bff597dce290ac238daeb717 Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Tue, 29 Jan 2019 19:06:09 +0200
Subject: drm: Constify drm_color_lut_check()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

drm_color_lut_check() doens't modify the passed in blob so
let's make it const.

Also s/uint32_t/u32/ while at it.

v2: Reduce line wraps (Sam)

Cc: Matt Roper <matthew.d.roper@intel.com>
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190129170609.5718-1-ville.syrjala@linux.intel.com
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/drm_color_mgmt.c | 5 ++---
 include/drm/drm_color_mgmt.h     | 3 +--
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_color_mgmt.c b/drivers/gpu/drm/drm_color_mgmt.c
index 968ca7c91ad8..d5d34d0c79c7 100644
--- a/drivers/gpu/drm/drm_color_mgmt.c
+++ b/drivers/gpu/drm/drm_color_mgmt.c
@@ -474,10 +474,9 @@ EXPORT_SYMBOL(drm_plane_create_color_properties);
  *
  * Returns 0 on success, -EINVAL on failure.
  */
-int drm_color_lut_check(struct drm_property_blob *lut,
-			uint32_t tests)
+int drm_color_lut_check(const struct drm_property_blob *lut, u32 tests)
 {
-	struct drm_color_lut *entry;
+	const struct drm_color_lut *entry;
 	int i;
 
 	if (!lut || !tests)
diff --git a/include/drm/drm_color_mgmt.h b/include/drm/drm_color_mgmt.h
index 6affbda6d9cb..d1c662d92ab7 100644
--- a/include/drm/drm_color_mgmt.h
+++ b/include/drm/drm_color_mgmt.h
@@ -96,6 +96,5 @@ enum drm_color_lut_tests {
 	DRM_COLOR_LUT_NON_DECREASING = BIT(1),
 };
 
-int drm_color_lut_check(struct drm_property_blob *lut,
-			uint32_t tests);
+int drm_color_lut_check(const struct drm_property_blob *lut, u32 tests);
 #endif
-- 
cgit v1.3.1


From 5e0f5a58b167fc2c8352d90c0faa8c0c9ca75c26 Mon Sep 17 00:00:00 2001
From: Rodrigo Vivi <rodrigo.vivi@intel.com>
Date: Fri, 1 Feb 2019 15:50:49 -0800
Subject: drm/i915/cfl: Adding another PCI Device ID.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While cross checking PCI IDs from Intel Media SDK
and kernel Dmitry noticed this gap. So we checked the
spec and this new ID had been recently added.

v2: Adding new H_GT1 entry to i915_pci.c (Jose)

Reported-by: Dmitry Rogozhkin<dmitry.v.rogozhkin@intel.com>
Cc: Dmitry Rogozhkin<dmitry.v.rogozhkin@intel.com>
Cc: José Roberto de Souza <jose.souza@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190201235049.27206-1-rodrigo.vivi@intel.com
---
 drivers/gpu/drm/i915/i915_pci.c | 1 +
 include/drm/i915_pciids.h       | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
index 6f7ad10f34b8..55c9058e91a8 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -711,6 +711,7 @@ static const struct pci_device_id pciidlist[] = {
 	INTEL_AML_KBL_GT2_IDS(&intel_kabylake_gt2_info),
 	INTEL_CFL_S_GT1_IDS(&intel_coffeelake_gt1_info),
 	INTEL_CFL_S_GT2_IDS(&intel_coffeelake_gt2_info),
+	INTEL_CFL_H_GT1_IDS(&intel_coffeelake_gt1_info),
 	INTEL_CFL_H_GT2_IDS(&intel_coffeelake_gt2_info),
 	INTEL_CFL_U_GT2_IDS(&intel_coffeelake_gt2_info),
 	INTEL_CFL_U_GT3_IDS(&intel_coffeelake_gt3_info),
diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h
index df72be7e8b88..d2fad7b0fcf6 100644
--- a/include/drm/i915_pciids.h
+++ b/include/drm/i915_pciids.h
@@ -394,6 +394,9 @@
 	INTEL_VGA_DEVICE(0x3E9A, info)  /* SRV GT2 */
 
 /* CFL H */
+#define INTEL_CFL_H_GT1_IDS(info) \
+	INTEL_VGA_DEVICE(0x3E9C, info)
+
 #define INTEL_CFL_H_GT2_IDS(info) \
 	INTEL_VGA_DEVICE(0x3E9B, info), /* Halo GT2 */ \
 	INTEL_VGA_DEVICE(0x3E94, info)  /* Halo GT2 */
@@ -426,6 +429,7 @@
 #define INTEL_CFL_IDS(info)	   \
 	INTEL_CFL_S_GT1_IDS(info), \
 	INTEL_CFL_S_GT2_IDS(info), \
+	INTEL_CFL_H_GT1_IDS(info), \
 	INTEL_CFL_H_GT2_IDS(info), \
 	INTEL_CFL_U_GT2_IDS(info), \
 	INTEL_CFL_U_GT3_IDS(info), \
-- 
cgit v1.3.1


From e46c2e99f60043bfdb63f18fe775db1f688906d9 Mon Sep 17 00:00:00 2001
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Date: Tue, 5 Feb 2019 09:50:31 +0000
Subject: drm/i915: Expose RPCS (SSEU) configuration to userspace (Gen11 only)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We want to allow userspace to reconfigure the subslice configuration on a
per context basis.

This is required for the functional requirement of shutting down non-VME
enabled sub-slices on Gen11 parts.

To do so, we expose a context parameter to allow adjustment of the RPCS
register stored within the context image (and currently not accessible via
LRI).

If the context is adjusted before first use or whilst idle, the adjustment
is for "free"; otherwise if the context is active we queue a request to do
so (using the kernel context), following all other activity by that
context, which is also marked as barrier for all following submission
against the same context.

Since the overhead of device re-configuration during context switching can
be significant, especially in multi-context workloads, we limit this new
uAPI to only support the Gen11 VME use case. In this use case either the
device is fully enabled, and exactly one slice and half of the subslices
are enabled.

Example usage:

	struct drm_i915_gem_context_param_sseu sseu = { };
	struct drm_i915_gem_context_param arg = {
		.param = I915_CONTEXT_PARAM_SSEU,
		.ctx_id = gem_context_create(fd),
		.size = sizeof(sseu),
		.value = to_user_pointer(&sseu)
	};

	/* Query device defaults. */
	gem_context_get_param(fd, &arg);

	/* Set VME configuration on a 1x6x8 part. */
	sseu.slice_mask = 0x1;
	sseu.subslice_mask = 0xe0;
	gem_context_set_param(fd, &arg);

v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu()
    (Lionel)

v3: Add ability to program this per engine (Chris)

v4: Move most get_sseu() into i915_gem_context.c (Lionel)

v5: Validate sseu configuration against the device's capabilities (Lionel)

v6: Change context powergating settings through MI_SDM on kernel context
    (Chris)

v7: Synchronize the requests following a powergating setting change using
    a global dependency (Chris)
    Iterate timelines through dev_priv.gt.active_rings (Tvrtko)
    Disable RPCS configuration setting for non capable users
    (Lionel/Tvrtko)

v8: s/union intel_sseu/struct intel_sseu/ (Lionel)
    s/dev_priv/i915/ (Tvrtko)
    Change uapi class/instance fields to u16 (Tvrtko)
    Bump mask fields to 64bits (Lionel)
    Don't return EPERM when dynamic sseu is disabled (Tvrtko)

v9: Import context image into kernel context's ppgtt only when
    reconfiguring powergated slice/subslices (Chris)
    Use aliasing ppgtt when needed (Michel)

Tvrtko Ursulin:

v10:
 * Update for upstream changes.
 * Request submit needs a RPM reference.
 * Reject on !FULL_PPGTT for simplicity.
 * Pull out get/set param to helpers for readability and less indent.
 * Use i915_request_await_dma_fence in add_global_barrier to skip waits
   on the same timeline and avoid GEM_BUG_ON.
 * No need to explicitly assign a NULL pointer to engine in legacy mode.
 * No need to move gen8_make_rpcs up.
 * Factored out global barrier as prep patch.
 * Allow to only CAP_SYS_ADMIN if !Gen11.

v11:
 * Remove engine vfunc in favour of local helper. (Chris Wilson)
 * Stop retiring requests before updates since it is not needed
   (Chris Wilson)
 * Implement direct CPU update path for idle contexts. (Chris Wilson)
 * Left side dependency needs only be on the same context timeline.
   (Chris Wilson)
 * It is sufficient to order the timeline. (Chris Wilson)
 * Reject !RCS configuration attempts with -ENODEV for now.

v12:
 * Rebase for make_rpcs.

v13:
 * Centralize SSEU normalization to make_rpcs.
 * Type width checking (uAPI <-> implementation).
 * Gen11 restrictions uAPI checks.
 * Gen11 subslice count differences handling.
 Chris Wilson:
 * args->size handling fixes.
 * Update context image from GGTT.
 * Postpone context image update to pinning.
 * Use i915_gem_active_raw instead of last_request_on_engine.

v14:
 * Add activity tracker on intel_context to fix the lifetime issues
   and simplify the code. (Chris Wilson)

v15:
 * Fix context pin leak if no space in ring by simplifying the
   context pinning sequence.

v16:
 * Rebase for context get/set param locking changes.
 * Just -ENODEV on !Gen11. (Joonas)

v17:
 * Fix one Gen11 subslice enablement rule.
 * Handle error from i915_sw_fence_await_sw_fence_gfp. (Chris Wilson)

v18:
 * Update commit message. (Joonas)
 * Restrict uAPI to VME use case. (Joonas)

v19:
 * Rebase.

v20:
 * Rebase for ce->active_tracker.

v21:
 * Rebase for IS_GEN changes.

v22:
 * Reserve uAPI for flags straight away. (Chris Wilson)

v23:
 * Rebase for RUNTIME_INFO.

v24:
 * Added some headline docs for the uapi usage. (Joonas/Chris)

v25:
 * Renamed class/instance to engine_class/engine_instance to avoid clash
   with C++ keyword. (Tony Ye)

v26:
 * Rebased for runtime pm api changes.

v27:
 * Rebased for intel_context_init.
 * Wrap commit msg to 75.

v28:
 (Chris Wilson)
 * Use i915_gem_ggtt.
 * Use i915_request_await_dma_fence to show a better example.

v29:
 * i915_timeline_set_barrier can now fail. (Chris Wilson)

v30:
 * Capture some acks.

v31:
 * Drop the WARN_ON from use controllable paths. (Chris Wilson)
 * Use overflows_type for all checks.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107634
Issue: https://github.com/intel/media-driver/issues/267
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Zhipeng Gong <zhipeng.gong@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Tony Ye <tony.ye@intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Timo Aaltonen <timo.aaltonen@canonical.com>
Acked-by: Takashi Iwai <tiwai@suse.de>
Acked-by: Stéphane Marchesin <marcheu@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190205095032.22673-4-tvrtko.ursulin@linux.intel.com
---
 drivers/gpu/drm/i915/i915_gem_context.c | 340 +++++++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_gem_context.h |   6 +
 drivers/gpu/drm/i915/intel_lrc.c        |   4 +-
 include/uapi/drm/i915_drm.h             |  64 ++++++
 4 files changed, 411 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index d3887c27c3ba..2d3e1ce9cc76 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -89,6 +89,7 @@
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
 #include "i915_trace.h"
+#include "intel_lrc_reg.h"
 #include "intel_workarounds.h"
 
 #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1
@@ -321,6 +322,15 @@ static u32 default_desc_template(const struct drm_i915_private *i915,
 	return desc;
 }
 
+static void intel_context_retire(struct i915_gem_active *active,
+				 struct i915_request *rq)
+{
+	struct intel_context *ce =
+		container_of(active, typeof(*ce), active_tracker);
+
+	intel_context_unpin(ce);
+}
+
 void
 intel_context_init(struct intel_context *ce,
 		   struct i915_gem_context *ctx,
@@ -333,6 +343,8 @@ intel_context_init(struct intel_context *ce,
 
 	/* Use the whole device by default */
 	ce->sseu = intel_device_default_sseu(ctx->i915);
+
+	init_request_active(&ce->active_tracker, intel_context_retire);
 }
 
 static struct i915_gem_context *
@@ -850,6 +862,56 @@ out:
 	return 0;
 }
 
+static int get_sseu(struct i915_gem_context *ctx,
+		    struct drm_i915_gem_context_param *args)
+{
+	struct drm_i915_gem_context_param_sseu user_sseu;
+	struct intel_engine_cs *engine;
+	struct intel_context *ce;
+	int ret;
+
+	if (args->size == 0)
+		goto out;
+	else if (args->size < sizeof(user_sseu))
+		return -EINVAL;
+
+	if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value),
+			   sizeof(user_sseu)))
+		return -EFAULT;
+
+	if (user_sseu.flags || user_sseu.rsvd)
+		return -EINVAL;
+
+	engine = intel_engine_lookup_user(ctx->i915,
+					  user_sseu.engine_class,
+					  user_sseu.engine_instance);
+	if (!engine)
+		return -EINVAL;
+
+	/* Only use for mutex here is to serialize get_param and set_param. */
+	ret = mutex_lock_interruptible(&ctx->i915->drm.struct_mutex);
+	if (ret)
+		return ret;
+
+	ce = to_intel_context(ctx, engine);
+
+	user_sseu.slice_mask = ce->sseu.slice_mask;
+	user_sseu.subslice_mask = ce->sseu.subslice_mask;
+	user_sseu.min_eus_per_subslice = ce->sseu.min_eus_per_subslice;
+	user_sseu.max_eus_per_subslice = ce->sseu.max_eus_per_subslice;
+
+	mutex_unlock(&ctx->i915->drm.struct_mutex);
+
+	if (copy_to_user(u64_to_user_ptr(args->value), &user_sseu,
+			 sizeof(user_sseu)))
+		return -EFAULT;
+
+out:
+	args->size = sizeof(user_sseu);
+
+	return 0;
+}
+
 int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 				    struct drm_file *file)
 {
@@ -862,15 +924,17 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	if (!ctx)
 		return -ENOENT;
 
-	args->size = 0;
 	switch (args->param) {
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
 		ret = -EINVAL;
 		break;
 	case I915_CONTEXT_PARAM_NO_ZEROMAP:
+		args->size = 0;
 		args->value = test_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags);
 		break;
 	case I915_CONTEXT_PARAM_GTT_SIZE:
+		args->size = 0;
+
 		if (ctx->ppgtt)
 			args->value = ctx->ppgtt->vm.total;
 		else if (to_i915(dev)->mm.aliasing_ppgtt)
@@ -879,14 +943,20 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 			args->value = to_i915(dev)->ggtt.vm.total;
 		break;
 	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
+		args->size = 0;
 		args->value = i915_gem_context_no_error_capture(ctx);
 		break;
 	case I915_CONTEXT_PARAM_BANNABLE:
+		args->size = 0;
 		args->value = i915_gem_context_is_bannable(ctx);
 		break;
 	case I915_CONTEXT_PARAM_PRIORITY:
+		args->size = 0;
 		args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
 		break;
+	case I915_CONTEXT_PARAM_SSEU:
+		ret = get_sseu(ctx, args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -896,6 +966,270 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	return ret;
 }
 
+static int gen8_emit_rpcs_config(struct i915_request *rq,
+				 struct intel_context *ce,
+				 struct intel_sseu sseu)
+{
+	u64 offset;
+	u32 *cs;
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	offset = i915_ggtt_offset(ce->state) +
+		 LRC_STATE_PN * PAGE_SIZE +
+		 (CTX_R_PWR_CLK_STATE + 1) * 4;
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = lower_32_bits(offset);
+	*cs++ = upper_32_bits(offset);
+	*cs++ = gen8_make_rpcs(rq->i915, &sseu);
+
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+static int
+gen8_modify_rpcs_gpu(struct intel_context *ce,
+		     struct intel_engine_cs *engine,
+		     struct intel_sseu sseu)
+{
+	struct drm_i915_private *i915 = engine->i915;
+	struct i915_request *rq, *prev;
+	intel_wakeref_t wakeref;
+	int ret;
+
+	GEM_BUG_ON(!ce->pin_count);
+
+	lockdep_assert_held(&i915->drm.struct_mutex);
+
+	/* Submitting requests etc needs the hw awake. */
+	wakeref = intel_runtime_pm_get(i915);
+
+	rq = i915_request_alloc(engine, i915->kernel_context);
+	if (IS_ERR(rq)) {
+		ret = PTR_ERR(rq);
+		goto out_put;
+	}
+
+	/* Queue this switch after all other activity by this context. */
+	prev = i915_gem_active_raw(&ce->ring->timeline->last_request,
+				   &i915->drm.struct_mutex);
+	if (prev && !i915_request_completed(prev)) {
+		ret = i915_request_await_dma_fence(rq, &prev->fence);
+		if (ret < 0)
+			goto out_add;
+	}
+
+	/* Order all following requests to be after. */
+	ret = i915_timeline_set_barrier(ce->ring->timeline, rq);
+	if (ret)
+		goto out_add;
+
+	ret = gen8_emit_rpcs_config(rq, ce, sseu);
+	if (ret)
+		goto out_add;
+
+	/*
+	 * Guarantee context image and the timeline remains pinned until the
+	 * modifying request is retired by setting the ce activity tracker.
+	 *
+	 * But we only need to take one pin on the account of it. Or in other
+	 * words transfer the pinned ce object to tracked active request.
+	 */
+	if (!i915_gem_active_isset(&ce->active_tracker))
+		__intel_context_pin(ce);
+	i915_gem_active_set(&ce->active_tracker, rq);
+
+out_add:
+	i915_request_add(rq);
+out_put:
+	intel_runtime_pm_put(i915, wakeref);
+
+	return ret;
+}
+
+static int
+i915_gem_context_reconfigure_sseu(struct i915_gem_context *ctx,
+				  struct intel_engine_cs *engine,
+				  struct intel_sseu sseu)
+{
+	struct intel_context *ce = to_intel_context(ctx, engine);
+	int ret;
+
+	GEM_BUG_ON(INTEL_GEN(ctx->i915) < 8);
+	GEM_BUG_ON(engine->id != RCS);
+
+	ret = mutex_lock_interruptible(&ctx->i915->drm.struct_mutex);
+	if (ret)
+		return ret;
+
+	/* Nothing to do if unmodified. */
+	if (!memcmp(&ce->sseu, &sseu, sizeof(sseu)))
+		goto out;
+
+	/*
+	 * If context is not idle we have to submit an ordered request to modify
+	 * its context image via the kernel context. Pristine and idle contexts
+	 * will be configured on pinning.
+	 */
+	if (ce->pin_count)
+		ret = gen8_modify_rpcs_gpu(ce, engine, sseu);
+
+	if (!ret)
+		ce->sseu = sseu;
+
+out:
+	mutex_unlock(&ctx->i915->drm.struct_mutex);
+
+	return ret;
+}
+
+static int
+user_to_context_sseu(struct drm_i915_private *i915,
+		     const struct drm_i915_gem_context_param_sseu *user,
+		     struct intel_sseu *context)
+{
+	const struct sseu_dev_info *device = &RUNTIME_INFO(i915)->sseu;
+
+	/* No zeros in any field. */
+	if (!user->slice_mask || !user->subslice_mask ||
+	    !user->min_eus_per_subslice || !user->max_eus_per_subslice)
+		return -EINVAL;
+
+	/* Max > min. */
+	if (user->max_eus_per_subslice < user->min_eus_per_subslice)
+		return -EINVAL;
+
+	/*
+	 * Some future proofing on the types since the uAPI is wider than the
+	 * current internal implementation.
+	 */
+	if (overflows_type(user->slice_mask, context->slice_mask) ||
+	    overflows_type(user->subslice_mask, context->subslice_mask) ||
+	    overflows_type(user->min_eus_per_subslice,
+			   context->min_eus_per_subslice) ||
+	    overflows_type(user->max_eus_per_subslice,
+			   context->max_eus_per_subslice))
+		return -EINVAL;
+
+	/* Check validity against hardware. */
+	if (user->slice_mask & ~device->slice_mask)
+		return -EINVAL;
+
+	if (user->subslice_mask & ~device->subslice_mask[0])
+		return -EINVAL;
+
+	if (user->max_eus_per_subslice > device->max_eus_per_subslice)
+		return -EINVAL;
+
+	context->slice_mask = user->slice_mask;
+	context->subslice_mask = user->subslice_mask;
+	context->min_eus_per_subslice = user->min_eus_per_subslice;
+	context->max_eus_per_subslice = user->max_eus_per_subslice;
+
+	/* Part specific restrictions. */
+	if (IS_GEN(i915, 11)) {
+		unsigned int hw_s = hweight8(device->slice_mask);
+		unsigned int hw_ss_per_s = hweight8(device->subslice_mask[0]);
+		unsigned int req_s = hweight8(context->slice_mask);
+		unsigned int req_ss = hweight8(context->subslice_mask);
+
+		/*
+		 * Only full subslice enablement is possible if more than one
+		 * slice is turned on.
+		 */
+		if (req_s > 1 && req_ss != hw_ss_per_s)
+			return -EINVAL;
+
+		/*
+		 * If more than four (SScount bitfield limit) subslices are
+		 * requested then the number has to be even.
+		 */
+		if (req_ss > 4 && (req_ss & 1))
+			return -EINVAL;
+
+		/*
+		 * If only one slice is enabled and subslice count is below the
+		 * device full enablement, it must be at most half of the all
+		 * available subslices.
+		 */
+		if (req_s == 1 && req_ss < hw_ss_per_s &&
+		    req_ss > (hw_ss_per_s / 2))
+			return -EINVAL;
+
+		/* ABI restriction - VME use case only. */
+
+		/* All slices or one slice only. */
+		if (req_s != 1 && req_s != hw_s)
+			return -EINVAL;
+
+		/*
+		 * Half subslices or full enablement only when one slice is
+		 * enabled.
+		 */
+		if (req_s == 1 &&
+		    (req_ss != hw_ss_per_s && req_ss != (hw_ss_per_s / 2)))
+			return -EINVAL;
+
+		/* No EU configuration changes. */
+		if ((user->min_eus_per_subslice !=
+		     device->max_eus_per_subslice) ||
+		    (user->max_eus_per_subslice !=
+		     device->max_eus_per_subslice))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int set_sseu(struct i915_gem_context *ctx,
+		    struct drm_i915_gem_context_param *args)
+{
+	struct drm_i915_private *i915 = ctx->i915;
+	struct drm_i915_gem_context_param_sseu user_sseu;
+	struct intel_engine_cs *engine;
+	struct intel_sseu sseu;
+	int ret;
+
+	if (args->size < sizeof(user_sseu))
+		return -EINVAL;
+
+	if (!IS_GEN(i915, 11))
+		return -ENODEV;
+
+	if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value),
+			   sizeof(user_sseu)))
+		return -EFAULT;
+
+	if (user_sseu.flags || user_sseu.rsvd)
+		return -EINVAL;
+
+	engine = intel_engine_lookup_user(i915,
+					  user_sseu.engine_class,
+					  user_sseu.engine_instance);
+	if (!engine)
+		return -EINVAL;
+
+	/* Only render engine supports RPCS configuration. */
+	if (engine->class != RENDER_CLASS)
+		return -ENODEV;
+
+	ret = user_to_context_sseu(i915, &user_sseu, &sseu);
+	if (ret)
+		return ret;
+
+	ret = i915_gem_context_reconfigure_sseu(ctx, engine, sseu);
+	if (ret)
+		return ret;
+
+	args->size = sizeof(user_sseu);
+
+	return 0;
+}
+
 int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 				    struct drm_file *file)
 {
@@ -958,7 +1292,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 					I915_USER_PRIORITY(priority);
 		}
 		break;
-
+	case I915_CONTEXT_PARAM_SSEU:
+		ret = set_sseu(ctx, args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 919f6f0a0f7a..92ad5272e57f 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -183,6 +183,12 @@ struct i915_gem_context {
 		u64 lrc_desc;
 		int pin_count;
 
+		/**
+		 * active_tracker: Active tracker for the external rq activity
+		 * on this intel_context object.
+		 */
+		struct i915_gem_active active_tracker;
+
 		const struct intel_context_ops *ops;
 
 		/** sseu: Control eu/slice partitioning */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index d99c462e2c09..5e98fd79bd9d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2498,7 +2498,9 @@ u32 gen8_make_rpcs(struct drm_i915_private *i915, struct intel_sseu *req_sseu)
 	 * subslices are enabled, or a count between one and four on the first
 	 * slice.
 	 */
-	if (IS_GEN(i915, 11) && slices == 1 && subslices >= 4) {
+	if (IS_GEN(i915, 11) &&
+	    slices == 1 &&
+	    subslices > min_t(u8, 4, hweight8(sseu->subslice_mask[0]) / 2)) {
 		GEM_BUG_ON(subslices & 1);
 
 		subslice_pg = false;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 298b2e197744..397810fa2d33 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1486,9 +1486,73 @@ struct drm_i915_gem_context_param {
 #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
 #define   I915_CONTEXT_DEFAULT_PRIORITY		0
 #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
+	/*
+	 * When using the following param, value should be a pointer to
+	 * drm_i915_gem_context_param_sseu.
+	 */
+#define I915_CONTEXT_PARAM_SSEU		0x7
 	__u64 value;
 };
 
+/**
+ * Context SSEU programming
+ *
+ * It may be necessary for either functional or performance reason to configure
+ * a context to run with a reduced number of SSEU (where SSEU stands for Slice/
+ * Sub-slice/EU).
+ *
+ * This is done by configuring SSEU configuration using the below
+ * @struct drm_i915_gem_context_param_sseu for every supported engine which
+ * userspace intends to use.
+ *
+ * Not all GPUs or engines support this functionality in which case an error
+ * code -ENODEV will be returned.
+ *
+ * Also, flexibility of possible SSEU configuration permutations varies between
+ * GPU generations and software imposed limitations. Requesting such a
+ * combination will return an error code of -EINVAL.
+ *
+ * NOTE: When perf/OA is active the context's SSEU configuration is ignored in
+ * favour of a single global setting.
+ */
+struct drm_i915_gem_context_param_sseu {
+	/*
+	 * Engine class & instance to be configured or queried.
+	 */
+	__u16 engine_class;
+	__u16 engine_instance;
+
+	/*
+	 * Unused for now. Must be cleared to zero.
+	 */
+	__u32 flags;
+
+	/*
+	 * Mask of slices to enable for the context. Valid values are a subset
+	 * of the bitmask value returned for I915_PARAM_SLICE_MASK.
+	 */
+	__u64 slice_mask;
+
+	/*
+	 * Mask of subslices to enable for the context. Valid values are a
+	 * subset of the bitmask value return by I915_PARAM_SUBSLICE_MASK.
+	 */
+	__u64 subslice_mask;
+
+	/*
+	 * Minimum/Maximum number of EUs to enable per subslice for the
+	 * context. min_eus_per_subslice must be inferior or equal to
+	 * max_eus_per_subslice.
+	 */
+	__u16 min_eus_per_subslice;
+	__u16 max_eus_per_subslice;
+
+	/*
+	 * Unused for now. Must be cleared to zero.
+	 */
+	__u32 rsvd;
+};
+
 enum drm_i915_oa_format {
 	I915_OA_FORMAT_A13 = 1,	    /* HSW only */
 	I915_OA_FORMAT_A29,	    /* HSW only */
-- 
cgit v1.3.1


From 67dd1a36334ffce82bebeb2d633e152aa436d370 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Date: Thu, 31 Jan 2019 15:44:22 -0500
Subject: drm/amdgpu: Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New chunk for dependency on start of job's execution instead on
the end. This is used for GPU deadlock prevention when
userspace uses mid-IB fences to wait for mid-IB work on other rings.

v2: Fix typo in AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES
v3: Bump KMS version
v4: put old fence AFTER acquiring the scheduled fence.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Suggested-by: Christian Koenig <Christian.Koenig@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 13 ++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
 include/uapi/drm/amdgpu_drm.h           |  1 +
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 1c49b8266d69..52a5e4fdc95b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -214,6 +214,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 		case AMDGPU_CHUNK_ID_DEPENDENCIES:
 		case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
 		case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
+		case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES:
 			break;
 
 		default:
@@ -1090,6 +1091,15 @@ static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
 
 		fence = amdgpu_ctx_get_fence(ctx, entity,
 					     deps[i].handle);
+
+		if (chunk->chunk_id == AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES) {
+			struct drm_sched_fence *s_fence = to_drm_sched_fence(fence);
+			struct dma_fence *old = fence;
+
+			fence = dma_fence_get(&s_fence->scheduled);
+			dma_fence_put(old);
+		}
+
 		if (IS_ERR(fence)) {
 			r = PTR_ERR(fence);
 			amdgpu_ctx_put(ctx);
@@ -1177,7 +1187,8 @@ static int amdgpu_cs_dependencies(struct amdgpu_device *adev,
 
 		chunk = &p->chunks[i];
 
-		if (chunk->chunk_id == AMDGPU_CHUNK_ID_DEPENDENCIES) {
+		if (chunk->chunk_id == AMDGPU_CHUNK_ID_DEPENDENCIES ||
+		    chunk->chunk_id == AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES) {
 			r = amdgpu_cs_process_fence_dep(p, chunk);
 			if (r)
 				return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index c806f984bcc5..1158a6f4eec6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -71,9 +71,10 @@
  * - 3.25.0 - Add support for sensor query info (stable pstate sclk/mclk).
  * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE.
  * - 3.27.0 - Add new chunk to to AMDGPU_CS to enable BO_LIST creation.
+ * - 3.28.0 - Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES
  */
 #define KMS_DRIVER_MAJOR	3
-#define KMS_DRIVER_MINOR	27
+#define KMS_DRIVER_MINOR	28
 #define KMS_DRIVER_PATCHLEVEL	0
 
 int amdgpu_vram_limit = 0;
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index be84e43c1e19..47296f2ec3fa 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -523,6 +523,7 @@ struct drm_amdgpu_gem_va {
 #define AMDGPU_CHUNK_ID_SYNCOBJ_IN      0x04
 #define AMDGPU_CHUNK_ID_SYNCOBJ_OUT     0x05
 #define AMDGPU_CHUNK_ID_BO_HANDLES      0x06
+#define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES	0x07
 
 struct drm_amdgpu_cs_chunk {
 	__u32		chunk_id;
-- 
cgit v1.3.1


From 41cca166cc57e75e94d888595a428d23a3bf4e36 Mon Sep 17 00:00:00 2001
From: Marek Olšák <marek.olsak@amd.com>
Date: Mon, 21 Jan 2019 17:22:55 -0500
Subject: drm/amdgpu: add a workaround for GDS ordered append hangs with
 compute queues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I'm not increasing the DRM version because GDS isn't totally without bugs yet.

v2: update emit_ib_size

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 19 +++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 21 +++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 40 +++++++++++++++++++++++++++++++--
 include/uapi/drm/amdgpu_drm.h           |  5 +++++
 6 files changed, 84 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 1158a6f4eec6..2f0ea380c031 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -72,9 +72,10 @@
  * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE.
  * - 3.27.0 - Add new chunk to to AMDGPU_CS to enable BO_LIST creation.
  * - 3.28.0 - Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES
+ * - 3.29.0 - Add AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID
  */
 #define KMS_DRIVER_MAJOR	3
-#define KMS_DRIVER_MINOR	28
+#define KMS_DRIVER_MINOR	29
 #define KMS_DRIVER_PATCHLEVEL	0
 
 int amdgpu_vram_limit = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
index ecbcefe49a98..f89f5734d985 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
@@ -37,6 +37,8 @@ struct amdgpu_gds {
 	struct amdgpu_gds_asic_info	mem;
 	struct amdgpu_gds_asic_info	gws;
 	struct amdgpu_gds_asic_info	oa;
+	uint32_t			gds_compute_max_wave_id;
+
 	/* At present, GDS, GWS and OA resources for gfx (graphics)
 	 * is always pre-allocated and available for graphics operation.
 	 * Such resource is shared between all gfx clients.
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index 7984292f9282..a59e0fdf5a97 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -2264,6 +2264,22 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high possibility to get wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This situation happens
+	 * randomly when at least 5 compute pipes use GDS ordered append.
+	 * The wave IDs generated by ME are also wrong after suspend/resume.
+	 * Those are probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+	 * GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
@@ -5000,7 +5016,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
 		7 + /* gfx_v7_0_ring_emit_pipeline_sync */
 		CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */
 		7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v7_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v7_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v7_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v7_0_ring_emit_fence_compute,
 	.emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
@@ -5057,6 +5073,7 @@ static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
 	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
+	adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
 
 	if (adev->gds.mem.total_size == 64 * 1024) {
 		adev->gds.mem.gfx_partition_size = 4096;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index a26747681ed6..b8e50a34bdb3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6084,6 +6084,22 @@ static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high possibility to get wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This situation happens
+	 * randomly when at least 5 compute pipes use GDS ordered append.
+	 * The wave IDs generated by ME are also wrong after suspend/resume.
+	 * Those are probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+	 * GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
@@ -6890,7 +6906,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
 		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
 		VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
 		7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v8_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v8_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v8_0_ring_emit_fence_compute,
 	.emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
@@ -6920,7 +6936,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
 		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
 		17 + /* gfx_v8_0_ring_emit_vm_flush */
 		7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v8_0_ring_emit_ib_compute */
 	.emit_fence = gfx_v8_0_ring_emit_fence_kiq,
 	.test_ring = gfx_v8_0_ring_test_ring,
 	.insert_nop = amdgpu_ring_insert_nop,
@@ -6996,6 +7012,7 @@ static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
 	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
+	adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
 
 	if (adev->gds.mem.total_size == 64 * 1024) {
 		adev->gds.mem.gfx_partition_size = 4096;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 262ee3cf6f1c..5533f6e4f4a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4010,6 +4010,22 @@ static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high possibility to get wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This situation happens
+	 * randomly when at least 5 compute pipes use GDS ordered append.
+	 * The wave IDs generated by ME are also wrong after suspend/resume.
+	 * Those are probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+	 * GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
 	amdgpu_ring_write(ring,
@@ -4729,7 +4745,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v9_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v9_0_ring_emit_fence,
 	.emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
@@ -4764,7 +4780,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
 	.emit_fence = gfx_v9_0_ring_emit_fence_kiq,
 	.test_ring = gfx_v9_0_ring_test_ring,
 	.insert_nop = amdgpu_ring_insert_nop,
@@ -4846,6 +4862,26 @@ static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev)
 		break;
 	}
 
+	switch (adev->asic_type) {
+	case CHIP_VEGA10:
+	case CHIP_VEGA20:
+		adev->gds.gds_compute_max_wave_id = 0x7ff;
+		break;
+	case CHIP_VEGA12:
+		adev->gds.gds_compute_max_wave_id = 0x27f;
+		break;
+	case CHIP_RAVEN:
+		if (adev->rev_id >= 0x8)
+			adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */
+		else
+			adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */
+		break;
+	default:
+		/* this really depends on the chip */
+		adev->gds.gds_compute_max_wave_id = 0x7ff;
+		break;
+	}
+
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
 
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 47296f2ec3fa..2acbc8bc465b 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -566,6 +566,11 @@ union drm_amdgpu_cs {
  * caches (L2/vL1/sL1/I$). */
 #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
 
+/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER.
+ * This will reset wave ID counters for the IB.
+ */
+#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
+
 struct drm_amdgpu_cs_chunk_ib {
 	__u32 _pad;
 	/** AMDGPU_IB_FLAG_* */
-- 
cgit v1.3.1


From 5a5fccbd8c315c08db01e585e1cbe88e30b70691 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:24 +0100
Subject: gpu: host1x: Introduce support for wide opcodes

The CDMA push buffer can currently only handle opcodes that take a
single word parameter. However, the host1x implementation on Tegra186
and later supports opcodes that require multiple words as parameters.

Unfortunately the way the push buffer is structured, these wide opcodes
cannot simply be composed of two regular opcodes because that could
result in the wide opcode being split across the end of the push buffer
and the final RESTART opcode required to wrap the push buffer around
would break the wide opcode.

One way to fix this would be to remove the concept of slots to simplify
push buffer operations. However, that's not entirely trivial and should
be done in a separate patch. For now, simply use a different function
to push four-word opcodes into the push buffer. Technically only three
words are pushed, with the fourth word used as padding to preserve the
2-word alignment required by the slots abstraction. The fourth word is
always a NOP opcode.

Additional care must be taken when the end of the push buffer is
reached. If a four-word opcode doesn't fit into the push buffer without
being split by the boundary, NOP opcodes will be introduced and the new
wide opcode placed at the beginning of the push buffer.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/cdma.c     | 92 +++++++++++++++++++++++++++++++++++++++++++
 drivers/gpu/host1x/cdma.h     |  2 +
 include/trace/events/host1x.h | 26 ++++++++++++
 3 files changed, 120 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index 15bdeb836e3d..64099cc1964b 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -217,6 +217,45 @@ unsigned int host1x_cdma_wait_locked(struct host1x_cdma *cdma,
 	return 0;
 }
 
+/*
+ * Sleep (if necessary) until the push buffer has enough free space.
+ *
+ * Must be called with the cdma lock held.
+ */
+int host1x_cdma_wait_pushbuffer_space(struct host1x *host1x,
+				      struct host1x_cdma *cdma,
+				      unsigned int needed)
+{
+	while (true) {
+		struct push_buffer *pb = &cdma->push_buffer;
+		unsigned int space;
+
+		space = host1x_pushbuffer_space(pb);
+		if (space >= needed)
+			break;
+
+		trace_host1x_wait_cdma(dev_name(cdma_to_channel(cdma)->dev),
+				       CDMA_EVENT_PUSH_BUFFER_SPACE);
+
+		host1x_hw_cdma_flush(host1x, cdma);
+
+		/* If somebody has managed to already start waiting, yield */
+		if (cdma->event != CDMA_EVENT_NONE) {
+			mutex_unlock(&cdma->lock);
+			schedule();
+			mutex_lock(&cdma->lock);
+			continue;
+		}
+
+		cdma->event = CDMA_EVENT_PUSH_BUFFER_SPACE;
+
+		mutex_unlock(&cdma->lock);
+		wait_for_completion(&cdma->complete);
+		mutex_lock(&cdma->lock);
+	}
+
+	return 0;
+}
 /*
  * Start timer that tracks the time spent by the job.
  * Must be called with the cdma lock held.
@@ -509,6 +548,59 @@ void host1x_cdma_push(struct host1x_cdma *cdma, u32 op1, u32 op2)
 	host1x_pushbuffer_push(pb, op1, op2);
 }
 
+/*
+ * Push four words into two consecutive push buffer slots. Note that extra
+ * care needs to be taken not to split the two slots across the end of the
+ * push buffer. Otherwise the RESTART opcode at the end of the push buffer
+ * that ensures processing will restart at the beginning will break up the
+ * four words.
+ *
+ * Blocks as necessary if the push buffer is full.
+ */
+void host1x_cdma_push_wide(struct host1x_cdma *cdma, u32 op1, u32 op2,
+			   u32 op3, u32 op4)
+{
+	struct host1x_channel *channel = cdma_to_channel(cdma);
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct push_buffer *pb = &cdma->push_buffer;
+	unsigned int needed = 2, extra = 0, i;
+	unsigned int space = cdma->slots_free;
+
+	if (host1x_debug_trace_cmdbuf)
+		trace_host1x_cdma_push_wide(dev_name(channel->dev), op1, op2,
+					    op3, op4);
+
+	/* compute number of extra slots needed for padding */
+	if (pb->pos + 16 > pb->size) {
+		extra = (pb->size - pb->pos) / 8;
+		needed += extra;
+	}
+
+	host1x_cdma_wait_pushbuffer_space(host1x, cdma, needed);
+	space = host1x_pushbuffer_space(pb);
+
+	cdma->slots_free = space - needed;
+	cdma->slots_used += needed;
+
+	/*
+	 * Note that we rely on the fact that this is only used to submit wide
+	 * gather opcodes, which consist of 3 words, and they are padded with
+	 * a NOP to avoid having to deal with fractional slots (a slot always
+	 * represents 2 words). The fourth opcode passed to this function will
+	 * therefore always be a NOP.
+	 *
+	 * This works around a slight ambiguity when it comes to opcodes. For
+	 * all current host1x incarnations the NOP opcode uses the exact same
+	 * encoding (0x20000000), so we could hard-code the value here, but a
+	 * new incarnation may change it and break that assumption.
+	 */
+	for (i = 0; i < extra; i++)
+		host1x_pushbuffer_push(pb, op4, op4);
+
+	host1x_pushbuffer_push(pb, op1, op2);
+	host1x_pushbuffer_push(pb, op3, op4);
+}
+
 /*
  * End a cdma submit
  * Kick off DMA, add job to the sync queue, and a number of slots to be freed
diff --git a/drivers/gpu/host1x/cdma.h b/drivers/gpu/host1x/cdma.h
index 71078359c626..3a5e0408b8d1 100644
--- a/drivers/gpu/host1x/cdma.h
+++ b/drivers/gpu/host1x/cdma.h
@@ -90,6 +90,8 @@ int host1x_cdma_init(struct host1x_cdma *cdma);
 int host1x_cdma_deinit(struct host1x_cdma *cdma);
 int host1x_cdma_begin(struct host1x_cdma *cdma, struct host1x_job *job);
 void host1x_cdma_push(struct host1x_cdma *cdma, u32 op1, u32 op2);
+void host1x_cdma_push_wide(struct host1x_cdma *cdma, u32 op1, u32 op2,
+			   u32 op3, u32 op4);
 void host1x_cdma_end(struct host1x_cdma *cdma, struct host1x_job *job);
 void host1x_cdma_update(struct host1x_cdma *cdma);
 void host1x_cdma_peek(struct host1x_cdma *cdma, u32 dmaget, int slot,
diff --git a/include/trace/events/host1x.h b/include/trace/events/host1x.h
index a37ef73092e5..3d340b6f1ea3 100644
--- a/include/trace/events/host1x.h
+++ b/include/trace/events/host1x.h
@@ -80,6 +80,32 @@ TRACE_EVENT(host1x_cdma_push,
 		__entry->name, __entry->op1, __entry->op2)
 );
 
+TRACE_EVENT(host1x_cdma_push_wide,
+	TP_PROTO(const char *name, u32 op1, u32 op2, u32 op3, u32 op4),
+
+	TP_ARGS(name, op1, op2, op3, op4),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(u32, op1)
+		__field(u32, op2)
+		__field(u32, op3)
+		__field(u32, op4)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->op1 = op1;
+		__entry->op2 = op2;
+		__entry->op3 = op3;
+		__entry->op4 = op4;
+	),
+
+	TP_printk("name=%s, op1=%08x, op2=%08x, op3=%08x op4=%08x",
+		__entry->name, __entry->op1, __entry->op2, __entry->op3,
+		__entry->op4)
+);
+
 TRACE_EVENT(host1x_cdma_push_gather,
 	TP_PROTO(const char *name, struct host1x_bo *bo,
 			u32 words, u32 offset, void *cmdbuf),
-- 
cgit v1.3.1