From 3affaa5a7ca3ca114e01049bc45e3ed4a3955505 Mon Sep 17 00:00:00 2001 From: Brian Starkey Date: Mon, 3 Dec 2018 11:31:57 +0000 Subject: drm/afbc: Add AFBC modifier usage documentation AFBC is a flexible, proprietary, lossless compression protocol and format, with a number of defined DRM format modifiers. To facilitate consistency and compatibility between different AFBC producers and consumers, document the expectations for usage of the AFBC DRM format modifiers in a new .rst chapter. Signed-off-by: Brian Starkey Reviewed-by: Liviu Dudau [Updated MAINTAINERS entry to show that "Mali DP Maintainers" is actually a mailing list and added an SPDX-License-Identifier to the documentation] Signed-off-by: Liviu Dudau --- include/uapi/drm/drm_fourcc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 0b44260a5ee9..df014ede4e57 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -572,6 +572,9 @@ extern "C" { * AFBC has several features which may be supported and/or used, which are * represented using bits in the modifier. Not all combinations are valid, * and different devices or use-cases may support different combinations. + * + * Further information on the use of AFBC modifiers can be found in + * Documentation/gpu/afbc.rst */ #define DRM_FORMAT_MOD_ARM_AFBC(__afbc_mode) fourcc_mod_code(ARM, __afbc_mode) -- cgit v1.3.1 From 03ca3cf8e9aa7549e6c398462af0f68bdd43e7fe Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Thu, 17 Jan 2019 21:59:43 -0800 Subject: drm/i915/icl: Adding few more device IDs for Ice Lake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We just got aware that there was more IDs available at spec, so let's add them already. Cc: James Ausmus Cc: José Roberto de Souza Signed-off-by: Rodrigo Vivi Reviewed-by: Mika Kuoppala Reviewed-by: José Roberto de Souza Signed-off-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20190118055943.10252-1-rodrigo.vivi@intel.com --- include/drm/i915_pciids.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h index 192667144693..df72be7e8b88 100644 --- a/include/drm/i915_pciids.h +++ b/include/drm/i915_pciids.h @@ -457,9 +457,13 @@ INTEL_VGA_DEVICE(0x8A51, info), \ INTEL_VGA_DEVICE(0x8A5C, info), \ INTEL_VGA_DEVICE(0x8A5D, info), \ + INTEL_VGA_DEVICE(0x8A59, info), \ + INTEL_VGA_DEVICE(0x8A58, info), \ INTEL_VGA_DEVICE(0x8A52, info), \ INTEL_VGA_DEVICE(0x8A5A, info), \ INTEL_VGA_DEVICE(0x8A5B, info), \ + INTEL_VGA_DEVICE(0x8A57, info), \ + INTEL_VGA_DEVICE(0x8A56, info), \ INTEL_VGA_DEVICE(0x8A71, info), \ INTEL_VGA_DEVICE(0x8A70, info) -- cgit v1.3.1 From 3c8861d84a4d2c6cd7221d18e49bf9201c6c6115 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Mon, 17 Dec 2018 14:44:14 -0800 Subject: drm: Add color management LUT validation helper (v4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some hardware may place additional restrictions on the gamma/degamma curves described by our LUT properties. E.g., that a gamma curve never decreases or that the red/green/blue channels of a LUT's entries must be equal. Let's add a helper function that drivers can use to test that a userspace-provided LUT is valid and doesn't violate hardware requirements. v2: - Combine into a single helper that just takes a bitmask of the tests to apply. (Brian Starkey) - Add additional check (always performed) that LUT property blob size is always a multiple of the LUT entry size. (stolen from ARM driver) v3: - Drop the LUT size check again since drm_atomic_replace_property_blob_from_id() already covers this for us. (Alexandru Gheorghe) v4: - Use an enum to describe possible test values rather than #define's; this is cleaner to provide kerneldoc for. (Daniel Vetter) - s/DRM_COLOR_LUT_INCREASING/DRM_COLOR_LUT_NON_DECREASING/. (Ville) Cc: Uma Shankar Cc: Swati Sharma Cc: Brian Starkey Cc: Daniel Vetter Cc: Ville Syrjälä Signed-off-by: Matt Roper Reviewed-by: Brian Starkey Reviewed-by: Alexandru Gheorghe Reviewed-by: Uma Shankar Acked-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20181217224415.12848-1-matthew.d.roper@intel.com --- drivers/gpu/drm/drm_color_mgmt.c | 44 ++++++++++++++++++++++++++++++++++++++++ include/drm/drm_color_mgmt.h | 29 ++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/drm_color_mgmt.c b/drivers/gpu/drm/drm_color_mgmt.c index 07dcf47daafe..968ca7c91ad8 100644 --- a/drivers/gpu/drm/drm_color_mgmt.c +++ b/drivers/gpu/drm/drm_color_mgmt.c @@ -462,3 +462,47 @@ int drm_plane_create_color_properties(struct drm_plane *plane, return 0; } EXPORT_SYMBOL(drm_plane_create_color_properties); + +/** + * drm_color_lut_check - check validity of lookup table + * @lut: property blob containing LUT to check + * @tests: bitmask of tests to run + * + * Helper to check whether a userspace-provided lookup table is valid and + * satisfies hardware requirements. Drivers pass a bitmask indicating which of + * the tests in &drm_color_lut_tests should be performed. + * + * Returns 0 on success, -EINVAL on failure. + */ +int drm_color_lut_check(struct drm_property_blob *lut, + uint32_t tests) +{ + struct drm_color_lut *entry; + int i; + + if (!lut || !tests) + return 0; + + entry = lut->data; + for (i = 0; i < drm_color_lut_size(lut); i++) { + if (tests & DRM_COLOR_LUT_EQUAL_CHANNELS) { + if (entry[i].red != entry[i].blue || + entry[i].red != entry[i].green) { + DRM_DEBUG_KMS("All LUT entries must have equal r/g/b\n"); + return -EINVAL; + } + } + + if (i > 0 && tests & DRM_COLOR_LUT_NON_DECREASING) { + if (entry[i].red < entry[i - 1].red || + entry[i].green < entry[i - 1].green || + entry[i].blue < entry[i - 1].blue) { + DRM_DEBUG_KMS("LUT entries must never decrease.\n"); + return -EINVAL; + } + } + } + + return 0; +} +EXPORT_SYMBOL(drm_color_lut_check); diff --git a/include/drm/drm_color_mgmt.h b/include/drm/drm_color_mgmt.h index 90ef9996d9a4..6affbda6d9cb 100644 --- a/include/drm/drm_color_mgmt.h +++ b/include/drm/drm_color_mgmt.h @@ -69,4 +69,33 @@ int drm_plane_create_color_properties(struct drm_plane *plane, u32 supported_ranges, enum drm_color_encoding default_encoding, enum drm_color_range default_range); + +/** + * enum drm_color_lut_tests - hw-specific LUT tests to perform + * + * The drm_color_lut_check() function takes a bitmask of the values here to + * determine which tests to apply to a userspace-provided LUT. + */ +enum drm_color_lut_tests { + /** + * @DRM_COLOR_LUT_EQUAL_CHANNELS: + * + * Checks whether the entries of a LUT all have equal values for the + * red, green, and blue channels. Intended for hardware that only + * accepts a single value per LUT entry and assumes that value applies + * to all three color components. + */ + DRM_COLOR_LUT_EQUAL_CHANNELS = BIT(0), + + /** + * @DRM_COLOR_LUT_NON_DECREASING: + * + * Checks whether the entries of a LUT are always flat or increasing + * (never decreasing). + */ + DRM_COLOR_LUT_NON_DECREASING = BIT(1), +}; + +int drm_color_lut_check(struct drm_property_blob *lut, + uint32_t tests); #endif -- cgit v1.3.1 From ae6d343541bb75958e9535d056adaf4ff6a66d6a Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Thu, 10 Jan 2019 17:56:39 +0800 Subject: drm/ttm: add lru notify to bo driver v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit allow driver do somethings when lru changed. v2: address Michel's comments. Signed-off-by: Chunming Zhou Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/ttm/ttm_bo.c | 11 +++++++---- include/drm/ttm/ttm_bo_driver.h | 9 +++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 0ec08394e17a..de088c8070fb 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -198,19 +198,22 @@ static void ttm_bo_ref_bug(struct kref *list_kref) void ttm_bo_del_from_lru(struct ttm_buffer_object *bo) { + struct ttm_bo_device *bdev = bo->bdev; + bool notify = false; + if (!list_empty(&bo->swap)) { list_del_init(&bo->swap); kref_put(&bo->list_kref, ttm_bo_ref_bug); + notify = true; } if (!list_empty(&bo->lru)) { list_del_init(&bo->lru); kref_put(&bo->list_kref, ttm_bo_ref_bug); + notify = true; } - /* - * TODO: Add a driver hook to delete from - * driver-specific LRU's here. - */ + if (notify && bdev->driver->del_from_lru_notify) + bdev->driver->del_from_lru_notify(bo); } void ttm_bo_del_sub_from_lru(struct ttm_buffer_object *bo) diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index 1021106438b2..15829b24277c 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -381,6 +381,15 @@ struct ttm_bo_driver { */ int (*access_memory)(struct ttm_buffer_object *bo, unsigned long offset, void *buf, int len, int write); + + /** + * struct ttm_bo_driver member del_from_lru_notify + * + * @bo: the buffer object deleted from lru + * + * notify driver that a BO was deleted from LRU. + */ + void (*del_from_lru_notify)(struct ttm_buffer_object *bo); }; /** -- cgit v1.3.1 From 222b5f044159877504dbac9bc1910f89a74136e2 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 4 Dec 2018 16:56:14 -0500 Subject: drm/sched: Refactor ring mirror list handling. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Decauple sched threads stop and start and ring mirror list handling from the policy of what to do about the guilty jobs. When stoppping the sched thread and detaching sched fences from non signaled HW fenes wait for all signaled HW fences to complete before rerunning the jobs. v2: Fix resubmission of guilty job into HW after refactoring. v4: Full restart for all the jobs, not only from guilty ring. Extract karma increase into standalone function. v5: Rework waiting for signaled jobs without relying on the job struct itself as those might already be freed for non 'guilty' job's schedulers. Expose karma increase to drivers. v6: Use list_for_each_entry_safe_continue and drm_sched_process_job in case fence already signaled. Call drm_sched_increase_karma only once for amdgpu and add documentation. v7: Wait only for the latest job's fence. Suggested-by: Christian Koenig Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 ++-- drivers/gpu/drm/etnaviv/etnaviv_sched.c | 11 +- drivers/gpu/drm/scheduler/sched_main.c | 170 ++++++++++++++++++----------- drivers/gpu/drm/v3d/v3d_sched.c | 13 ++- include/drm/gpu_scheduler.h | 7 +- 5 files changed, 131 insertions(+), 90 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e20dce438d37..d7dddb936f84 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3313,17 +3313,15 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if (!ring || !ring->sched.thread) continue; - kthread_park(ring->sched.thread); - - if (job && job->base.sched != &ring->sched) - continue; - - drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL); + drm_sched_stop(&ring->sched); /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); } + if(job) + drm_sched_increase_karma(&job->base); + if (!amdgpu_sriov_vf(adev)) { @@ -3469,14 +3467,10 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev, if (!ring || !ring->sched.thread) continue; - /* only need recovery sched of the given job's ring - * or all rings (in the case @job is NULL) - * after above amdgpu_reset accomplished - */ - if ((!job || job->base.sched == &ring->sched) && !adev->asic_reset_res) - drm_sched_job_recovery(&ring->sched); + if (!adev->asic_reset_res) + drm_sched_resubmit_jobs(&ring->sched); - kthread_unpark(ring->sched.thread); + drm_sched_start(&ring->sched, !adev->asic_reset_res); } if (!amdgpu_device_has_dc_support(adev)) { diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c index 49a6763693f1..67ae26602024 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c @@ -109,16 +109,19 @@ static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job) } /* block scheduler */ - kthread_park(gpu->sched.thread); - drm_sched_hw_job_reset(&gpu->sched, sched_job); + drm_sched_stop(&gpu->sched); + + if(sched_job) + drm_sched_increase_karma(sched_job); /* get the GPU back into the init state */ etnaviv_core_dump(gpu); etnaviv_gpu_recover_hang(gpu); + drm_sched_resubmit_jobs(&gpu->sched); + /* restart scheduler after GPU is usable again */ - drm_sched_job_recovery(&gpu->sched); - kthread_unpark(gpu->sched.thread); + drm_sched_start(&gpu->sched, true); } static void etnaviv_sched_free_job(struct drm_sched_job *sched_job) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index dbb69063b3d5..4bb18511ac75 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -60,8 +60,6 @@ static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb); -static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job); - /** * drm_sched_rq_init - initialize a given run queue struct * @@ -335,6 +333,51 @@ static void drm_sched_job_timedout(struct work_struct *work) spin_unlock_irqrestore(&sched->job_list_lock, flags); } + /** + * drm_sched_increase_karma - Update sched_entity guilty flag + * + * @bad: The job guilty of time out + * + * Increment on every hang caused by the 'bad' job. If this exceeds the hang + * limit of the scheduler then the respective sched entity is marked guilty and + * jobs from it will not be scheduled further + */ +void drm_sched_increase_karma(struct drm_sched_job *bad) +{ + int i; + struct drm_sched_entity *tmp; + struct drm_sched_entity *entity; + struct drm_gpu_scheduler *sched = bad->sched; + + /* don't increase @bad's karma if it's from KERNEL RQ, + * because sometimes GPU hang would cause kernel jobs (like VM updating jobs) + * corrupt but keep in mind that kernel jobs always considered good. + */ + if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) { + atomic_inc(&bad->karma); + for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL; + i++) { + struct drm_sched_rq *rq = &sched->sched_rq[i]; + + spin_lock(&rq->lock); + list_for_each_entry_safe(entity, tmp, &rq->entities, list) { + if (bad->s_fence->scheduled.context == + entity->fence_context) { + if (atomic_read(&bad->karma) > + bad->sched->hang_limit) + if (entity->guilty) + atomic_set(entity->guilty, 1); + break; + } + } + spin_unlock(&rq->lock); + if (&entity->list != &rq->entities) + break; + } + } +} +EXPORT_SYMBOL(drm_sched_increase_karma); + /** * drm_sched_hw_job_reset - stop the scheduler if it contains the bad job * @@ -342,13 +385,20 @@ static void drm_sched_job_timedout(struct work_struct *work) * @bad: bad scheduler job * */ -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) +void drm_sched_stop(struct drm_gpu_scheduler *sched) { struct drm_sched_job *s_job; - struct drm_sched_entity *entity, *tmp; unsigned long flags; - int i; + struct dma_fence *last_fence = NULL; + kthread_park(sched->thread); + + /* + * Verify all the signaled jobs in mirror list are removed from the ring + * by waiting for the latest job to enter the list. This should insure that + * also all the previous jobs that were in flight also already singaled + * and removed from the list. + */ spin_lock_irqsave(&sched->job_list_lock, flags); list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) { if (s_job->s_fence->parent && @@ -357,35 +407,20 @@ void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_jo dma_fence_put(s_job->s_fence->parent); s_job->s_fence->parent = NULL; atomic_dec(&sched->hw_rq_count); + } else { + last_fence = dma_fence_get(&s_job->s_fence->finished); + break; } } spin_unlock_irqrestore(&sched->job_list_lock, flags); - if (bad && bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) { - atomic_inc(&bad->karma); - /* don't increase @bad's karma if it's from KERNEL RQ, - * becuase sometimes GPU hang would cause kernel jobs (like VM updating jobs) - * corrupt but keep in mind that kernel jobs always considered good. - */ - for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL; i++ ) { - struct drm_sched_rq *rq = &sched->sched_rq[i]; - - spin_lock(&rq->lock); - list_for_each_entry_safe(entity, tmp, &rq->entities, list) { - if (bad->s_fence->scheduled.context == entity->fence_context) { - if (atomic_read(&bad->karma) > bad->sched->hang_limit) - if (entity->guilty) - atomic_set(entity->guilty, 1); - break; - } - } - spin_unlock(&rq->lock); - if (&entity->list != &rq->entities) - break; - } + if (last_fence) { + dma_fence_wait(last_fence, false); + dma_fence_put(last_fence); } } -EXPORT_SYMBOL(drm_sched_hw_job_reset); + +EXPORT_SYMBOL(drm_sched_stop); /** * drm_sched_job_recovery - recover jobs after a reset @@ -393,33 +428,21 @@ EXPORT_SYMBOL(drm_sched_hw_job_reset); * @sched: scheduler instance * */ -void drm_sched_job_recovery(struct drm_gpu_scheduler *sched) +void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) { struct drm_sched_job *s_job, *tmp; - bool found_guilty = false; unsigned long flags; int r; + if (!full_recovery) + goto unpark; + spin_lock_irqsave(&sched->job_list_lock, flags); list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { struct drm_sched_fence *s_fence = s_job->s_fence; - struct dma_fence *fence; - uint64_t guilty_context; - - if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) { - found_guilty = true; - guilty_context = s_job->s_fence->scheduled.context; - } - - if (found_guilty && s_job->s_fence->scheduled.context == guilty_context) - dma_fence_set_error(&s_fence->finished, -ECANCELED); - - spin_unlock_irqrestore(&sched->job_list_lock, flags); - fence = sched->ops->run_job(s_job); - atomic_inc(&sched->hw_rq_count); + struct dma_fence *fence = s_job->s_fence->parent; if (fence) { - s_fence->parent = dma_fence_get(fence); r = dma_fence_add_callback(fence, &s_fence->cb, drm_sched_process_job); if (r == -ENOENT) @@ -427,18 +450,47 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched) else if (r) DRM_ERROR("fence add callback failed (%d)\n", r); - dma_fence_put(fence); - } else { - if (s_fence->finished.error < 0) - drm_sched_expel_job_unlocked(s_job); + } else drm_sched_process_job(NULL, &s_fence->cb); - } - spin_lock_irqsave(&sched->job_list_lock, flags); } + drm_sched_start_timeout(sched); spin_unlock_irqrestore(&sched->job_list_lock, flags); + +unpark: + kthread_unpark(sched->thread); } -EXPORT_SYMBOL(drm_sched_job_recovery); +EXPORT_SYMBOL(drm_sched_start); + +/** + * drm_sched_resubmit_jobs - helper to relunch job from mirror ring list + * + * @sched: scheduler instance + * + */ +void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) +{ + struct drm_sched_job *s_job, *tmp; + uint64_t guilty_context; + bool found_guilty = false; + + /*TODO DO we need spinlock here ? */ + list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { + struct drm_sched_fence *s_fence = s_job->s_fence; + + if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) { + found_guilty = true; + guilty_context = s_job->s_fence->scheduled.context; + } + + if (found_guilty && s_job->s_fence->scheduled.context == guilty_context) + dma_fence_set_error(&s_fence->finished, -ECANCELED); + + s_job->s_fence->parent = sched->ops->run_job(s_job); + atomic_inc(&sched->hw_rq_count); + } +} +EXPORT_SYMBOL(drm_sched_resubmit_jobs); /** * drm_sched_job_init - init a scheduler job @@ -634,26 +686,14 @@ static int drm_sched_main(void *param) DRM_ERROR("fence add callback failed (%d)\n", r); dma_fence_put(fence); - } else { - if (s_fence->finished.error < 0) - drm_sched_expel_job_unlocked(sched_job); + } else drm_sched_process_job(NULL, &s_fence->cb); - } wake_up(&sched->job_scheduled); } return 0; } -static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job) -{ - struct drm_gpu_scheduler *sched = s_job->sched; - - spin_lock(&sched->job_list_lock); - list_del_init(&s_job->node); - spin_unlock(&sched->job_list_lock); -} - /** * drm_sched_init - Init a gpu scheduler instance * diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index f7508e907536..4704b2df3688 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -234,18 +234,21 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job) for (q = 0; q < V3D_MAX_QUEUES; q++) { struct drm_gpu_scheduler *sched = &v3d->queue[q].sched; - kthread_park(sched->thread); - drm_sched_hw_job_reset(sched, (sched_job->sched == sched ? - sched_job : NULL)); + drm_sched_stop(sched); + + if(sched_job) + drm_sched_increase_karma(sched_job); } /* get the GPU back into the init state */ v3d_reset(v3d); + for (q = 0; q < V3D_MAX_QUEUES; q++) + drm_sched_resubmit_jobs(sched_job->sched); + /* Unblock schedulers and restart their jobs. */ for (q = 0; q < V3D_MAX_QUEUES; q++) { - drm_sched_job_recovery(&v3d->queue[q].sched); - kthread_unpark(v3d->queue[q].sched.thread); + drm_sched_start(&v3d->queue[q].sched, true); } mutex_unlock(&v3d->reset_lock); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 47e19796c450..c567bba91ba0 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -298,9 +298,10 @@ int drm_sched_job_init(struct drm_sched_job *job, void *owner); void drm_sched_job_cleanup(struct drm_sched_job *job); void drm_sched_wakeup(struct drm_gpu_scheduler *sched); -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, - struct drm_sched_job *job); -void drm_sched_job_recovery(struct drm_gpu_scheduler *sched); +void drm_sched_stop(struct drm_gpu_scheduler *sched); +void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery); +void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched); +void drm_sched_increase_karma(struct drm_sched_job *bad); bool drm_sched_dependency_optimized(struct dma_fence* fence, struct drm_sched_entity *entity); void drm_sched_fault(struct drm_gpu_scheduler *sched); -- cgit v1.3.1 From 3741540e04137256df82105bcd720a5e27423c34 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 5 Dec 2018 14:21:28 -0500 Subject: drm/sched: Rework HW fence processing. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expedite job deletion from ring mirror list to the HW fence signal callback instead from finish_work, together with waiting for all such fences to signal in drm_sched_stop we garantee that already signaled job will not be processed twice. Remove the sched finish fence callback and just submit finish_work directly from the HW fence callback. v2: Fix comments. v3: Attach hw fence cb to sched_job v5: Rebase Suggested-by: Christian Koenig Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/scheduler/sched_main.c | 55 +++++++++++++++++----------------- include/drm/gpu_scheduler.h | 6 ++-- 2 files changed, 29 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 4bb18511ac75..19fc601c9eeb 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -284,8 +284,6 @@ static void drm_sched_job_finish(struct work_struct *work) cancel_delayed_work_sync(&sched->work_tdr); spin_lock_irqsave(&sched->job_list_lock, flags); - /* remove job from ring_mirror_list */ - list_del_init(&s_job->node); /* queue TDR for next job */ drm_sched_start_timeout(sched); spin_unlock_irqrestore(&sched->job_list_lock, flags); @@ -293,22 +291,11 @@ static void drm_sched_job_finish(struct work_struct *work) sched->ops->free_job(s_job); } -static void drm_sched_job_finish_cb(struct dma_fence *f, - struct dma_fence_cb *cb) -{ - struct drm_sched_job *job = container_of(cb, struct drm_sched_job, - finish_cb); - schedule_work(&job->finish_work); -} - static void drm_sched_job_begin(struct drm_sched_job *s_job) { struct drm_gpu_scheduler *sched = s_job->sched; unsigned long flags; - dma_fence_add_callback(&s_job->s_fence->finished, &s_job->finish_cb, - drm_sched_job_finish_cb); - spin_lock_irqsave(&sched->job_list_lock, flags); list_add_tail(&s_job->node, &sched->ring_mirror_list); drm_sched_start_timeout(sched); @@ -403,7 +390,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched) list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) { if (s_job->s_fence->parent && dma_fence_remove_callback(s_job->s_fence->parent, - &s_job->s_fence->cb)) { + &s_job->cb)) { dma_fence_put(s_job->s_fence->parent); s_job->s_fence->parent = NULL; atomic_dec(&sched->hw_rq_count); @@ -431,31 +418,34 @@ EXPORT_SYMBOL(drm_sched_stop); void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) { struct drm_sched_job *s_job, *tmp; - unsigned long flags; int r; if (!full_recovery) goto unpark; - spin_lock_irqsave(&sched->job_list_lock, flags); + /* + * Locking the list is not required here as the sched thread is parked + * so no new jobs are being pushed in to HW and in drm_sched_stop we + * flushed all the jobs who were still in mirror list but who already + * signaled and removed them self from the list. Also concurrent + * GPU recovers can't run in parallel. + */ list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { - struct drm_sched_fence *s_fence = s_job->s_fence; struct dma_fence *fence = s_job->s_fence->parent; if (fence) { - r = dma_fence_add_callback(fence, &s_fence->cb, + r = dma_fence_add_callback(fence, &s_job->cb, drm_sched_process_job); if (r == -ENOENT) - drm_sched_process_job(fence, &s_fence->cb); + drm_sched_process_job(fence, &s_job->cb); else if (r) DRM_ERROR("fence add callback failed (%d)\n", r); } else - drm_sched_process_job(NULL, &s_fence->cb); + drm_sched_process_job(NULL, &s_job->cb); } drm_sched_start_timeout(sched); - spin_unlock_irqrestore(&sched->job_list_lock, flags); unpark: kthread_unpark(sched->thread); @@ -604,18 +594,27 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched) */ static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb) { - struct drm_sched_fence *s_fence = - container_of(cb, struct drm_sched_fence, cb); + struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb); + struct drm_sched_fence *s_fence = s_job->s_fence; struct drm_gpu_scheduler *sched = s_fence->sched; + unsigned long flags; + + cancel_delayed_work(&sched->work_tdr); - dma_fence_get(&s_fence->finished); atomic_dec(&sched->hw_rq_count); atomic_dec(&sched->num_jobs); + + spin_lock_irqsave(&sched->job_list_lock, flags); + /* remove job from ring_mirror_list */ + list_del_init(&s_job->node); + spin_unlock_irqrestore(&sched->job_list_lock, flags); + drm_sched_fence_finished(s_fence); trace_drm_sched_process_job(s_fence); - dma_fence_put(&s_fence->finished); wake_up_interruptible(&sched->wake_up_worker); + + schedule_work(&s_job->finish_work); } /** @@ -678,16 +677,16 @@ static int drm_sched_main(void *param) if (fence) { s_fence->parent = dma_fence_get(fence); - r = dma_fence_add_callback(fence, &s_fence->cb, + r = dma_fence_add_callback(fence, &sched_job->cb, drm_sched_process_job); if (r == -ENOENT) - drm_sched_process_job(fence, &s_fence->cb); + drm_sched_process_job(fence, &sched_job->cb); else if (r) DRM_ERROR("fence add callback failed (%d)\n", r); dma_fence_put(fence); } else - drm_sched_process_job(NULL, &s_fence->cb); + drm_sched_process_job(NULL, &sched_job->cb); wake_up(&sched->job_scheduled); } diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index c567bba91ba0..0daca4d8dad9 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -137,10 +137,6 @@ struct drm_sched_fence { */ struct dma_fence finished; - /** - * @cb: the callback for the parent fence below. - */ - struct dma_fence_cb cb; /** * @parent: the fence returned by &drm_sched_backend_ops.run_job * when scheduling the job on hardware. We signal the @@ -181,6 +177,7 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); * be scheduled further. * @s_priority: the priority of the job. * @entity: the entity to which this job belongs. + * @cb: the callback for the parent fence in s_fence. * * A job is created by the driver using drm_sched_job_init(), and * should call drm_sched_entity_push_job() once it wants the scheduler @@ -197,6 +194,7 @@ struct drm_sched_job { atomic_t karma; enum drm_sched_priority s_priority; struct drm_sched_entity *entity; + struct dma_fence_cb cb; }; static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job, -- cgit v1.3.1 From cbce5f0a9f306f890d257a92fcca6cb341af76ed Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 25 Jan 2019 12:02:11 +0100 Subject: drm/ttm: Remove ttm_bo_reference and ttm_bo_unref MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both functions are obsolete and all calls have been replaced by ttm_bo_get and ttm_bo_put. Signed-off-by: Thomas Zimmermann Reviewed-by: Christian König Reviewed-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/ttm/ttm_bo.c | 9 --------- include/drm/ttm/ttm_bo_api.h | 28 ---------------------------- 2 files changed, 37 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index de088c8070fb..3f56647cdb35 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -679,15 +679,6 @@ void ttm_bo_put(struct ttm_buffer_object *bo) } EXPORT_SYMBOL(ttm_bo_put); -void ttm_bo_unref(struct ttm_buffer_object **p_bo) -{ - struct ttm_buffer_object *bo = *p_bo; - - *p_bo = NULL; - ttm_bo_put(bo); -} -EXPORT_SYMBOL(ttm_bo_unref); - int ttm_bo_lock_delayed_workqueue(struct ttm_bo_device *bdev) { return cancel_delayed_work_sync(&bdev->wq); diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index 3fc4854dce49..49d9cdfc58f2 100644 --- a/include/drm/ttm/ttm_bo_api.h +++ b/include/drm/ttm/ttm_bo_api.h @@ -295,23 +295,6 @@ static inline void ttm_bo_get(struct ttm_buffer_object *bo) kref_get(&bo->kref); } -/** - * ttm_bo_reference - reference a struct ttm_buffer_object - * - * @bo: The buffer object. - * - * Returns a refcounted pointer to a buffer object. - * - * This function is deprecated. Use @ttm_bo_get instead. - */ - -static inline struct ttm_buffer_object * -ttm_bo_reference(struct ttm_buffer_object *bo) -{ - ttm_bo_get(bo); - return bo; -} - /** * ttm_bo_get_unless_zero - reference a struct ttm_buffer_object unless * its refcount has already reached zero. @@ -386,17 +369,6 @@ int ttm_bo_validate(struct ttm_buffer_object *bo, */ void ttm_bo_put(struct ttm_buffer_object *bo); -/** - * ttm_bo_unref - * - * @bo: The buffer object. - * - * Unreference and clear a pointer to a buffer object. - * - * This function is deprecated. Use @ttm_bo_put instead. - */ -void ttm_bo_unref(struct ttm_buffer_object **bo); - /** * ttm_bo_add_to_lru * -- cgit v1.3.1 From 5a3db6f08a8eae15bff597dce290ac238daeb717 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Tue, 29 Jan 2019 19:06:09 +0200 Subject: drm: Constify drm_color_lut_check() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drm_color_lut_check() doens't modify the passed in blob so let's make it const. Also s/uint32_t/u32/ while at it. v2: Reduce line wraps (Sam) Cc: Matt Roper Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20190129170609.5718-1-ville.syrjala@linux.intel.com Reviewed-by: Sam Ravnborg Acked-by: Daniel Vetter --- drivers/gpu/drm/drm_color_mgmt.c | 5 ++--- include/drm/drm_color_mgmt.h | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_color_mgmt.c b/drivers/gpu/drm/drm_color_mgmt.c index 968ca7c91ad8..d5d34d0c79c7 100644 --- a/drivers/gpu/drm/drm_color_mgmt.c +++ b/drivers/gpu/drm/drm_color_mgmt.c @@ -474,10 +474,9 @@ EXPORT_SYMBOL(drm_plane_create_color_properties); * * Returns 0 on success, -EINVAL on failure. */ -int drm_color_lut_check(struct drm_property_blob *lut, - uint32_t tests) +int drm_color_lut_check(const struct drm_property_blob *lut, u32 tests) { - struct drm_color_lut *entry; + const struct drm_color_lut *entry; int i; if (!lut || !tests) diff --git a/include/drm/drm_color_mgmt.h b/include/drm/drm_color_mgmt.h index 6affbda6d9cb..d1c662d92ab7 100644 --- a/include/drm/drm_color_mgmt.h +++ b/include/drm/drm_color_mgmt.h @@ -96,6 +96,5 @@ enum drm_color_lut_tests { DRM_COLOR_LUT_NON_DECREASING = BIT(1), }; -int drm_color_lut_check(struct drm_property_blob *lut, - uint32_t tests); +int drm_color_lut_check(const struct drm_property_blob *lut, u32 tests); #endif -- cgit v1.3.1 From 5e0f5a58b167fc2c8352d90c0faa8c0c9ca75c26 Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Fri, 1 Feb 2019 15:50:49 -0800 Subject: drm/i915/cfl: Adding another PCI Device ID. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While cross checking PCI IDs from Intel Media SDK and kernel Dmitry noticed this gap. So we checked the spec and this new ID had been recently added. v2: Adding new H_GT1 entry to i915_pci.c (Jose) Reported-by: Dmitry Rogozhkin Cc: Dmitry Rogozhkin Cc: José Roberto de Souza Signed-off-by: Rodrigo Vivi Reviewed-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20190201235049.27206-1-rodrigo.vivi@intel.com --- drivers/gpu/drm/i915/i915_pci.c | 1 + include/drm/i915_pciids.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index 6f7ad10f34b8..55c9058e91a8 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -711,6 +711,7 @@ static const struct pci_device_id pciidlist[] = { INTEL_AML_KBL_GT2_IDS(&intel_kabylake_gt2_info), INTEL_CFL_S_GT1_IDS(&intel_coffeelake_gt1_info), INTEL_CFL_S_GT2_IDS(&intel_coffeelake_gt2_info), + INTEL_CFL_H_GT1_IDS(&intel_coffeelake_gt1_info), INTEL_CFL_H_GT2_IDS(&intel_coffeelake_gt2_info), INTEL_CFL_U_GT2_IDS(&intel_coffeelake_gt2_info), INTEL_CFL_U_GT3_IDS(&intel_coffeelake_gt3_info), diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h index df72be7e8b88..d2fad7b0fcf6 100644 --- a/include/drm/i915_pciids.h +++ b/include/drm/i915_pciids.h @@ -394,6 +394,9 @@ INTEL_VGA_DEVICE(0x3E9A, info) /* SRV GT2 */ /* CFL H */ +#define INTEL_CFL_H_GT1_IDS(info) \ + INTEL_VGA_DEVICE(0x3E9C, info) + #define INTEL_CFL_H_GT2_IDS(info) \ INTEL_VGA_DEVICE(0x3E9B, info), /* Halo GT2 */ \ INTEL_VGA_DEVICE(0x3E94, info) /* Halo GT2 */ @@ -426,6 +429,7 @@ #define INTEL_CFL_IDS(info) \ INTEL_CFL_S_GT1_IDS(info), \ INTEL_CFL_S_GT2_IDS(info), \ + INTEL_CFL_H_GT1_IDS(info), \ INTEL_CFL_H_GT2_IDS(info), \ INTEL_CFL_U_GT2_IDS(info), \ INTEL_CFL_U_GT3_IDS(info), \ -- cgit v1.3.1 From e46c2e99f60043bfdb63f18fe775db1f688906d9 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 5 Feb 2019 09:50:31 +0000 Subject: drm/i915: Expose RPCS (SSEU) configuration to userspace (Gen11 only) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to allow userspace to reconfigure the subslice configuration on a per context basis. This is required for the functional requirement of shutting down non-VME enabled sub-slices on Gen11 parts. To do so, we expose a context parameter to allow adjustment of the RPCS register stored within the context image (and currently not accessible via LRI). If the context is adjusted before first use or whilst idle, the adjustment is for "free"; otherwise if the context is active we queue a request to do so (using the kernel context), following all other activity by that context, which is also marked as barrier for all following submission against the same context. Since the overhead of device re-configuration during context switching can be significant, especially in multi-context workloads, we limit this new uAPI to only support the Gen11 VME use case. In this use case either the device is fully enabled, and exactly one slice and half of the subslices are enabled. Example usage: struct drm_i915_gem_context_param_sseu sseu = { }; struct drm_i915_gem_context_param arg = { .param = I915_CONTEXT_PARAM_SSEU, .ctx_id = gem_context_create(fd), .size = sizeof(sseu), .value = to_user_pointer(&sseu) }; /* Query device defaults. */ gem_context_get_param(fd, &arg); /* Set VME configuration on a 1x6x8 part. */ sseu.slice_mask = 0x1; sseu.subslice_mask = 0xe0; gem_context_set_param(fd, &arg); v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu() (Lionel) v3: Add ability to program this per engine (Chris) v4: Move most get_sseu() into i915_gem_context.c (Lionel) v5: Validate sseu configuration against the device's capabilities (Lionel) v6: Change context powergating settings through MI_SDM on kernel context (Chris) v7: Synchronize the requests following a powergating setting change using a global dependency (Chris) Iterate timelines through dev_priv.gt.active_rings (Tvrtko) Disable RPCS configuration setting for non capable users (Lionel/Tvrtko) v8: s/union intel_sseu/struct intel_sseu/ (Lionel) s/dev_priv/i915/ (Tvrtko) Change uapi class/instance fields to u16 (Tvrtko) Bump mask fields to 64bits (Lionel) Don't return EPERM when dynamic sseu is disabled (Tvrtko) v9: Import context image into kernel context's ppgtt only when reconfiguring powergated slice/subslices (Chris) Use aliasing ppgtt when needed (Michel) Tvrtko Ursulin: v10: * Update for upstream changes. * Request submit needs a RPM reference. * Reject on !FULL_PPGTT for simplicity. * Pull out get/set param to helpers for readability and less indent. * Use i915_request_await_dma_fence in add_global_barrier to skip waits on the same timeline and avoid GEM_BUG_ON. * No need to explicitly assign a NULL pointer to engine in legacy mode. * No need to move gen8_make_rpcs up. * Factored out global barrier as prep patch. * Allow to only CAP_SYS_ADMIN if !Gen11. v11: * Remove engine vfunc in favour of local helper. (Chris Wilson) * Stop retiring requests before updates since it is not needed (Chris Wilson) * Implement direct CPU update path for idle contexts. (Chris Wilson) * Left side dependency needs only be on the same context timeline. (Chris Wilson) * It is sufficient to order the timeline. (Chris Wilson) * Reject !RCS configuration attempts with -ENODEV for now. v12: * Rebase for make_rpcs. v13: * Centralize SSEU normalization to make_rpcs. * Type width checking (uAPI <-> implementation). * Gen11 restrictions uAPI checks. * Gen11 subslice count differences handling. Chris Wilson: * args->size handling fixes. * Update context image from GGTT. * Postpone context image update to pinning. * Use i915_gem_active_raw instead of last_request_on_engine. v14: * Add activity tracker on intel_context to fix the lifetime issues and simplify the code. (Chris Wilson) v15: * Fix context pin leak if no space in ring by simplifying the context pinning sequence. v16: * Rebase for context get/set param locking changes. * Just -ENODEV on !Gen11. (Joonas) v17: * Fix one Gen11 subslice enablement rule. * Handle error from i915_sw_fence_await_sw_fence_gfp. (Chris Wilson) v18: * Update commit message. (Joonas) * Restrict uAPI to VME use case. (Joonas) v19: * Rebase. v20: * Rebase for ce->active_tracker. v21: * Rebase for IS_GEN changes. v22: * Reserve uAPI for flags straight away. (Chris Wilson) v23: * Rebase for RUNTIME_INFO. v24: * Added some headline docs for the uapi usage. (Joonas/Chris) v25: * Renamed class/instance to engine_class/engine_instance to avoid clash with C++ keyword. (Tony Ye) v26: * Rebased for runtime pm api changes. v27: * Rebased for intel_context_init. * Wrap commit msg to 75. v28: (Chris Wilson) * Use i915_gem_ggtt. * Use i915_request_await_dma_fence to show a better example. v29: * i915_timeline_set_barrier can now fail. (Chris Wilson) v30: * Capture some acks. v31: * Drop the WARN_ON from use controllable paths. (Chris Wilson) * Use overflows_type for all checks. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107634 Issue: https://github.com/intel/media-driver/issues/267 Signed-off-by: Chris Wilson Signed-off-by: Lionel Landwerlin Cc: Dmitry Rogozhkin Cc: Tvrtko Ursulin Cc: Zhipeng Gong Cc: Joonas Lahtinen Cc: Tony Ye Signed-off-by: Tvrtko Ursulin Reviewed-by: Chris Wilson Reviewed-by: Joonas Lahtinen Acked-by: Timo Aaltonen Acked-by: Takashi Iwai Acked-by: Stéphane Marchesin Link: https://patchwork.freedesktop.org/patch/msgid/20190205095032.22673-4-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/i915_gem_context.c | 340 +++++++++++++++++++++++++++++++- drivers/gpu/drm/i915/i915_gem_context.h | 6 + drivers/gpu/drm/i915/intel_lrc.c | 4 +- include/uapi/drm/i915_drm.h | 64 ++++++ 4 files changed, 411 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index d3887c27c3ba..2d3e1ce9cc76 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -89,6 +89,7 @@ #include #include "i915_drv.h" #include "i915_trace.h" +#include "intel_lrc_reg.h" #include "intel_workarounds.h" #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1 @@ -321,6 +322,15 @@ static u32 default_desc_template(const struct drm_i915_private *i915, return desc; } +static void intel_context_retire(struct i915_gem_active *active, + struct i915_request *rq) +{ + struct intel_context *ce = + container_of(active, typeof(*ce), active_tracker); + + intel_context_unpin(ce); +} + void intel_context_init(struct intel_context *ce, struct i915_gem_context *ctx, @@ -333,6 +343,8 @@ intel_context_init(struct intel_context *ce, /* Use the whole device by default */ ce->sseu = intel_device_default_sseu(ctx->i915); + + init_request_active(&ce->active_tracker, intel_context_retire); } static struct i915_gem_context * @@ -850,6 +862,56 @@ out: return 0; } +static int get_sseu(struct i915_gem_context *ctx, + struct drm_i915_gem_context_param *args) +{ + struct drm_i915_gem_context_param_sseu user_sseu; + struct intel_engine_cs *engine; + struct intel_context *ce; + int ret; + + if (args->size == 0) + goto out; + else if (args->size < sizeof(user_sseu)) + return -EINVAL; + + if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value), + sizeof(user_sseu))) + return -EFAULT; + + if (user_sseu.flags || user_sseu.rsvd) + return -EINVAL; + + engine = intel_engine_lookup_user(ctx->i915, + user_sseu.engine_class, + user_sseu.engine_instance); + if (!engine) + return -EINVAL; + + /* Only use for mutex here is to serialize get_param and set_param. */ + ret = mutex_lock_interruptible(&ctx->i915->drm.struct_mutex); + if (ret) + return ret; + + ce = to_intel_context(ctx, engine); + + user_sseu.slice_mask = ce->sseu.slice_mask; + user_sseu.subslice_mask = ce->sseu.subslice_mask; + user_sseu.min_eus_per_subslice = ce->sseu.min_eus_per_subslice; + user_sseu.max_eus_per_subslice = ce->sseu.max_eus_per_subslice; + + mutex_unlock(&ctx->i915->drm.struct_mutex); + + if (copy_to_user(u64_to_user_ptr(args->value), &user_sseu, + sizeof(user_sseu))) + return -EFAULT; + +out: + args->size = sizeof(user_sseu); + + return 0; +} + int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { @@ -862,15 +924,17 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, if (!ctx) return -ENOENT; - args->size = 0; switch (args->param) { case I915_CONTEXT_PARAM_BAN_PERIOD: ret = -EINVAL; break; case I915_CONTEXT_PARAM_NO_ZEROMAP: + args->size = 0; args->value = test_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags); break; case I915_CONTEXT_PARAM_GTT_SIZE: + args->size = 0; + if (ctx->ppgtt) args->value = ctx->ppgtt->vm.total; else if (to_i915(dev)->mm.aliasing_ppgtt) @@ -879,14 +943,20 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, args->value = to_i915(dev)->ggtt.vm.total; break; case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE: + args->size = 0; args->value = i915_gem_context_no_error_capture(ctx); break; case I915_CONTEXT_PARAM_BANNABLE: + args->size = 0; args->value = i915_gem_context_is_bannable(ctx); break; case I915_CONTEXT_PARAM_PRIORITY: + args->size = 0; args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT; break; + case I915_CONTEXT_PARAM_SSEU: + ret = get_sseu(ctx, args); + break; default: ret = -EINVAL; break; @@ -896,6 +966,270 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, return ret; } +static int gen8_emit_rpcs_config(struct i915_request *rq, + struct intel_context *ce, + struct intel_sseu sseu) +{ + u64 offset; + u32 *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + offset = i915_ggtt_offset(ce->state) + + LRC_STATE_PN * PAGE_SIZE + + (CTX_R_PWR_CLK_STATE + 1) * 4; + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + *cs++ = gen8_make_rpcs(rq->i915, &sseu); + + intel_ring_advance(rq, cs); + + return 0; +} + +static int +gen8_modify_rpcs_gpu(struct intel_context *ce, + struct intel_engine_cs *engine, + struct intel_sseu sseu) +{ + struct drm_i915_private *i915 = engine->i915; + struct i915_request *rq, *prev; + intel_wakeref_t wakeref; + int ret; + + GEM_BUG_ON(!ce->pin_count); + + lockdep_assert_held(&i915->drm.struct_mutex); + + /* Submitting requests etc needs the hw awake. */ + wakeref = intel_runtime_pm_get(i915); + + rq = i915_request_alloc(engine, i915->kernel_context); + if (IS_ERR(rq)) { + ret = PTR_ERR(rq); + goto out_put; + } + + /* Queue this switch after all other activity by this context. */ + prev = i915_gem_active_raw(&ce->ring->timeline->last_request, + &i915->drm.struct_mutex); + if (prev && !i915_request_completed(prev)) { + ret = i915_request_await_dma_fence(rq, &prev->fence); + if (ret < 0) + goto out_add; + } + + /* Order all following requests to be after. */ + ret = i915_timeline_set_barrier(ce->ring->timeline, rq); + if (ret) + goto out_add; + + ret = gen8_emit_rpcs_config(rq, ce, sseu); + if (ret) + goto out_add; + + /* + * Guarantee context image and the timeline remains pinned until the + * modifying request is retired by setting the ce activity tracker. + * + * But we only need to take one pin on the account of it. Or in other + * words transfer the pinned ce object to tracked active request. + */ + if (!i915_gem_active_isset(&ce->active_tracker)) + __intel_context_pin(ce); + i915_gem_active_set(&ce->active_tracker, rq); + +out_add: + i915_request_add(rq); +out_put: + intel_runtime_pm_put(i915, wakeref); + + return ret; +} + +static int +i915_gem_context_reconfigure_sseu(struct i915_gem_context *ctx, + struct intel_engine_cs *engine, + struct intel_sseu sseu) +{ + struct intel_context *ce = to_intel_context(ctx, engine); + int ret; + + GEM_BUG_ON(INTEL_GEN(ctx->i915) < 8); + GEM_BUG_ON(engine->id != RCS); + + ret = mutex_lock_interruptible(&ctx->i915->drm.struct_mutex); + if (ret) + return ret; + + /* Nothing to do if unmodified. */ + if (!memcmp(&ce->sseu, &sseu, sizeof(sseu))) + goto out; + + /* + * If context is not idle we have to submit an ordered request to modify + * its context image via the kernel context. Pristine and idle contexts + * will be configured on pinning. + */ + if (ce->pin_count) + ret = gen8_modify_rpcs_gpu(ce, engine, sseu); + + if (!ret) + ce->sseu = sseu; + +out: + mutex_unlock(&ctx->i915->drm.struct_mutex); + + return ret; +} + +static int +user_to_context_sseu(struct drm_i915_private *i915, + const struct drm_i915_gem_context_param_sseu *user, + struct intel_sseu *context) +{ + const struct sseu_dev_info *device = &RUNTIME_INFO(i915)->sseu; + + /* No zeros in any field. */ + if (!user->slice_mask || !user->subslice_mask || + !user->min_eus_per_subslice || !user->max_eus_per_subslice) + return -EINVAL; + + /* Max > min. */ + if (user->max_eus_per_subslice < user->min_eus_per_subslice) + return -EINVAL; + + /* + * Some future proofing on the types since the uAPI is wider than the + * current internal implementation. + */ + if (overflows_type(user->slice_mask, context->slice_mask) || + overflows_type(user->subslice_mask, context->subslice_mask) || + overflows_type(user->min_eus_per_subslice, + context->min_eus_per_subslice) || + overflows_type(user->max_eus_per_subslice, + context->max_eus_per_subslice)) + return -EINVAL; + + /* Check validity against hardware. */ + if (user->slice_mask & ~device->slice_mask) + return -EINVAL; + + if (user->subslice_mask & ~device->subslice_mask[0]) + return -EINVAL; + + if (user->max_eus_per_subslice > device->max_eus_per_subslice) + return -EINVAL; + + context->slice_mask = user->slice_mask; + context->subslice_mask = user->subslice_mask; + context->min_eus_per_subslice = user->min_eus_per_subslice; + context->max_eus_per_subslice = user->max_eus_per_subslice; + + /* Part specific restrictions. */ + if (IS_GEN(i915, 11)) { + unsigned int hw_s = hweight8(device->slice_mask); + unsigned int hw_ss_per_s = hweight8(device->subslice_mask[0]); + unsigned int req_s = hweight8(context->slice_mask); + unsigned int req_ss = hweight8(context->subslice_mask); + + /* + * Only full subslice enablement is possible if more than one + * slice is turned on. + */ + if (req_s > 1 && req_ss != hw_ss_per_s) + return -EINVAL; + + /* + * If more than four (SScount bitfield limit) subslices are + * requested then the number has to be even. + */ + if (req_ss > 4 && (req_ss & 1)) + return -EINVAL; + + /* + * If only one slice is enabled and subslice count is below the + * device full enablement, it must be at most half of the all + * available subslices. + */ + if (req_s == 1 && req_ss < hw_ss_per_s && + req_ss > (hw_ss_per_s / 2)) + return -EINVAL; + + /* ABI restriction - VME use case only. */ + + /* All slices or one slice only. */ + if (req_s != 1 && req_s != hw_s) + return -EINVAL; + + /* + * Half subslices or full enablement only when one slice is + * enabled. + */ + if (req_s == 1 && + (req_ss != hw_ss_per_s && req_ss != (hw_ss_per_s / 2))) + return -EINVAL; + + /* No EU configuration changes. */ + if ((user->min_eus_per_subslice != + device->max_eus_per_subslice) || + (user->max_eus_per_subslice != + device->max_eus_per_subslice)) + return -EINVAL; + } + + return 0; +} + +static int set_sseu(struct i915_gem_context *ctx, + struct drm_i915_gem_context_param *args) +{ + struct drm_i915_private *i915 = ctx->i915; + struct drm_i915_gem_context_param_sseu user_sseu; + struct intel_engine_cs *engine; + struct intel_sseu sseu; + int ret; + + if (args->size < sizeof(user_sseu)) + return -EINVAL; + + if (!IS_GEN(i915, 11)) + return -ENODEV; + + if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value), + sizeof(user_sseu))) + return -EFAULT; + + if (user_sseu.flags || user_sseu.rsvd) + return -EINVAL; + + engine = intel_engine_lookup_user(i915, + user_sseu.engine_class, + user_sseu.engine_instance); + if (!engine) + return -EINVAL; + + /* Only render engine supports RPCS configuration. */ + if (engine->class != RENDER_CLASS) + return -ENODEV; + + ret = user_to_context_sseu(i915, &user_sseu, &sseu); + if (ret) + return ret; + + ret = i915_gem_context_reconfigure_sseu(ctx, engine, sseu); + if (ret) + return ret; + + args->size = sizeof(user_sseu); + + return 0; +} + int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { @@ -958,7 +1292,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, I915_USER_PRIORITY(priority); } break; - + case I915_CONTEXT_PARAM_SSEU: + ret = set_sseu(ctx, args); + break; default: ret = -EINVAL; break; diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h index 919f6f0a0f7a..92ad5272e57f 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.h +++ b/drivers/gpu/drm/i915/i915_gem_context.h @@ -183,6 +183,12 @@ struct i915_gem_context { u64 lrc_desc; int pin_count; + /** + * active_tracker: Active tracker for the external rq activity + * on this intel_context object. + */ + struct i915_gem_active active_tracker; + const struct intel_context_ops *ops; /** sseu: Control eu/slice partitioning */ diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index d99c462e2c09..5e98fd79bd9d 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -2498,7 +2498,9 @@ u32 gen8_make_rpcs(struct drm_i915_private *i915, struct intel_sseu *req_sseu) * subslices are enabled, or a count between one and four on the first * slice. */ - if (IS_GEN(i915, 11) && slices == 1 && subslices >= 4) { + if (IS_GEN(i915, 11) && + slices == 1 && + subslices > min_t(u8, 4, hweight8(sseu->subslice_mask[0]) / 2)) { GEM_BUG_ON(subslices & 1); subslice_pg = false; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 298b2e197744..397810fa2d33 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1486,9 +1486,73 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_MAX_USER_PRIORITY 1023 /* inclusive */ #define I915_CONTEXT_DEFAULT_PRIORITY 0 #define I915_CONTEXT_MIN_USER_PRIORITY -1023 /* inclusive */ + /* + * When using the following param, value should be a pointer to + * drm_i915_gem_context_param_sseu. + */ +#define I915_CONTEXT_PARAM_SSEU 0x7 __u64 value; }; +/** + * Context SSEU programming + * + * It may be necessary for either functional or performance reason to configure + * a context to run with a reduced number of SSEU (where SSEU stands for Slice/ + * Sub-slice/EU). + * + * This is done by configuring SSEU configuration using the below + * @struct drm_i915_gem_context_param_sseu for every supported engine which + * userspace intends to use. + * + * Not all GPUs or engines support this functionality in which case an error + * code -ENODEV will be returned. + * + * Also, flexibility of possible SSEU configuration permutations varies between + * GPU generations and software imposed limitations. Requesting such a + * combination will return an error code of -EINVAL. + * + * NOTE: When perf/OA is active the context's SSEU configuration is ignored in + * favour of a single global setting. + */ +struct drm_i915_gem_context_param_sseu { + /* + * Engine class & instance to be configured or queried. + */ + __u16 engine_class; + __u16 engine_instance; + + /* + * Unused for now. Must be cleared to zero. + */ + __u32 flags; + + /* + * Mask of slices to enable for the context. Valid values are a subset + * of the bitmask value returned for I915_PARAM_SLICE_MASK. + */ + __u64 slice_mask; + + /* + * Mask of subslices to enable for the context. Valid values are a + * subset of the bitmask value return by I915_PARAM_SUBSLICE_MASK. + */ + __u64 subslice_mask; + + /* + * Minimum/Maximum number of EUs to enable per subslice for the + * context. min_eus_per_subslice must be inferior or equal to + * max_eus_per_subslice. + */ + __u16 min_eus_per_subslice; + __u16 max_eus_per_subslice; + + /* + * Unused for now. Must be cleared to zero. + */ + __u32 rsvd; +}; + enum drm_i915_oa_format { I915_OA_FORMAT_A13 = 1, /* HSW only */ I915_OA_FORMAT_A29, /* HSW only */ -- cgit v1.3.1 From 67dd1a36334ffce82bebeb2d633e152aa436d370 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Thu, 31 Jan 2019 15:44:22 -0500 Subject: drm/amdgpu: Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New chunk for dependency on start of job's execution instead on the end. This is used for GPU deadlock prevention when userspace uses mid-IB fences to wait for mid-IB work on other rings. v2: Fix typo in AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES v3: Bump KMS version v4: put old fence AFTER acquiring the scheduled fence. Signed-off-by: Andrey Grodzovsky Suggested-by: Christian Koenig Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 13 ++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- include/uapi/drm/amdgpu_drm.h | 1 + 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 1c49b8266d69..52a5e4fdc95b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -214,6 +214,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs case AMDGPU_CHUNK_ID_DEPENDENCIES: case AMDGPU_CHUNK_ID_SYNCOBJ_IN: case AMDGPU_CHUNK_ID_SYNCOBJ_OUT: + case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES: break; default: @@ -1090,6 +1091,15 @@ static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p, fence = amdgpu_ctx_get_fence(ctx, entity, deps[i].handle); + + if (chunk->chunk_id == AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES) { + struct drm_sched_fence *s_fence = to_drm_sched_fence(fence); + struct dma_fence *old = fence; + + fence = dma_fence_get(&s_fence->scheduled); + dma_fence_put(old); + } + if (IS_ERR(fence)) { r = PTR_ERR(fence); amdgpu_ctx_put(ctx); @@ -1177,7 +1187,8 @@ static int amdgpu_cs_dependencies(struct amdgpu_device *adev, chunk = &p->chunks[i]; - if (chunk->chunk_id == AMDGPU_CHUNK_ID_DEPENDENCIES) { + if (chunk->chunk_id == AMDGPU_CHUNK_ID_DEPENDENCIES || + chunk->chunk_id == AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES) { r = amdgpu_cs_process_fence_dep(p, chunk); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index c806f984bcc5..1158a6f4eec6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -71,9 +71,10 @@ * - 3.25.0 - Add support for sensor query info (stable pstate sclk/mclk). * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE. * - 3.27.0 - Add new chunk to to AMDGPU_CS to enable BO_LIST creation. + * - 3.28.0 - Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 27 +#define KMS_DRIVER_MINOR 28 #define KMS_DRIVER_PATCHLEVEL 0 int amdgpu_vram_limit = 0; diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index be84e43c1e19..47296f2ec3fa 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -523,6 +523,7 @@ struct drm_amdgpu_gem_va { #define AMDGPU_CHUNK_ID_SYNCOBJ_IN 0x04 #define AMDGPU_CHUNK_ID_SYNCOBJ_OUT 0x05 #define AMDGPU_CHUNK_ID_BO_HANDLES 0x06 +#define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07 struct drm_amdgpu_cs_chunk { __u32 chunk_id; -- cgit v1.3.1 From 41cca166cc57e75e94d888595a428d23a3bf4e36 Mon Sep 17 00:00:00 2001 From: Marek Olšák Date: Mon, 21 Jan 2019 17:22:55 -0500 Subject: drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I'm not increasing the DRM version because GDS isn't totally without bugs yet. v2: update emit_ib_size Signed-off-by: Marek Olšák Acked-by: Christian König Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h | 2 ++ drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 19 +++++++++++++++- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 21 +++++++++++++++-- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 40 +++++++++++++++++++++++++++++++-- include/uapi/drm/amdgpu_drm.h | 5 +++++ 6 files changed, 84 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 1158a6f4eec6..2f0ea380c031 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -72,9 +72,10 @@ * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE. * - 3.27.0 - Add new chunk to to AMDGPU_CS to enable BO_LIST creation. * - 3.28.0 - Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES + * - 3.29.0 - Add AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 28 +#define KMS_DRIVER_MINOR 29 #define KMS_DRIVER_PATCHLEVEL 0 int amdgpu_vram_limit = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h index ecbcefe49a98..f89f5734d985 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h @@ -37,6 +37,8 @@ struct amdgpu_gds { struct amdgpu_gds_asic_info mem; struct amdgpu_gds_asic_info gws; struct amdgpu_gds_asic_info oa; + uint32_t gds_compute_max_wave_id; + /* At present, GDS, GWS and OA resources for gfx (graphics) * is always pre-allocated and available for graphics operation. * Such resource is shared between all gfx clients. diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index 7984292f9282..a59e0fdf5a97 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -2264,6 +2264,22 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring, unsigned vmid = AMDGPU_JOB_GET_VMID(job); u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); + /* Currently, there is a high possibility to get wave ID mismatch + * between ME and GDS, leading to a hw deadlock, because ME generates + * different wave IDs than the GDS expects. This situation happens + * randomly when at least 5 compute pipes use GDS ordered append. + * The wave IDs generated by ME are also wrong after suspend/resume. + * Those are probably bugs somewhere else in the kernel driver. + * + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and + * GDS to 0 for this ring (me/pipe). + */ + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) { + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1)); + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START); + amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id); + } + amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); amdgpu_ring_write(ring, #ifdef __BIG_ENDIAN @@ -5000,7 +5016,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = { 7 + /* gfx_v7_0_ring_emit_pipeline_sync */ CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */ 7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */ - .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_compute */ + .emit_ib_size = 7, /* gfx_v7_0_ring_emit_ib_compute */ .emit_ib = gfx_v7_0_ring_emit_ib_compute, .emit_fence = gfx_v7_0_ring_emit_fence_compute, .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync, @@ -5057,6 +5073,7 @@ static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev) adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE); adev->gds.gws.total_size = 64; adev->gds.oa.total_size = 16; + adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID); if (adev->gds.mem.total_size == 64 * 1024) { adev->gds.mem.gfx_partition_size = 4096; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index a26747681ed6..b8e50a34bdb3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -6084,6 +6084,22 @@ static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring, unsigned vmid = AMDGPU_JOB_GET_VMID(job); u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); + /* Currently, there is a high possibility to get wave ID mismatch + * between ME and GDS, leading to a hw deadlock, because ME generates + * different wave IDs than the GDS expects. This situation happens + * randomly when at least 5 compute pipes use GDS ordered append. + * The wave IDs generated by ME are also wrong after suspend/resume. + * Those are probably bugs somewhere else in the kernel driver. + * + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and + * GDS to 0 for this ring (me/pipe). + */ + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) { + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1)); + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START); + amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id); + } + amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); amdgpu_ring_write(ring, #ifdef __BIG_ENDIAN @@ -6890,7 +6906,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = { 7 + /* gfx_v8_0_ring_emit_pipeline_sync */ VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */ 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */ - .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */ + .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */ .emit_ib = gfx_v8_0_ring_emit_ib_compute, .emit_fence = gfx_v8_0_ring_emit_fence_compute, .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync, @@ -6920,7 +6936,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = { 7 + /* gfx_v8_0_ring_emit_pipeline_sync */ 17 + /* gfx_v8_0_ring_emit_vm_flush */ 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */ - .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */ + .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */ .emit_fence = gfx_v8_0_ring_emit_fence_kiq, .test_ring = gfx_v8_0_ring_test_ring, .insert_nop = amdgpu_ring_insert_nop, @@ -6996,6 +7012,7 @@ static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev) adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE); adev->gds.gws.total_size = 64; adev->gds.oa.total_size = 16; + adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID); if (adev->gds.mem.total_size == 64 * 1024) { adev->gds.mem.gfx_partition_size = 4096; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 262ee3cf6f1c..5533f6e4f4a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4010,6 +4010,22 @@ static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring, unsigned vmid = AMDGPU_JOB_GET_VMID(job); u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); + /* Currently, there is a high possibility to get wave ID mismatch + * between ME and GDS, leading to a hw deadlock, because ME generates + * different wave IDs than the GDS expects. This situation happens + * randomly when at least 5 compute pipes use GDS ordered append. + * The wave IDs generated by ME are also wrong after suspend/resume. + * Those are probably bugs somewhere else in the kernel driver. + * + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and + * GDS to 0 for this ring (me/pipe). + */ + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) { + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1)); + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID); + amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id); + } + amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); BUG_ON(ib->gpu_addr & 0x3); /* Dword align */ amdgpu_ring_write(ring, @@ -4729,7 +4745,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = { SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + 2 + /* gfx_v9_0_ring_emit_vm_flush */ 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */ - .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */ + .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */ .emit_ib = gfx_v9_0_ring_emit_ib_compute, .emit_fence = gfx_v9_0_ring_emit_fence, .emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync, @@ -4764,7 +4780,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = { SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + 2 + /* gfx_v9_0_ring_emit_vm_flush */ 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */ - .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */ + .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */ .emit_fence = gfx_v9_0_ring_emit_fence_kiq, .test_ring = gfx_v9_0_ring_test_ring, .insert_nop = amdgpu_ring_insert_nop, @@ -4846,6 +4862,26 @@ static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev) break; } + switch (adev->asic_type) { + case CHIP_VEGA10: + case CHIP_VEGA20: + adev->gds.gds_compute_max_wave_id = 0x7ff; + break; + case CHIP_VEGA12: + adev->gds.gds_compute_max_wave_id = 0x27f; + break; + case CHIP_RAVEN: + if (adev->rev_id >= 0x8) + adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */ + else + adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */ + break; + default: + /* this really depends on the chip */ + adev->gds.gds_compute_max_wave_id = 0x7ff; + break; + } + adev->gds.gws.total_size = 64; adev->gds.oa.total_size = 16; diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 47296f2ec3fa..2acbc8bc465b 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -566,6 +566,11 @@ union drm_amdgpu_cs { * caches (L2/vL1/sL1/I$). */ #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3) +/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER. + * This will reset wave ID counters for the IB. + */ +#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4) + struct drm_amdgpu_cs_chunk_ib { __u32 _pad; /** AMDGPU_IB_FLAG_* */ -- cgit v1.3.1 From 5a5fccbd8c315c08db01e585e1cbe88e30b70691 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 1 Feb 2019 14:28:24 +0100 Subject: gpu: host1x: Introduce support for wide opcodes The CDMA push buffer can currently only handle opcodes that take a single word parameter. However, the host1x implementation on Tegra186 and later supports opcodes that require multiple words as parameters. Unfortunately the way the push buffer is structured, these wide opcodes cannot simply be composed of two regular opcodes because that could result in the wide opcode being split across the end of the push buffer and the final RESTART opcode required to wrap the push buffer around would break the wide opcode. One way to fix this would be to remove the concept of slots to simplify push buffer operations. However, that's not entirely trivial and should be done in a separate patch. For now, simply use a different function to push four-word opcodes into the push buffer. Technically only three words are pushed, with the fourth word used as padding to preserve the 2-word alignment required by the slots abstraction. The fourth word is always a NOP opcode. Additional care must be taken when the end of the push buffer is reached. If a four-word opcode doesn't fit into the push buffer without being split by the boundary, NOP opcodes will be introduced and the new wide opcode placed at the beginning of the push buffer. Signed-off-by: Thierry Reding --- drivers/gpu/host1x/cdma.c | 92 +++++++++++++++++++++++++++++++++++++++++++ drivers/gpu/host1x/cdma.h | 2 + include/trace/events/host1x.h | 26 ++++++++++++ 3 files changed, 120 insertions(+) (limited to 'include') diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c index 15bdeb836e3d..64099cc1964b 100644 --- a/drivers/gpu/host1x/cdma.c +++ b/drivers/gpu/host1x/cdma.c @@ -217,6 +217,45 @@ unsigned int host1x_cdma_wait_locked(struct host1x_cdma *cdma, return 0; } +/* + * Sleep (if necessary) until the push buffer has enough free space. + * + * Must be called with the cdma lock held. + */ +int host1x_cdma_wait_pushbuffer_space(struct host1x *host1x, + struct host1x_cdma *cdma, + unsigned int needed) +{ + while (true) { + struct push_buffer *pb = &cdma->push_buffer; + unsigned int space; + + space = host1x_pushbuffer_space(pb); + if (space >= needed) + break; + + trace_host1x_wait_cdma(dev_name(cdma_to_channel(cdma)->dev), + CDMA_EVENT_PUSH_BUFFER_SPACE); + + host1x_hw_cdma_flush(host1x, cdma); + + /* If somebody has managed to already start waiting, yield */ + if (cdma->event != CDMA_EVENT_NONE) { + mutex_unlock(&cdma->lock); + schedule(); + mutex_lock(&cdma->lock); + continue; + } + + cdma->event = CDMA_EVENT_PUSH_BUFFER_SPACE; + + mutex_unlock(&cdma->lock); + wait_for_completion(&cdma->complete); + mutex_lock(&cdma->lock); + } + + return 0; +} /* * Start timer that tracks the time spent by the job. * Must be called with the cdma lock held. @@ -509,6 +548,59 @@ void host1x_cdma_push(struct host1x_cdma *cdma, u32 op1, u32 op2) host1x_pushbuffer_push(pb, op1, op2); } +/* + * Push four words into two consecutive push buffer slots. Note that extra + * care needs to be taken not to split the two slots across the end of the + * push buffer. Otherwise the RESTART opcode at the end of the push buffer + * that ensures processing will restart at the beginning will break up the + * four words. + * + * Blocks as necessary if the push buffer is full. + */ +void host1x_cdma_push_wide(struct host1x_cdma *cdma, u32 op1, u32 op2, + u32 op3, u32 op4) +{ + struct host1x_channel *channel = cdma_to_channel(cdma); + struct host1x *host1x = cdma_to_host1x(cdma); + struct push_buffer *pb = &cdma->push_buffer; + unsigned int needed = 2, extra = 0, i; + unsigned int space = cdma->slots_free; + + if (host1x_debug_trace_cmdbuf) + trace_host1x_cdma_push_wide(dev_name(channel->dev), op1, op2, + op3, op4); + + /* compute number of extra slots needed for padding */ + if (pb->pos + 16 > pb->size) { + extra = (pb->size - pb->pos) / 8; + needed += extra; + } + + host1x_cdma_wait_pushbuffer_space(host1x, cdma, needed); + space = host1x_pushbuffer_space(pb); + + cdma->slots_free = space - needed; + cdma->slots_used += needed; + + /* + * Note that we rely on the fact that this is only used to submit wide + * gather opcodes, which consist of 3 words, and they are padded with + * a NOP to avoid having to deal with fractional slots (a slot always + * represents 2 words). The fourth opcode passed to this function will + * therefore always be a NOP. + * + * This works around a slight ambiguity when it comes to opcodes. For + * all current host1x incarnations the NOP opcode uses the exact same + * encoding (0x20000000), so we could hard-code the value here, but a + * new incarnation may change it and break that assumption. + */ + for (i = 0; i < extra; i++) + host1x_pushbuffer_push(pb, op4, op4); + + host1x_pushbuffer_push(pb, op1, op2); + host1x_pushbuffer_push(pb, op3, op4); +} + /* * End a cdma submit * Kick off DMA, add job to the sync queue, and a number of slots to be freed diff --git a/drivers/gpu/host1x/cdma.h b/drivers/gpu/host1x/cdma.h index 71078359c626..3a5e0408b8d1 100644 --- a/drivers/gpu/host1x/cdma.h +++ b/drivers/gpu/host1x/cdma.h @@ -90,6 +90,8 @@ int host1x_cdma_init(struct host1x_cdma *cdma); int host1x_cdma_deinit(struct host1x_cdma *cdma); int host1x_cdma_begin(struct host1x_cdma *cdma, struct host1x_job *job); void host1x_cdma_push(struct host1x_cdma *cdma, u32 op1, u32 op2); +void host1x_cdma_push_wide(struct host1x_cdma *cdma, u32 op1, u32 op2, + u32 op3, u32 op4); void host1x_cdma_end(struct host1x_cdma *cdma, struct host1x_job *job); void host1x_cdma_update(struct host1x_cdma *cdma); void host1x_cdma_peek(struct host1x_cdma *cdma, u32 dmaget, int slot, diff --git a/include/trace/events/host1x.h b/include/trace/events/host1x.h index a37ef73092e5..3d340b6f1ea3 100644 --- a/include/trace/events/host1x.h +++ b/include/trace/events/host1x.h @@ -80,6 +80,32 @@ TRACE_EVENT(host1x_cdma_push, __entry->name, __entry->op1, __entry->op2) ); +TRACE_EVENT(host1x_cdma_push_wide, + TP_PROTO(const char *name, u32 op1, u32 op2, u32 op3, u32 op4), + + TP_ARGS(name, op1, op2, op3, op4), + + TP_STRUCT__entry( + __field(const char *, name) + __field(u32, op1) + __field(u32, op2) + __field(u32, op3) + __field(u32, op4) + ), + + TP_fast_assign( + __entry->name = name; + __entry->op1 = op1; + __entry->op2 = op2; + __entry->op3 = op3; + __entry->op4 = op4; + ), + + TP_printk("name=%s, op1=%08x, op2=%08x, op3=%08x op4=%08x", + __entry->name, __entry->op1, __entry->op2, __entry->op3, + __entry->op4) +); + TRACE_EVENT(host1x_cdma_push_gather, TP_PROTO(const char *name, struct host1x_bo *bo, u32 words, u32 offset, void *cmdbuf), -- cgit v1.3.1