summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-04-05 11:53:46 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-04-05 11:53:46 -0700
commit89103a164210f1c88caedf880ac9ab9576a1190d (patch)
tree940acf5e7f5366ccfcf39e6b4728db8a9e12b797
parente8b0ccb2a787fb43f8091a1eaef9c28a79b00002 (diff)
parent4c8595741b5dd3268d6710545461ee9a7bbde891 (diff)
Merge tag 'drm-fixes-2024-04-05' of https://gitlab.freedesktop.org/drm/kernel
Pull drm fixes from Dave Airlie: "Weekly fixes, mostly xe and i915, amdgpu on a week off, otherwise a nouveau fix for a crash with new vulkan cts tests, and a couple of cleanups and misc fixes. display: - fix typos in kerneldoc prime: - unbreak dma-buf export for virt-gpu nouveau: - uvmm: fix remap address calculation - minor cleanups panfrost: - fix power-transition timeouts xe: - Stop using system_unbound_wq for preempt fences - Fix saving unordered rebinding fences by attaching them as kernel feces to the vm's resv - Fix TLB invalidation fences completing out of order - Move rebind TLB invalidation to the ring ops to reduce the latency i915: - A few DisplayPort related fixes - eDP PSR fixes - Remove some VM space restrictions on older platforms - Disable automatic load CCS load balancing" * tag 'drm-fixes-2024-04-05' of https://gitlab.freedesktop.org/drm/kernel: (22 commits) drm/xe: Use ordered wq for preempt fence waiting drm/xe: Move vma rebinding to the drm_exec locking loop drm/xe: Make TLB invalidation fences unordered drm/xe: Rework rebinding drm/xe: Use ring ops TLB invalidation for rebinds drm/i915/mst: Reject FEC+MST on ICL drm/i915/mst: Limit MST+DSC to TGL+ drm/i915/dp: Fix the computation for compressed_bpp for DISPLAY < 13 drm/i915/gt: Enable only one CCS for compute workload drm/i915/gt: Do not generate the command streamer for all the CCS drm/i915/gt: Disable HW load balancing for CCS drm/i915/gt: Limit the reserved VM space to only the platforms that need it drm/i915/psr: Fix intel_psr2_sel_fetch_et_alignment usage drm/i915/psr: Move writing early transport pipe src drm/i915/psr: Calculate PIPE_SRCSZ_ERLY_TPT value drm/i915/dp: Remove support for UHBR13.5 drm/i915/dp: Fix DSC state HW readout for SST connectors drm/display: fix typo drm/prime: Unbreak virtgpu dma-buf export nouveau/uvmm: fix addr/range calcs for remap operations ...
-rw-r--r--drivers/gpu/drm/display/drm_dp_dual_mode_helper.c4
-rw-r--r--drivers/gpu/drm/drm_prime.c7
-rw-r--r--drivers/gpu/drm/i915/Makefile1
-rw-r--r--drivers/gpu/drm/i915/display/intel_display.c9
-rw-r--r--drivers/gpu/drm/i915/display/intel_display_device.h1
-rw-r--r--drivers/gpu/drm/i915/display/intel_display_types.h2
-rw-r--r--drivers/gpu/drm/i915/display/intel_dp.c11
-rw-r--r--drivers/gpu/drm/i915/display/intel_dp_mst.c2
-rw-r--r--drivers/gpu/drm/i915/display/intel_psr.c78
-rw-r--r--drivers/gpu/drm/i915/gt/gen8_ppgtt.c3
-rw-r--r--drivers/gpu/drm/i915/gt/intel_engine_cs.c17
-rw-r--r--drivers/gpu/drm/i915/gt/intel_gt.c6
-rw-r--r--drivers/gpu/drm/i915/gt/intel_gt.h9
-rw-r--r--drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c39
-rw-r--r--drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h13
-rw-r--r--drivers/gpu/drm/i915/gt/intel_gt_regs.h6
-rw-r--r--drivers/gpu/drm/i915/gt/intel_workarounds.c30
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_uvmm.c6
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c2
-rw-r--r--drivers/gpu/drm/panfrost/panfrost_gpu.c6
-rw-r--r--drivers/gpu/drm/xe/xe_device.c11
-rw-r--r--drivers/gpu/drm/xe/xe_device_types.h3
-rw-r--r--drivers/gpu/drm/xe/xe_exec.c79
-rw-r--r--drivers/gpu/drm/xe/xe_exec_queue_types.h5
-rw-r--r--drivers/gpu/drm/xe/xe_gt_pagefault.c3
-rw-r--r--drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c1
-rw-r--r--drivers/gpu/drm/xe/xe_gt_types.h7
-rw-r--r--drivers/gpu/drm/xe/xe_preempt_fence.c2
-rw-r--r--drivers/gpu/drm/xe/xe_pt.c25
-rw-r--r--drivers/gpu/drm/xe/xe_ring_ops.c11
-rw-r--r--drivers/gpu/drm/xe/xe_sched_job.c10
-rw-r--r--drivers/gpu/drm/xe/xe_sched_job_types.h2
-rw-r--r--drivers/gpu/drm/xe/xe_vm.c110
-rw-r--r--drivers/gpu/drm/xe/xe_vm.h8
-rw-r--r--drivers/gpu/drm/xe/xe_vm_types.h8
35 files changed, 340 insertions, 197 deletions
diff --git a/drivers/gpu/drm/display/drm_dp_dual_mode_helper.c b/drivers/gpu/drm/display/drm_dp_dual_mode_helper.c
index bd61e20770a5..14a2a8473682 100644
--- a/drivers/gpu/drm/display/drm_dp_dual_mode_helper.c
+++ b/drivers/gpu/drm/display/drm_dp_dual_mode_helper.c
@@ -52,7 +52,7 @@
* @adapter: I2C adapter for the DDC bus
* @offset: register offset
* @buffer: buffer for return data
- * @size: sizo of the buffer
+ * @size: size of the buffer
*
* Reads @size bytes from the DP dual mode adaptor registers
* starting at @offset.
@@ -116,7 +116,7 @@ EXPORT_SYMBOL(drm_dp_dual_mode_read);
* @adapter: I2C adapter for the DDC bus
* @offset: register offset
* @buffer: buffer for write data
- * @size: sizo of the buffer
+ * @size: size of the buffer
*
* Writes @size bytes to the DP dual mode adaptor registers
* starting at @offset.
diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
index 7352bde299d5..03bd3c7bd0dc 100644
--- a/drivers/gpu/drm/drm_prime.c
+++ b/drivers/gpu/drm/drm_prime.c
@@ -582,7 +582,12 @@ int drm_gem_map_attach(struct dma_buf *dma_buf,
{
struct drm_gem_object *obj = dma_buf->priv;
- if (!obj->funcs->get_sg_table)
+ /*
+ * drm_gem_map_dma_buf() requires obj->get_sg_table(), but drivers
+ * that implement their own ->map_dma_buf() do not.
+ */
+ if (dma_buf->ops->map_dma_buf == drm_gem_map_dma_buf &&
+ !obj->funcs->get_sg_table)
return -ENOSYS;
return drm_gem_pin(obj);
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 4c2f85632391..fba73c38e235 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -118,6 +118,7 @@ gt-y += \
gt/intel_ggtt_fencing.o \
gt/intel_gt.o \
gt/intel_gt_buffer_pool.o \
+ gt/intel_gt_ccs_mode.o \
gt/intel_gt_clock_utils.o \
gt/intel_gt_debugfs.o \
gt/intel_gt_engines_debugfs.o \
diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c
index ab2f52d21bad..8af9e6128277 100644
--- a/drivers/gpu/drm/i915/display/intel_display.c
+++ b/drivers/gpu/drm/i915/display/intel_display.c
@@ -2709,15 +2709,6 @@ static void intel_set_pipe_src_size(const struct intel_crtc_state *crtc_state)
*/
intel_de_write(dev_priv, PIPESRC(pipe),
PIPESRC_WIDTH(width - 1) | PIPESRC_HEIGHT(height - 1));
-
- if (!crtc_state->enable_psr2_su_region_et)
- return;
-
- width = drm_rect_width(&crtc_state->psr2_su_area);
- height = drm_rect_height(&crtc_state->psr2_su_area);
-
- intel_de_write(dev_priv, PIPE_SRCSZ_ERLY_TPT(pipe),
- PIPESRC_WIDTH(width - 1) | PIPESRC_HEIGHT(height - 1));
}
static bool intel_pipe_is_interlaced(const struct intel_crtc_state *crtc_state)
diff --git a/drivers/gpu/drm/i915/display/intel_display_device.h b/drivers/gpu/drm/i915/display/intel_display_device.h
index fe4268813786..9b1bce2624b9 100644
--- a/drivers/gpu/drm/i915/display/intel_display_device.h
+++ b/drivers/gpu/drm/i915/display/intel_display_device.h
@@ -47,6 +47,7 @@ struct drm_printer;
#define HAS_DPT(i915) (DISPLAY_VER(i915) >= 13)
#define HAS_DSB(i915) (DISPLAY_INFO(i915)->has_dsb)
#define HAS_DSC(__i915) (DISPLAY_RUNTIME_INFO(__i915)->has_dsc)
+#define HAS_DSC_MST(__i915) (DISPLAY_VER(__i915) >= 12 && HAS_DSC(__i915))
#define HAS_FBC(i915) (DISPLAY_RUNTIME_INFO(i915)->fbc_mask != 0)
#define HAS_FPGA_DBG_UNCLAIMED(i915) (DISPLAY_INFO(i915)->has_fpga_dbg)
#define HAS_FW_BLC(i915) (DISPLAY_VER(i915) >= 3)
diff --git a/drivers/gpu/drm/i915/display/intel_display_types.h b/drivers/gpu/drm/i915/display/intel_display_types.h
index 9104f18753b4..bf3f942e19c3 100644
--- a/drivers/gpu/drm/i915/display/intel_display_types.h
+++ b/drivers/gpu/drm/i915/display/intel_display_types.h
@@ -1423,6 +1423,8 @@ struct intel_crtc_state {
u32 psr2_man_track_ctl;
+ u32 pipe_srcsz_early_tpt;
+
struct drm_rect psr2_su_area;
/* Variable Refresh Rate state */
diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c
index f98ef4b42a44..abd62bebc46d 100644
--- a/drivers/gpu/drm/i915/display/intel_dp.c
+++ b/drivers/gpu/drm/i915/display/intel_dp.c
@@ -499,7 +499,7 @@ intel_dp_set_source_rates(struct intel_dp *intel_dp)
/* The values must be in increasing order */
static const int mtl_rates[] = {
162000, 216000, 243000, 270000, 324000, 432000, 540000, 675000,
- 810000, 1000000, 1350000, 2000000,
+ 810000, 1000000, 2000000,
};
static const int icl_rates[] = {
162000, 216000, 270000, 324000, 432000, 540000, 648000, 810000,
@@ -1422,7 +1422,8 @@ static bool intel_dp_source_supports_fec(struct intel_dp *intel_dp,
if (DISPLAY_VER(dev_priv) >= 12)
return true;
- if (DISPLAY_VER(dev_priv) == 11 && encoder->port != PORT_A)
+ if (DISPLAY_VER(dev_priv) == 11 && encoder->port != PORT_A &&
+ !intel_crtc_has_type(pipe_config, INTEL_OUTPUT_DP_MST))
return true;
return false;
@@ -1917,8 +1918,9 @@ icl_dsc_compute_link_config(struct intel_dp *intel_dp,
dsc_max_bpp = min(dsc_max_bpp, pipe_bpp - 1);
for (i = 0; i < ARRAY_SIZE(valid_dsc_bpp); i++) {
- if (valid_dsc_bpp[i] < dsc_min_bpp ||
- valid_dsc_bpp[i] > dsc_max_bpp)
+ if (valid_dsc_bpp[i] < dsc_min_bpp)
+ continue;
+ if (valid_dsc_bpp[i] > dsc_max_bpp)
break;
ret = dsc_compute_link_config(intel_dp,
@@ -6557,6 +6559,7 @@ intel_dp_init_connector(struct intel_digital_port *dig_port,
intel_connector->get_hw_state = intel_ddi_connector_get_hw_state;
else
intel_connector->get_hw_state = intel_connector_get_hw_state;
+ intel_connector->sync_state = intel_dp_connector_sync_state;
if (!intel_edp_init_connector(intel_dp, intel_connector)) {
intel_dp_aux_fini(intel_dp);
diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c
index 53aec023ce92..b651c990af85 100644
--- a/drivers/gpu/drm/i915/display/intel_dp_mst.c
+++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c
@@ -1355,7 +1355,7 @@ intel_dp_mst_mode_valid_ctx(struct drm_connector *connector,
return 0;
}
- if (DISPLAY_VER(dev_priv) >= 10 &&
+ if (HAS_DSC_MST(dev_priv) &&
drm_dp_sink_supports_dsc(intel_connector->dp.dsc_dpcd)) {
/*
* TBD pass the connector BPC,
diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c
index 6927785fd6ff..b6e539f1342c 100644
--- a/drivers/gpu/drm/i915/display/intel_psr.c
+++ b/drivers/gpu/drm/i915/display/intel_psr.c
@@ -1994,6 +1994,7 @@ static void psr_force_hw_tracking_exit(struct intel_dp *intel_dp)
void intel_psr2_program_trans_man_trk_ctl(const struct intel_crtc_state *crtc_state)
{
+ struct intel_crtc *crtc = to_intel_crtc(crtc_state->uapi.crtc);
struct drm_i915_private *dev_priv = to_i915(crtc_state->uapi.crtc->dev);
enum transcoder cpu_transcoder = crtc_state->cpu_transcoder;
struct intel_encoder *encoder;
@@ -2013,6 +2014,12 @@ void intel_psr2_program_trans_man_trk_ctl(const struct intel_crtc_state *crtc_st
intel_de_write(dev_priv, PSR2_MAN_TRK_CTL(cpu_transcoder),
crtc_state->psr2_man_track_ctl);
+
+ if (!crtc_state->enable_psr2_su_region_et)
+ return;
+
+ intel_de_write(dev_priv, PIPE_SRCSZ_ERLY_TPT(crtc->pipe),
+ crtc_state->pipe_srcsz_early_tpt);
}
static void psr2_man_trk_ctl_calc(struct intel_crtc_state *crtc_state,
@@ -2051,6 +2058,20 @@ exit:
crtc_state->psr2_man_track_ctl = val;
}
+static u32 psr2_pipe_srcsz_early_tpt_calc(struct intel_crtc_state *crtc_state,
+ bool full_update)
+{
+ int width, height;
+
+ if (!crtc_state->enable_psr2_su_region_et || full_update)
+ return 0;
+
+ width = drm_rect_width(&crtc_state->psr2_su_area);
+ height = drm_rect_height(&crtc_state->psr2_su_area);
+
+ return PIPESRC_WIDTH(width - 1) | PIPESRC_HEIGHT(height - 1);
+}
+
static void clip_area_update(struct drm_rect *overlap_damage_area,
struct drm_rect *damage_area,
struct drm_rect *pipe_src)
@@ -2095,21 +2116,36 @@ static void intel_psr2_sel_fetch_pipe_alignment(struct intel_crtc_state *crtc_st
* cursor fully when cursor is in SU area.
*/
static void
-intel_psr2_sel_fetch_et_alignment(struct intel_crtc_state *crtc_state,
- struct intel_plane_state *cursor_state)
+intel_psr2_sel_fetch_et_alignment(struct intel_atomic_state *state,
+ struct intel_crtc *crtc)
{
- struct drm_rect inter;
+ struct intel_crtc_state *crtc_state = intel_atomic_get_new_crtc_state(state, crtc);
+ struct intel_plane_state *new_plane_state;
+ struct intel_plane *plane;
+ int i;
- if (!crtc_state->enable_psr2_su_region_et ||
- !cursor_state->uapi.visible)
+ if (!crtc_state->enable_psr2_su_region_et)
return;
- inter = crtc_state->psr2_su_area;
- if (!drm_rect_intersect(&inter, &cursor_state->uapi.dst))
- return;
+ for_each_new_intel_plane_in_state(state, plane, new_plane_state, i) {
+ struct drm_rect inter;
- clip_area_update(&crtc_state->psr2_su_area, &cursor_state->uapi.dst,
- &crtc_state->pipe_src);
+ if (new_plane_state->uapi.crtc != crtc_state->uapi.crtc)
+ continue;
+
+ if (plane->id != PLANE_CURSOR)
+ continue;
+
+ if (!new_plane_state->uapi.visible)
+ continue;
+
+ inter = crtc_state->psr2_su_area;
+ if (!drm_rect_intersect(&inter, &new_plane_state->uapi.dst))
+ continue;
+
+ clip_area_update(&crtc_state->psr2_su_area, &new_plane_state->uapi.dst,
+ &crtc_state->pipe_src);
+ }
}
/*
@@ -2152,8 +2188,7 @@ int intel_psr2_sel_fetch_update(struct intel_atomic_state *state,
{
struct drm_i915_private *dev_priv = to_i915(state->base.dev);
struct intel_crtc_state *crtc_state = intel_atomic_get_new_crtc_state(state, crtc);
- struct intel_plane_state *new_plane_state, *old_plane_state,
- *cursor_plane_state = NULL;
+ struct intel_plane_state *new_plane_state, *old_plane_state;
struct intel_plane *plane;
bool full_update = false;
int i, ret;
@@ -2238,13 +2273,6 @@ int intel_psr2_sel_fetch_update(struct intel_atomic_state *state,
damaged_area.x2 += new_plane_state->uapi.dst.x1 - src.x1;
clip_area_update(&crtc_state->psr2_su_area, &damaged_area, &crtc_state->pipe_src);
-
- /*
- * Cursor plane new state is stored to adjust su area to cover
- * cursor are fully.
- */
- if (plane->id == PLANE_CURSOR)
- cursor_plane_state = new_plane_state;
}
/*
@@ -2273,9 +2301,13 @@ int intel_psr2_sel_fetch_update(struct intel_atomic_state *state,
if (ret)
return ret;
- /* Adjust su area to cover cursor fully as necessary */
- if (cursor_plane_state)
- intel_psr2_sel_fetch_et_alignment(crtc_state, cursor_plane_state);
+ /*
+ * Adjust su area to cover cursor fully as necessary (early
+ * transport). This needs to be done after
+ * drm_atomic_add_affected_planes to ensure visible cursor is added into
+ * affected planes even when cursor is not updated by itself.
+ */
+ intel_psr2_sel_fetch_et_alignment(state, crtc);
intel_psr2_sel_fetch_pipe_alignment(crtc_state);
@@ -2338,6 +2370,8 @@ int intel_psr2_sel_fetch_update(struct intel_atomic_state *state,
skip_sel_fetch_set_loop:
psr2_man_trk_ctl_calc(crtc_state, full_update);
+ crtc_state->pipe_srcsz_early_tpt =
+ psr2_pipe_srcsz_early_tpt_calc(crtc_state, full_update);
return 0;
}
diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
index fa46d2308b0e..81bf2216371b 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -961,6 +961,9 @@ static int gen8_init_rsvd(struct i915_address_space *vm)
struct i915_vma *vma;
int ret;
+ if (!intel_gt_needs_wa_16018031267(vm->gt))
+ return 0;
+
/* The memory will be used only by GPU. */
obj = i915_gem_object_create_lmem(i915, PAGE_SIZE,
I915_BO_ALLOC_VOLATILE |
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 1ade568ffbfa..7a6dc371c384 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -908,6 +908,23 @@ static intel_engine_mask_t init_engine_mask(struct intel_gt *gt)
info->engine_mask &= ~BIT(GSC0);
}
+ /*
+ * Do not create the command streamer for CCS slices beyond the first.
+ * All the workload submitted to the first engine will be shared among
+ * all the slices.
+ *
+ * Once the user will be allowed to customize the CCS mode, then this
+ * check needs to be removed.
+ */
+ if (IS_DG2(gt->i915)) {
+ u8 first_ccs = __ffs(CCS_MASK(gt));
+
+ /* Mask off all the CCS engine */
+ info->engine_mask &= ~GENMASK(CCS3, CCS0);
+ /* Put back in the first CCS engine */
+ info->engine_mask |= BIT(_CCS(first_ccs));
+ }
+
return info->engine_mask;
}
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
index a425db5ed3a2..6a2c2718bcc3 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -1024,6 +1024,12 @@ enum i915_map_type intel_gt_coherent_map_type(struct intel_gt *gt,
return I915_MAP_WC;
}
+bool intel_gt_needs_wa_16018031267(struct intel_gt *gt)
+{
+ /* Wa_16018031267, Wa_16018063123 */
+ return IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 55), IP_VER(12, 71));
+}
+
bool intel_gt_needs_wa_22016122933(struct intel_gt *gt)
{
return MEDIA_VER_FULL(gt->i915) == IP_VER(13, 0) && gt->type == GT_MEDIA;
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
index 608f5c872928..003eb93b826f 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt.h
@@ -82,17 +82,18 @@ struct drm_printer;
##__VA_ARGS__); \
} while (0)
-#define NEEDS_FASTCOLOR_BLT_WABB(engine) ( \
- IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 55), IP_VER(12, 71)) && \
- engine->class == COPY_ENGINE_CLASS && engine->instance == 0)
-
static inline bool gt_is_root(struct intel_gt *gt)
{
return !gt->info.id;
}
+bool intel_gt_needs_wa_16018031267(struct intel_gt *gt);
bool intel_gt_needs_wa_22016122933(struct intel_gt *gt);
+#define NEEDS_FASTCOLOR_BLT_WABB(engine) ( \
+ intel_gt_needs_wa_16018031267(engine->gt) && \
+ engine->class == COPY_ENGINE_CLASS && engine->instance == 0)
+
static inline struct intel_gt *uc_to_gt(struct intel_uc *uc)
{
return container_of(uc, struct intel_gt, uc);
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c
new file mode 100644
index 000000000000..044219c5960a
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+
+#include "i915_drv.h"
+#include "intel_gt.h"
+#include "intel_gt_ccs_mode.h"
+#include "intel_gt_regs.h"
+
+void intel_gt_apply_ccs_mode(struct intel_gt *gt)
+{
+ int cslice;
+ u32 mode = 0;
+ int first_ccs = __ffs(CCS_MASK(gt));
+
+ if (!IS_DG2(gt->i915))
+ return;
+
+ /* Build the value for the fixed CCS load balancing */
+ for (cslice = 0; cslice < I915_MAX_CCS; cslice++) {
+ if (CCS_MASK(gt) & BIT(cslice))
+ /*
+ * If available, assign the cslice
+ * to the first available engine...
+ */
+ mode |= XEHP_CCS_MODE_CSLICE(cslice, first_ccs);
+
+ else
+ /*
+ * ... otherwise, mark the cslice as
+ * unavailable if no CCS dispatches here
+ */
+ mode |= XEHP_CCS_MODE_CSLICE(cslice,
+ XEHP_CCS_MODE_CSLICE_MASK);
+ }
+
+ intel_uncore_write(gt->uncore, XEHP_CCS_MODE, mode);
+}
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h
new file mode 100644
index 000000000000..9e5549caeb26
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+
+#ifndef __INTEL_GT_CCS_MODE_H__
+#define __INTEL_GT_CCS_MODE_H__
+
+struct intel_gt;
+
+void intel_gt_apply_ccs_mode(struct intel_gt *gt);
+
+#endif /* __INTEL_GT_CCS_MODE_H__ */
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
index 50962cfd1353..743fe3566722 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
@@ -1477,8 +1477,14 @@
#define ECOBITS_PPGTT_CACHE4B (0 << 8)
#define GEN12_RCU_MODE _MMIO(0x14800)
+#define XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE REG_BIT(1)
#define GEN12_RCU_MODE_CCS_ENABLE REG_BIT(0)
+#define XEHP_CCS_MODE _MMIO(0x14804)
+#define XEHP_CCS_MODE_CSLICE_MASK REG_GENMASK(2, 0) /* CCS0-3 + rsvd */
+#define XEHP_CCS_MODE_CSLICE_WIDTH ilog2(XEHP_CCS_MODE_CSLICE_MASK + 1)
+#define XEHP_CCS_MODE_CSLICE(cslice, ccs) (ccs << (cslice * XEHP_CCS_MODE_CSLICE_WIDTH))
+
#define CHV_FUSE_GT _MMIO(VLV_GUNIT_BASE + 0x2168)
#define CHV_FGT_DISABLE_SS0 (1 << 10)
#define CHV_FGT_DISABLE_SS1 (1 << 11)
diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
index 25413809b9dc..6ec3582c9735 100644
--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
@@ -10,6 +10,7 @@
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
+#include "intel_gt_ccs_mode.h"
#include "intel_gt_mcr.h"
#include "intel_gt_print.h"
#include "intel_gt_regs.h"
@@ -51,7 +52,8 @@
* registers belonging to BCS, VCS or VECS should be implemented in
* xcs_engine_wa_init(). Workarounds for registers not belonging to a specific
* engine's MMIO range but that are part of of the common RCS/CCS reset domain
- * should be implemented in general_render_compute_wa_init().
+ * should be implemented in general_render_compute_wa_init(). The settings
+ * about the CCS load balancing should be added in ccs_engine_wa_mode().
*
* - GT workarounds: the list of these WAs is applied whenever these registers
* revert to their default values: on GPU reset, suspend/resume [1]_, etc.
@@ -2854,6 +2856,28 @@ add_render_compute_tuning_settings(struct intel_gt *gt,
wa_write_clr(wal, GEN8_GARBCNTL, GEN12_BUS_HASH_CTL_BIT_EXC);
}
+static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_list *wal)
+{
+ struct intel_gt *gt = engine->gt;
+
+ if (!IS_DG2(gt->i915))
+ return;
+
+ /*
+ * Wa_14019159160: This workaround, along with others, leads to
+ * significant challenges in utilizing load balancing among the
+ * CCS slices. Consequently, an architectural decision has been
+ * made to completely disable automatic CCS load balancing.
+ */
+ wa_masked_en(wal, GEN12_RCU_MODE, XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE);
+
+ /*
+ * After having disabled automatic load balancing we need to
+ * assign all slices to a single CCS. We will call it CCS mode 1
+ */
+ intel_gt_apply_ccs_mode(gt);
+}
+
/*
* The workarounds in this function apply to shared registers in
* the general render reset domain that aren't tied to a
@@ -3004,8 +3028,10 @@ engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal
* to a single RCS/CCS engine's workaround list since
* they're reset as part of the general render domain reset.
*/
- if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE)
+ if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE) {
general_render_compute_wa_init(engine, wal);
+ ccs_engine_wa_mode(engine, wal);
+ }
if (engine->class == COMPUTE_CLASS)
ccs_engine_wa_init(engine, wal);
diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
index 0a0a11dc9ec0..ee02cd833c5e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
@@ -812,15 +812,15 @@ op_remap(struct drm_gpuva_op_remap *r,
struct drm_gpuva_op_unmap *u = r->unmap;
struct nouveau_uvma *uvma = uvma_from_va(u->va);
u64 addr = uvma->va.va.addr;
- u64 range = uvma->va.va.range;
+ u64 end = uvma->va.va.addr + uvma->va.va.range;
if (r->prev)
addr = r->prev->va.addr + r->prev->va.range;
if (r->next)
- range = r->next->va.addr - addr;
+ end = r->next->va.addr;
- op_unmap_range(u, addr, range);
+ op_unmap_range(u, addr, end - addr);
}
static int
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
index 986e8d547c94..060c74a80eb1 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
@@ -420,7 +420,7 @@ gf100_gr_chan_new(struct nvkm_gr *base, struct nvkm_chan *fifoch,
return ret;
} else {
ret = nvkm_memory_map(gr->attrib_cb, 0, chan->vmm, chan->attrib_cb,
- &args, sizeof(args));;
+ &args, sizeof(args));
if (ret)
return ret;
}
diff --git a/drivers/gpu/drm/panfrost/panfrost_gpu.c b/drivers/gpu/drm/panfrost/panfrost_gpu.c
index 9063ce254642..fd8e44992184 100644
--- a/drivers/gpu/drm/panfrost/panfrost_gpu.c
+++ b/drivers/gpu/drm/panfrost/panfrost_gpu.c
@@ -441,19 +441,19 @@ void panfrost_gpu_power_off(struct panfrost_device *pfdev)
gpu_write(pfdev, SHADER_PWROFF_LO, pfdev->features.shader_present);
ret = readl_relaxed_poll_timeout(pfdev->iomem + SHADER_PWRTRANS_LO,
- val, !val, 1, 1000);
+ val, !val, 1, 2000);
if (ret)
dev_err(pfdev->dev, "shader power transition timeout");
gpu_write(pfdev, TILER_PWROFF_LO, pfdev->features.tiler_present);
ret = readl_relaxed_poll_timeout(pfdev->iomem + TILER_PWRTRANS_LO,
- val, !val, 1, 1000);
+ val, !val, 1, 2000);
if (ret)
dev_err(pfdev->dev, "tiler power transition timeout");
gpu_write(pfdev, L2_PWROFF_LO, pfdev->features.l2_present);
ret = readl_poll_timeout(pfdev->iomem + L2_PWRTRANS_LO,
- val, !val, 0, 1000);
+ val, !val, 0, 2000);
if (ret)
dev_err(pfdev->dev, "l2 power transition timeout");
}
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index ca85e81fdb44..d32ff3857e65 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -193,6 +193,9 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
{
struct xe_device *xe = to_xe_device(dev);
+ if (xe->preempt_fence_wq)
+ destroy_workqueue(xe->preempt_fence_wq);
+
if (xe->ordered_wq)
destroy_workqueue(xe->ordered_wq);
@@ -258,9 +261,15 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
INIT_LIST_HEAD(&xe->pinned.external_vram);
INIT_LIST_HEAD(&xe->pinned.evicted);
+ xe->preempt_fence_wq = alloc_ordered_workqueue("xe-preempt-fence-wq", 0);
xe->ordered_wq = alloc_ordered_workqueue("xe-ordered-wq", 0);
xe->unordered_wq = alloc_workqueue("xe-unordered-wq", 0, 0);
- if (!xe->ordered_wq || !xe->unordered_wq) {
+ if (!xe->ordered_wq || !xe->unordered_wq ||
+ !xe->preempt_fence_wq) {
+ /*
+ * Cleanup done in xe_device_destroy via
+ * drmm_add_action_or_reset register above
+ */
drm_err(&xe->drm, "Failed to allocate xe workqueues\n");
err = -ENOMEM;
goto err;
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 9785eef2e5a4..8e3a222b41cf 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -363,6 +363,9 @@ struct xe_device {
/** @ufence_wq: user fence wait queue */
wait_queue_head_t ufence_wq;
+ /** @preempt_fence_wq: used to serialize preempt fences */
+ struct workqueue_struct *preempt_fence_wq;
+
/** @ordered_wq: used to serialize compute mode resume */
struct workqueue_struct *ordered_wq;
diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 826c8b389672..cc5e0f75de3c 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -94,48 +94,16 @@
* Unlock all
*/
+/*
+ * Add validation and rebinding to the drm_exec locking loop, since both can
+ * trigger eviction which may require sleeping dma_resv locks.
+ */
static int xe_exec_fn(struct drm_gpuvm_exec *vm_exec)
{
struct xe_vm *vm = container_of(vm_exec->vm, struct xe_vm, gpuvm);
- struct drm_gem_object *obj;
- unsigned long index;
- int num_fences;
- int ret;
-
- ret = drm_gpuvm_validate(vm_exec->vm, &vm_exec->exec);
- if (ret)
- return ret;
-
- /*
- * 1 fence slot for the final submit, and 1 more for every per-tile for
- * GPU bind and 1 extra for CPU bind. Note that there are potentially
- * many vma per object/dma-resv, however the fence slot will just be
- * re-used, since they are largely the same timeline and the seqno
- * should be in order. In the case of CPU bind there is dummy fence used
- * for all CPU binds, so no need to have a per-tile slot for that.
- */
- num_fences = 1 + 1 + vm->xe->info.tile_count;
- /*
- * We don't know upfront exactly how many fence slots we will need at
- * the start of the exec, since the TTM bo_validate above can consume
- * numerous fence slots. Also due to how the dma_resv_reserve_fences()
- * works it only ensures that at least that many fence slots are
- * available i.e if there are already 10 slots available and we reserve
- * two more, it can just noop without reserving anything. With this it
- * is quite possible that TTM steals some of the fence slots and then
- * when it comes time to do the vma binding and final exec stage we are
- * lacking enough fence slots, leading to some nasty BUG_ON() when
- * adding the fences. Hence just add our own fences here, after the
- * validate stage.
- */
- drm_exec_for_each_locked_object(&vm_exec->exec, index, obj) {
- ret = dma_resv_reserve_fences(obj->resv, num_fences);
- if (ret)
- return ret;
- }
-
- return 0;
+ /* The fence slot added here is intended for the exec sched job. */
+ return xe_vm_validate_rebind(vm, &vm_exec->exec, 1);
}
int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
@@ -152,7 +120,6 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
struct drm_exec *exec = &vm_exec.exec;
u32 i, num_syncs = 0, num_ufence = 0;
struct xe_sched_job *job;
- struct dma_fence *rebind_fence;
struct xe_vm *vm;
bool write_locked, skip_retry = false;
ktime_t end = 0;
@@ -290,39 +257,7 @@ retry:
goto err_exec;
}
- /*
- * Rebind any invalidated userptr or evicted BOs in the VM, non-compute
- * VM mode only.
- */
- rebind_fence = xe_vm_rebind(vm, false);
- if (IS_ERR(rebind_fence)) {
- err = PTR_ERR(rebind_fence);
- goto err_put_job;
- }
-
- /*
- * We store the rebind_fence in the VM so subsequent execs don't get
- * scheduled before the rebinds of userptrs / evicted BOs is complete.
- */
- if (rebind_fence) {
- dma_fence_put(vm->rebind_fence);
- vm->rebind_fence = rebind_fence;
- }
- if (vm->rebind_fence) {
- if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
- &vm->rebind_fence->flags)) {
- dma_fence_put(vm->rebind_fence);
- vm->rebind_fence = NULL;
- } else {
- dma_fence_get(vm->rebind_fence);
- err = drm_sched_job_add_dependency(&job->drm,
- vm->rebind_fence);
- if (err)
- goto err_put_job;
- }
- }
-
- /* Wait behind munmap style rebinds */
+ /* Wait behind rebinds */
if (!xe_vm_in_lr_mode(vm)) {
err = drm_sched_job_add_resv_dependencies(&job->drm,
xe_vm_resv(vm),
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
index 62b3d9d1d7cd..462b33195032 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -148,6 +148,11 @@ struct xe_exec_queue {
const struct xe_ring_ops *ring_ops;
/** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */
struct drm_sched_entity *entity;
+ /**
+ * @tlb_flush_seqno: The seqno of the last rebind tlb flush performed
+ * Protected by @vm's resv. Unused if @vm == NULL.
+ */
+ u64 tlb_flush_seqno;
/** @lrc: logical ring context for this exec queue */
struct xe_lrc lrc[];
};
diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index 241c294270d9..fa9e9853c53b 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -100,10 +100,9 @@ static int xe_pf_begin(struct drm_exec *exec, struct xe_vma *vma,
{
struct xe_bo *bo = xe_vma_bo(vma);
struct xe_vm *vm = xe_vma_vm(vma);
- unsigned int num_shared = 2; /* slots for bind + move */
int err;
- err = xe_vm_prepare_vma(exec, vma, num_shared);
+ err = xe_vm_lock_vma(exec, vma);
if (err)
return err;
diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
index f03e077f81a0..e598a4363d01 100644
--- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
+++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
@@ -61,7 +61,6 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
INIT_LIST_HEAD(&gt->tlb_invalidation.pending_fences);
spin_lock_init(&gt->tlb_invalidation.pending_lock);
spin_lock_init(&gt->tlb_invalidation.lock);
- gt->tlb_invalidation.fence_context = dma_fence_context_alloc(1);
INIT_DELAYED_WORK(&gt->tlb_invalidation.fence_tdr,
xe_gt_tlb_fence_timeout);
diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
index 70c615dd1498..07b2f724ec45 100644
--- a/drivers/gpu/drm/xe/xe_gt_types.h
+++ b/drivers/gpu/drm/xe/xe_gt_types.h
@@ -177,13 +177,6 @@ struct xe_gt {
* xe_gt_tlb_fence_timeout after the timeut interval is over.
*/
struct delayed_work fence_tdr;
- /** @tlb_invalidation.fence_context: context for TLB invalidation fences */
- u64 fence_context;
- /**
- * @tlb_invalidation.fence_seqno: seqno to TLB invalidation fences, protected by
- * tlb_invalidation.lock
- */
- u32 fence_seqno;
/** @tlb_invalidation.lock: protects TLB invalidation fences */
spinlock_t lock;
} tlb_invalidation;
diff --git a/drivers/gpu/drm/xe/xe_preempt_fence.c b/drivers/gpu/drm/xe/xe_preempt_fence.c
index 7bce2a332603..7d50c6e89d8e 100644
--- a/drivers/gpu/drm/xe/xe_preempt_fence.c
+++ b/drivers/gpu/drm/xe/xe_preempt_fence.c
@@ -49,7 +49,7 @@ static bool preempt_fence_enable_signaling(struct dma_fence *fence)
struct xe_exec_queue *q = pfence->q;
pfence->error = q->ops->suspend(q);
- queue_work(system_unbound_wq, &pfence->preempt_work);
+ queue_work(q->vm->xe->preempt_fence_wq, &pfence->preempt_work);
return true;
}
diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index 7f54bc3e389d..4efc8c1a3d7a 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -1135,8 +1135,7 @@ static int invalidation_fence_init(struct xe_gt *gt,
spin_lock_irq(&gt->tlb_invalidation.lock);
dma_fence_init(&ifence->base.base, &invalidation_fence_ops,
&gt->tlb_invalidation.lock,
- gt->tlb_invalidation.fence_context,
- ++gt->tlb_invalidation.fence_seqno);
+ dma_fence_context_alloc(1), 1);
spin_unlock_irq(&gt->tlb_invalidation.lock);
INIT_LIST_HEAD(&ifence->base.link);
@@ -1236,6 +1235,13 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue
err = xe_pt_prepare_bind(tile, vma, entries, &num_entries);
if (err)
goto err;
+
+ err = dma_resv_reserve_fences(xe_vm_resv(vm), 1);
+ if (!err && !xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm)
+ err = dma_resv_reserve_fences(xe_vma_bo(vma)->ttm.base.resv, 1);
+ if (err)
+ goto err;
+
xe_tile_assert(tile, num_entries <= ARRAY_SIZE(entries));
xe_vm_dbg_print_entries(tile_to_xe(tile), entries, num_entries);
@@ -1254,11 +1260,13 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue
* non-faulting LR, in particular on user-space batch buffer chaining,
* it needs to be done here.
*/
- if ((rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) ||
- (!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
+ if ((!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
if (!ifence)
return ERR_PTR(-ENOMEM);
+ } else if (rebind && !xe_vm_in_lr_mode(vm)) {
+ /* We bump also if batch_invalidate_tlb is true */
+ vm->tlb_flush_seqno++;
}
rfence = kzalloc(sizeof(*rfence), GFP_KERNEL);
@@ -1297,7 +1305,7 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue
}
/* add shared fence now for pagetable delayed destroy */
- dma_resv_add_fence(xe_vm_resv(vm), fence, !rebind &&
+ dma_resv_add_fence(xe_vm_resv(vm), fence, rebind ||
last_munmap_rebind ?
DMA_RESV_USAGE_KERNEL :
DMA_RESV_USAGE_BOOKKEEP);
@@ -1576,6 +1584,7 @@ __xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queu
struct dma_fence *fence = NULL;
struct invalidation_fence *ifence;
struct xe_range_fence *rfence;
+ int err;
LLIST_HEAD(deferred);
@@ -1593,6 +1602,12 @@ __xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queu
xe_pt_calc_rfence_interval(vma, &unbind_pt_update, entries,
num_entries);
+ err = dma_resv_reserve_fences(xe_vm_resv(vm), 1);
+ if (!err && !xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm)
+ err = dma_resv_reserve_fences(xe_vma_bo(vma)->ttm.base.resv, 1);
+ if (err)
+ return ERR_PTR(err);
+
ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
if (!ifence)
return ERR_PTR(-ENOMEM);
diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
index c4edffcd4a32..5b2b37b59813 100644
--- a/drivers/gpu/drm/xe/xe_ring_ops.c
+++ b/drivers/gpu/drm/xe/xe_ring_ops.c
@@ -219,10 +219,9 @@ static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc
{
u32 dw[MAX_JOB_SIZE_DW], i = 0;
u32 ppgtt_flag = get_ppgtt_flag(job);
- struct xe_vm *vm = job->q->vm;
struct xe_gt *gt = job->q->gt;
- if (vm && vm->batch_invalidate_tlb) {
+ if (job->ring_ops_flush_tlb) {
dw[i++] = preparser_disable(true);
i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
seqno, true, dw, i);
@@ -270,7 +269,6 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
struct xe_gt *gt = job->q->gt;
struct xe_device *xe = gt_to_xe(gt);
bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE;
- struct xe_vm *vm = job->q->vm;
dw[i++] = preparser_disable(true);
@@ -282,13 +280,13 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i);
}
- if (vm && vm->batch_invalidate_tlb)
+ if (job->ring_ops_flush_tlb)
i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
seqno, true, dw, i);
dw[i++] = preparser_disable(false);
- if (!vm || !vm->batch_invalidate_tlb)
+ if (!job->ring_ops_flush_tlb)
i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
seqno, dw, i);
@@ -317,7 +315,6 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
struct xe_gt *gt = job->q->gt;
struct xe_device *xe = gt_to_xe(gt);
bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
- struct xe_vm *vm = job->q->vm;
u32 mask_flags = 0;
dw[i++] = preparser_disable(true);
@@ -327,7 +324,7 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS;
/* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */
- i = emit_pipe_invalidate(mask_flags, vm && vm->batch_invalidate_tlb, dw, i);
+ i = emit_pipe_invalidate(mask_flags, job->ring_ops_flush_tlb, dw, i);
/* hsdes: 1809175790 */
if (has_aux_ccs(xe))
diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
index 8151ddafb940..b0c7fa4693cf 100644
--- a/drivers/gpu/drm/xe/xe_sched_job.c
+++ b/drivers/gpu/drm/xe/xe_sched_job.c
@@ -250,6 +250,16 @@ bool xe_sched_job_completed(struct xe_sched_job *job)
void xe_sched_job_arm(struct xe_sched_job *job)
{
+ struct xe_exec_queue *q = job->q;
+ struct xe_vm *vm = q->vm;
+
+ if (vm && !xe_sched_job_is_migration(q) && !xe_vm_in_lr_mode(vm) &&
+ (vm->batch_invalidate_tlb || vm->tlb_flush_seqno != q->tlb_flush_seqno)) {
+ xe_vm_assert_held(vm);
+ q->tlb_flush_seqno = vm->tlb_flush_seqno;
+ job->ring_ops_flush_tlb = true;
+ }
+
drm_sched_job_arm(&job->drm);
}
diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h
index b1d83da50a53..5e12724219fd 100644
--- a/drivers/gpu/drm/xe/xe_sched_job_types.h
+++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
@@ -39,6 +39,8 @@ struct xe_sched_job {
} user_fence;
/** @migrate_flush_flags: Additional flush flags for migration jobs */
u32 migrate_flush_flags;
+ /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */
+ bool ring_ops_flush_tlb;
/** @batch_addr: batch buffer address of job */
u64 batch_addr[];
};
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index f88faef4142b..62d1ef8867a8 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -482,17 +482,53 @@ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
return 0;
}
+/**
+ * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas
+ * @vm: The vm for which we are rebinding.
+ * @exec: The struct drm_exec with the locked GEM objects.
+ * @num_fences: The number of fences to reserve for the operation, not
+ * including rebinds and validations.
+ *
+ * Validates all evicted gem objects and rebinds their vmas. Note that
+ * rebindings may cause evictions and hence the validation-rebind
+ * sequence is rerun until there are no more objects to validate.
+ *
+ * Return: 0 on success, negative error code on error. In particular,
+ * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if
+ * the drm_exec transaction needs to be restarted.
+ */
+int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
+ unsigned int num_fences)
+{
+ struct drm_gem_object *obj;
+ unsigned long index;
+ int ret;
+
+ do {
+ ret = drm_gpuvm_validate(&vm->gpuvm, exec);
+ if (ret)
+ return ret;
+
+ ret = xe_vm_rebind(vm, false);
+ if (ret)
+ return ret;
+ } while (!list_empty(&vm->gpuvm.evict.list));
+
+ drm_exec_for_each_locked_object(exec, index, obj) {
+ ret = dma_resv_reserve_fences(obj->resv, num_fences);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
bool *done)
{
int err;
- /*
- * 1 fence for each preempt fence plus a fence for each tile from a
- * possible rebind
- */
- err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, vm->preempt.num_exec_queues +
- vm->xe->info.tile_count);
+ err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0);
if (err)
return err;
@@ -507,7 +543,7 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
return 0;
}
- err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, vm->preempt.num_exec_queues);
+ err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0);
if (err)
return err;
@@ -515,14 +551,19 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
if (err)
return err;
- return drm_gpuvm_validate(&vm->gpuvm, exec);
+ /*
+ * Add validation and rebinding to the locking loop since both can
+ * cause evictions which may require blocing dma_resv locks.
+ * The fence reservation here is intended for the new preempt fences
+ * we attach at the end of the rebind work.
+ */
+ return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
}
static void preempt_rebind_work_func(struct work_struct *w)
{
struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
struct drm_exec exec;
- struct dma_fence *rebind_fence;
unsigned int fence_count = 0;
LIST_HEAD(preempt_fences);
ktime_t end = 0;
@@ -568,18 +609,11 @@ retry:
if (err)
goto out_unlock;
- rebind_fence = xe_vm_rebind(vm, true);
- if (IS_ERR(rebind_fence)) {
- err = PTR_ERR(rebind_fence);
+ err = xe_vm_rebind(vm, true);
+ if (err)
goto out_unlock;
- }
- if (rebind_fence) {
- dma_fence_wait(rebind_fence, false);
- dma_fence_put(rebind_fence);
- }
-
- /* Wait on munmap style VM unbinds */
+ /* Wait on rebinds and munmap style VM unbinds */
wait = dma_resv_wait_timeout(xe_vm_resv(vm),
DMA_RESV_USAGE_KERNEL,
false, MAX_SCHEDULE_TIMEOUT);
@@ -773,14 +807,14 @@ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
struct xe_sync_entry *syncs, u32 num_syncs,
bool first_op, bool last_op);
-struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
+int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
{
- struct dma_fence *fence = NULL;
+ struct dma_fence *fence;
struct xe_vma *vma, *next;
lockdep_assert_held(&vm->lock);
if (xe_vm_in_lr_mode(vm) && !rebind_worker)
- return NULL;
+ return 0;
xe_vm_assert_held(vm);
list_for_each_entry_safe(vma, next, &vm->rebind_list,
@@ -788,17 +822,17 @@ struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
xe_assert(vm->xe, vma->tile_present);
list_del_init(&vma->combined_links.rebind);
- dma_fence_put(fence);
if (rebind_worker)
trace_xe_vma_rebind_worker(vma);
else
trace_xe_vma_rebind_exec(vma);
fence = xe_vm_bind_vma(vma, NULL, NULL, 0, false, false);
if (IS_ERR(fence))
- return fence;
+ return PTR_ERR(fence);
+ dma_fence_put(fence);
}
- return fence;
+ return 0;
}
static void xe_vma_free(struct xe_vma *vma)
@@ -1004,35 +1038,26 @@ static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
}
/**
- * xe_vm_prepare_vma() - drm_exec utility to lock a vma
+ * xe_vm_lock_vma() - drm_exec utility to lock a vma
* @exec: The drm_exec object we're currently locking for.
* @vma: The vma for witch we want to lock the vm resv and any attached
* object's resv.
- * @num_shared: The number of dma-fence slots to pre-allocate in the
- * objects' reservation objects.
*
* Return: 0 on success, negative error code on error. In particular
* may return -EDEADLK on WW transaction contention and -EINTR if
* an interruptible wait is terminated by a signal.
*/
-int xe_vm_prepare_vma(struct drm_exec *exec, struct xe_vma *vma,
- unsigned int num_shared)
+int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
{
struct xe_vm *vm = xe_vma_vm(vma);
struct xe_bo *bo = xe_vma_bo(vma);
int err;
XE_WARN_ON(!vm);
- if (num_shared)
- err = drm_exec_prepare_obj(exec, xe_vm_obj(vm), num_shared);
- else
- err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
- if (!err && bo && !bo->vm) {
- if (num_shared)
- err = drm_exec_prepare_obj(exec, &bo->ttm.base, num_shared);
- else
- err = drm_exec_lock_obj(exec, &bo->ttm.base);
- }
+
+ err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
+ if (!err && bo && !bo->vm)
+ err = drm_exec_lock_obj(exec, &bo->ttm.base);
return err;
}
@@ -1044,7 +1069,7 @@ static void xe_vma_destroy_unlocked(struct xe_vma *vma)
drm_exec_init(&exec, 0, 0);
drm_exec_until_all_locked(&exec) {
- err = xe_vm_prepare_vma(&exec, vma, 0);
+ err = xe_vm_lock_vma(&exec, vma);
drm_exec_retry_on_contention(&exec);
if (XE_WARN_ON(err))
break;
@@ -1589,7 +1614,6 @@ static void vm_destroy_work_func(struct work_struct *w)
XE_WARN_ON(vm->pt_root[id]);
trace_xe_vm_free(vm);
- dma_fence_put(vm->rebind_fence);
kfree(vm);
}
@@ -2512,7 +2536,7 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
lockdep_assert_held_write(&vm->lock);
- err = xe_vm_prepare_vma(exec, vma, 1);
+ err = xe_vm_lock_vma(exec, vma);
if (err)
return err;
diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
index 6df1f1c7f85d..306cd0934a19 100644
--- a/drivers/gpu/drm/xe/xe_vm.h
+++ b/drivers/gpu/drm/xe/xe_vm.h
@@ -207,7 +207,7 @@ int __xe_vm_userptr_needs_repin(struct xe_vm *vm);
int xe_vm_userptr_check_repin(struct xe_vm *vm);
-struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker);
+int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker);
int xe_vm_invalidate_vma(struct xe_vma *vma);
@@ -242,8 +242,10 @@ bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end);
int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id);
-int xe_vm_prepare_vma(struct drm_exec *exec, struct xe_vma *vma,
- unsigned int num_shared);
+int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma);
+
+int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
+ unsigned int num_fences);
/**
* xe_vm_resv() - Return's the vm's reservation object
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index ae5fb565f6bf..badf3945083d 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -177,9 +177,6 @@ struct xe_vm {
*/
struct list_head rebind_list;
- /** @rebind_fence: rebind fence from execbuf */
- struct dma_fence *rebind_fence;
-
/**
* @destroy_work: worker to destroy VM, needed as a dma_fence signaling
* from an irq context can be last put and the destroy needs to be able
@@ -264,6 +261,11 @@ struct xe_vm {
bool capture_once;
} error_capture;
+ /**
+ * @tlb_flush_seqno: Required TLB flush seqno for the next exec.
+ * protected by the vm resv.
+ */
+ u64 tlb_flush_seqno;
/** @batch_invalidate_tlb: Always invalidate TLB before batch start */
bool batch_invalidate_tlb;
/** @xef: XE file handle for tracking this VM's drm client */