diff mbox

[3/8] drm/i915/bdw: Apply workarounds in render ring init function

Message ID 1427374172-4926-4-git-send-email-tjaalton@ubuntu.com
State New
Headers show

Commit Message

Timo Aaltonen March 26, 2015, 12:49 p.m. UTC
From: Arun Siluvery <arun.siluvery@linux.intel.com>

BugLink: http://bugs.launchpad.net/bugs/1374389

For BDW workarounds are currently initialized in init_clock_gating() but
they are lost during reset, suspend/resume etc; this patch moves the WAs
that are part of register state context to render ring init fn otherwise
default context ends up with incorrect values as they don't get initialized
until init_clock_gating fn.

v2: Add workarounds to golden render state
This method has its own issues, first of all this is different for
each gen and it is generated using a tool so adding new workaround
and mainitaining them across gens is not a straightforward process.

v3: Use LRIs to emit these workarounds (Ville)
Instead of modifying the golden render state the same LRIs are
emitted from within the driver.

v4: Use abstract name when exporting gen specific routines (Chris)

For: VIZ-4092
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
(backported from commit 86d7f23842f1bce3ab5e8c8d0c676112bbc4c99b)
Signed-off-by: Timo Aaltonen <timo.aaltonen@canonical.com>

Conflicts:
	drivers/gpu/drm/i915/intel_ringbuffer.c
---
 drivers/gpu/drm/i915/i915_gem_context.c |  6 +++
 drivers/gpu/drm/i915/intel_pm.c         | 48 -------------------
 drivers/gpu/drm/i915/intel_ringbuffer.c | 82 ++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/intel_ringbuffer.h |  2 +
 4 files changed, 89 insertions(+), 49 deletions(-)
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index a5ddf3b..cc24e71 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -706,6 +706,12 @@  done:
 	to->last_ring = ring;
 
 	if (uninitialized) {
+		if (ring->init_context) {
+			ret = ring->init_context(ring);
+			if (ret)
+				DRM_ERROR("ring init context: %d\n", ret);
+		}
+
 		ret = i915_gem_render_state_init(ring);
 		if (ret)
 			DRM_ERROR("init render state: %d\n", ret);
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 9c474d5..6fcdd27 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -5085,37 +5085,12 @@  static void gen8_init_clock_gating(struct drm_device *dev)
 	/* FIXME(BDW): Check all the w/a, some might only apply to
 	 * pre-production hw. */
 
-	/* WaDisablePartialInstShootdown:bdw */
-	I915_WRITE(GEN8_ROW_CHICKEN,
-		   _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE));
-
-	/* WaDisableThreadStallDopClockGating:bdw */
-	/* FIXME: Unclear whether we really need this on production bdw. */
-	I915_WRITE(GEN8_ROW_CHICKEN,
-		   _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
 
-	/*
-	 * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
-	 * pre-production hardware
-	 */
-	I915_WRITE(HALF_SLICE_CHICKEN3,
-		   _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS));
-	I915_WRITE(HALF_SLICE_CHICKEN3,
-		   _MASKED_BIT_ENABLE(GEN8_SAMPLER_POWER_BYPASS_DIS));
 	I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_BWGTLB_DISABLE));
 
 	I915_WRITE(_3D_CHICKEN3,
 		   _MASKED_BIT_ENABLE(_3D_CHICKEN_SDE_LIMIT_FIFO_POLY_DEPTH(2)));
 
-	I915_WRITE(COMMON_SLICE_CHICKEN2,
-		   _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
-
-	I915_WRITE(GEN7_HALF_SLICE_CHICKEN1,
-		   _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
-
-	/* WaDisableDopClockGating:bdw May not be needed for production */
-	I915_WRITE(GEN7_ROW_CHICKEN2,
-		   _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
 
 	/* WaSwitchSolVfFArbitrationPriority:bdw */
 	I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL);
@@ -5131,31 +5106,12 @@  static void gen8_init_clock_gating(struct drm_device *dev)
 			   BDW_DPRS_MASK_VBLANK_SRD);
 	}
 
-	/* Use Force Non-Coherent whenever executing a 3D context. This is a
-	 * workaround for for a possible hang in the unlikely event a TLB
-	 * invalidation occurs during a PSD flush.
-	 */
-	I915_WRITE(HDC_CHICKEN0,
-		   I915_READ(HDC_CHICKEN0) |
-		   _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
-
 	/* WaVSRefCountFullforceMissDisable:bdw */
 	/* WaDSRefCountFullforceMissDisable:bdw */
 	I915_WRITE(GEN7_FF_THREAD_MODE,
 		   I915_READ(GEN7_FF_THREAD_MODE) &
 		   ~(GEN8_FF_DS_REF_CNT_FFME | GEN7_FF_VS_REF_CNT_FFME));
 
-	/*
-	 * BSpec recommends 8x4 when MSAA is used,
-	 * however in practice 16x4 seems fastest.
-	 *
-	 * Note that PS/WM thread counts depend on the WIZ hashing
-	 * disable bit, which we don't touch here, but it's good
-	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
-	 */
-	I915_WRITE(GEN7_GT_MODE,
-		   GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
-
 	I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
 		   _MASKED_BIT_ENABLE(GEN8_RC_SEMA_IDLE_MSG_DISABLE));
 
@@ -5163,10 +5119,6 @@  static void gen8_init_clock_gating(struct drm_device *dev)
 	I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) |
 		   GEN8_SDEUNIT_CLOCK_GATE_DISABLE);
 
-	/* Wa4x4STCOptimizationDisable:bdw */
-	I915_WRITE(CACHE_MODE_1,
-		   _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
-
 	lpt_init_clock_gating(dev);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 6a7a35a..b82d2db 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -605,6 +605,84 @@  err:
 	return ret;
 }
 
+static inline void intel_ring_emit_wa(struct intel_engine_cs *ring,
+				       u32 addr, u32 value)
+{
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit(ring, addr);
+	intel_ring_emit(ring, value);
+}
+
+static int gen8_init_workarounds(struct intel_engine_cs *ring)
+{
+	int ret;
+
+	/*
+	 * workarounds applied in this fn are part of register state context,
+	 * they need to be re-initialized followed by gpu reset, suspend/resume,
+	 * module reload.
+	 */
+
+	/*
+	 * update the number of dwords required based on the
+	 * actual number of workarounds applied
+	 */
+	ret = intel_ring_begin(ring, 24);
+	if (ret)
+		return ret;
+
+	/* WaDisablePartialInstShootdown:bdw */
+	/* WaDisableThreadStallDopClockGating:bdw */
+	/* FIXME: Unclear whether we really need this on production bdw. */
+	intel_ring_emit_wa(ring, GEN8_ROW_CHICKEN,
+			   _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE
+					     | STALL_DOP_GATING_DISABLE));
+
+	/* WaDisableDopClockGating:bdw May not be needed for production */
+	intel_ring_emit_wa(ring, GEN7_ROW_CHICKEN2,
+			   _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
+
+	/*
+	 * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
+	 * pre-production hardware
+	 */
+	intel_ring_emit_wa(ring, HALF_SLICE_CHICKEN3,
+			   _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS
+					      | GEN8_SAMPLER_POWER_BYPASS_DIS));
+
+	intel_ring_emit_wa(ring, GEN7_HALF_SLICE_CHICKEN1,
+			   _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
+
+	intel_ring_emit_wa(ring, COMMON_SLICE_CHICKEN2,
+			   _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
+
+	/* Use Force Non-Coherent whenever executing a 3D context. This is a
+	 * workaround for for a possible hang in the unlikely event a TLB
+	 * invalidation occurs during a PSD flush.
+	 */
+	intel_ring_emit_wa(ring, HDC_CHICKEN0,
+			   _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
+
+	/* Wa4x4STCOptimizationDisable:bdw */
+	intel_ring_emit_wa(ring, CACHE_MODE_1,
+			   _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
+
+	/*
+	 * BSpec recommends 8x4 when MSAA is used,
+	 * however in practice 16x4 seems fastest.
+	 *
+	 * Note that PS/WM thread counts depend on the WIZ hashing
+	 * disable bit, which we don't touch here, but it's good
+	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
+	 */
+	intel_ring_emit_wa(ring, GEN7_GT_MODE,
+			   GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
+
+	intel_ring_advance(ring);
+
+	return 0;
+}
+
 static int init_render_ring(struct intel_engine_cs *ring)
 {
 	struct drm_device *dev = ring->dev;
@@ -1964,7 +2042,9 @@  int intel_init_render_ring_buffer(struct drm_device *dev)
 	ring->id = RCS;
 	ring->mmio_base = RENDER_RING_BASE;
 
-	if (INTEL_INFO(dev)->gen >= 6) {
+	if (INTEL_INFO(dev)->gen >= 8) {
+		ring->init_context = gen8_init_workarounds;
+	} else if (INTEL_INFO(dev)->gen >= 6) {
 		ring->add_request = gen6_add_request;
 		ring->flush = gen7_render_ring_flush;
 		if (INTEL_INFO(dev)->gen == 6)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index e72017b..2d45ce7 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -104,6 +104,8 @@  struct  intel_engine_cs {
 
 	int		(*init)(struct intel_engine_cs *ring);
 
+	int		(*init_context)(struct intel_engine_cs *ring);
+
 	void		(*write_tail)(struct intel_engine_cs *ring,
 				      u32 value);
 	int __must_check (*flush)(struct intel_engine_cs *ring,