diff mbox

[RFC,v2,4/5] gk104: channel timeout detection

Message ID 1441021115-28537-5-git-send-email-kholtta@nvidia.com
State Deferred
Headers show

Commit Message

Konsta Hölttä Aug. 31, 2015, 11:38 a.m. UTC
Enable the scheduling timeout error interrupt and program its period to a
low value so that it fires periodically, since the event can be missed in
HW under certain conditions. In the error handler, increment a
channel-specific counter in software if the current channel has not
advanced. Abort the channel once the accumulated time reaches the timeout
limit (detection granularity equals the check period). The error notifier
is set to NOUVEAU_GEM_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT when this occurs.

A new KEPLER_SET_CHANNEL_TIMEOUT mthd sets the timeout limit, in
milliseconds. The interrupt granularity is set to 100 ms.

Status-bit decoding in the sched error handler is also cleaned up.

Signed-off-by: Konsta Hölttä <kholtta@nvidia.com>
---
 drm/nouveau/include/nvif/class.h     |  8 ++++
 drm/nouveau/nvkm/engine/fifo/gk104.c | 92 +++++++++++++++++++++++++++++-------
 2 files changed, 84 insertions(+), 16 deletions(-)
diff mbox

Patch

diff --git a/drm/nouveau/include/nvif/class.h b/drm/nouveau/include/nvif/class.h
index 72c3b37..9b568dc 100644
--- a/drm/nouveau/include/nvif/class.h
+++ b/drm/nouveau/include/nvif/class.h
@@ -620,18 +620,26 @@  struct fermi_a_zbc_depth_v0 {
 	__u8  format;
 	__u8  index;
 	__u8  pad03[5];
 	__u32 ds;
 	__u32 l2;
 };
 
 #define KEPLER_SET_CHANNEL_PRIORITY                                        0x00
+#define KEPLER_SET_CHANNEL_TIMEOUT                                         0x01
+
 struct kepler_set_channel_priority_v0 {
 	__u8  version;
 #define KEPLER_SET_CHANNEL_PRIORITY_LOW                                    0x00
 #define KEPLER_SET_CHANNEL_PRIORITY_MEDIUM                                 0x01
 #define KEPLER_SET_CHANNEL_PRIORITY_HIGH                                   0x02
 	__u8 priority;
 	__u8  pad03[6];
 };
 
+struct kepler_set_channel_timeout_v0 {
+	__u8  version;
+	__u8  pad03[3];
+	__u32 timeout_ms;
+};
+
 #endif
diff --git a/drm/nouveau/nvkm/engine/fifo/gk104.c b/drm/nouveau/nvkm/engine/fifo/gk104.c
index fda726d..53a464d 100644
--- a/drm/nouveau/nvkm/engine/fifo/gk104.c
+++ b/drm/nouveau/nvkm/engine/fifo/gk104.c
@@ -49,16 +49,20 @@  static const struct {
 	_(NVDEV_ENGINE_MSVLD   , 0),
 	_(NVDEV_ENGINE_CE0     , 0),
 	_(NVDEV_ENGINE_CE1     , 0),
 	_(NVDEV_ENGINE_MSENC   , 0),
 };
 #undef _
 #define FIFO_ENGINE_NR ARRAY_SIZE(fifo_engine)
 
+#define CTXSW_STATUS_LOAD 5
+#define CTXSW_STATUS_SAVE 6
+#define CTXSW_STATUS_SWITCH 7
+
 struct gk104_fifo_engn {
 	struct nvkm_gpuobj *runlist[2];
 	int cur_runlist;
 	wait_queue_head_t wait;
 };
 
 struct gk104_fifo_priv {
 	struct nvkm_fifo base;
@@ -83,18 +87,25 @@  struct gk104_fifo_base {
 struct gk104_fifo_chan {
 	struct nvkm_fifo_chan base;
 	u32 engine;
 	enum {
 		STOPPED,
 		RUNNING,
 		KILLED
 	} state;
+	struct {
+		u32 sum_ms;
+		u32 limit_ms;
+		u32 gpfifo_get;
+	} timeout;
 };
 
+#define GRFIFO_TIMEOUT_CHECK_PERIOD_MS 100
+
 /*******************************************************************************
  * FIFO channel objects
  ******************************************************************************/
 
 static void
 gk104_fifo_runlist_update(struct gk104_fifo_priv *priv, u32 engine)
 {
 	struct nvkm_bar *bar = nvkm_bar(priv);
@@ -288,16 +299,21 @@  gk104_fifo_chan_ctor(struct nvkm_object *parent, struct nvkm_object *engine,
 	nv_wo32(base, 0x94, 0x30000001);
 	nv_wo32(base, 0x9c, 0x00000100);
 	nv_wo32(base, 0xac, 0x0000001f);
 	nv_wo32(base, 0xe8, chan->base.chid);
 	nv_wo32(base, 0xb8, 0xf8000000);
 	nv_wo32(base, 0xf8, 0x10003080); /* 0x002310 */
 	nv_wo32(base, 0xfc, 0x10000010); /* 0x002350 */
 	bar->flush(bar);
+
+	chan->timeout.sum_ms = 0;
+	chan->timeout.limit_ms = -1;
+	chan->timeout.gpfifo_get = 0;
+
 	return 0;
 }
 
 static int
 gk104_fifo_chan_init(struct nvkm_object *object)
 {
 	struct nvkm_gpuobj *base = nv_gpuobj(object->parent);
 	struct gk104_fifo_priv *priv = (void *)object->engine;
@@ -381,21 +397,39 @@  gk104_fifo_chan_set_priority(struct nvkm_object *object, void *data, u32 size)
 		}
 		return gk104_fifo_set_runlist_timeslice(priv, chan, slice);
 	}
 
 	return ret;
 }
 
 int
+gk104_fifo_chan_set_timeout(struct nvkm_object *object, void *data, u32 size)
+{
+	struct gk104_fifo_chan *chan = (void *)object;
+	union {
+		struct kepler_set_channel_timeout_v0 v0;
+	} *args = data;
+	int ret;
+
+	if (nvif_unpack(args->v0, 0, 0, false)) {
+		chan->timeout.limit_ms = args->v0.timeout_ms;
+	}
+
+	return ret;
+}
+
+int
 gk104_fifo_chan_mthd(struct nvkm_object *object, u32 mthd, void *data, u32 size)
 {
 	switch (mthd) {
 	case KEPLER_SET_CHANNEL_PRIORITY:
 		return gk104_fifo_chan_set_priority(object, data, size);
+	case KEPLER_SET_CHANNEL_TIMEOUT:
+		return gk104_fifo_chan_set_timeout(object, data, size);
 	default:
 		break;
 	}
 	return -EINVAL;
 }
 
 struct nvkm_ofuncs
 gk104_fifo_chan_ofuncs = {
@@ -606,61 +640,83 @@  gk104_fifo_intr_bind(struct gk104_fifo_priv *priv)
 }
 
 static const struct nvkm_enum
 gk104_fifo_sched_reason[] = {
 	{ 0x0a, "CTXSW_TIMEOUT" },
 	{}
 };
 
+static bool
+gk104_fifo_update_timeout(struct gk104_fifo_priv *priv,
+		struct gk104_fifo_chan *chan, u32 dt)
+{
+	u32 gpfifo_get = nv_rd32(priv, 0x88);
+	/* advancing, but slowly; reset counting */
+	if (gpfifo_get != chan->timeout.gpfifo_get)
+		chan->timeout.sum_ms = 0;
+
+	chan->timeout.sum_ms += dt;
+	chan->timeout.gpfifo_get = gpfifo_get;
+
+	return chan->timeout.sum_ms > chan->timeout.limit_ms;
+}
+
 static void
 gk104_fifo_intr_sched_ctxsw(struct gk104_fifo_priv *priv)
 {
 	struct nvkm_engine *engine;
 	struct gk104_fifo_chan *chan;
 	u32 engn;
 
 	for (engn = 0; engn < ARRAY_SIZE(fifo_engine); engn++) {
-		u32 stat = nv_rd32(priv, 0x002640 + (engn * 0x04));
-		u32 busy = (stat & 0x80000000);
-		u32 next = (stat & 0x07ff0000) >> 16;
-		u32 chsw = (stat & 0x00008000);
-		u32 save = (stat & 0x00004000);
-		u32 load = (stat & 0x00002000);
-		u32 prev = (stat & 0x000007ff);
-		u32 chid = load ? next : prev;
-		(void)save;
-
-		if (busy && chsw) {
+		u32 engstat = nv_rd32(priv, 0x002640 + (engn * 0x04));
+		u32 busy = (engstat & 0x80000000);
+		u32 next = (engstat & 0x07ff0000) >> 16;
+		u32 ctxstat = (engstat & 0x0000e000) >> 13;
+		u32 prev = (engstat & 0x000007ff);
+
+		u32 chid = ctxstat == CTXSW_STATUS_LOAD ? next : prev;
+		u32 ctxsw_active = ctxstat == CTXSW_STATUS_LOAD ||
+			ctxstat == CTXSW_STATUS_SAVE ||
+			ctxstat == CTXSW_STATUS_SWITCH;
+
+		if (busy && ctxsw_active) {
 			if (!(chan = (void *)priv->base.channel[chid]))
 				continue;
 			if (!(engine = gk104_fifo_engine(priv, engn)))
 				continue;
 
-			nvkm_fifo_eevent(&priv->base, chid,
-					NOUVEAU_GEM_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
-
-			gk104_fifo_recover(priv, engine, chan);
+			if (gk104_fifo_update_timeout(priv, chan,
+						GRFIFO_TIMEOUT_CHECK_PERIOD_MS)) {
+				nvkm_fifo_eevent(&priv->base, chid,
+						NOUVEAU_GEM_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
+				gk104_fifo_recover(priv, engine, chan);
+			} else {
+				nv_debug(priv, "fifo waiting for ctxsw %d ms on ch %d\n",
+						chan->timeout.sum_ms, chid);
+			}
 		}
 	}
 }
 
 static void
 gk104_fifo_intr_sched(struct gk104_fifo_priv *priv)
 {
 	u32 intr = nv_rd32(priv, 0x00254c);
 	u32 code = intr & 0x000000ff;
 	const struct nvkm_enum *en;
 	char enunk[6] = "";
 
 	en = nvkm_enum_find(gk104_fifo_sched_reason, code);
 	if (!en)
 		snprintf(enunk, sizeof(enunk), "UNK%02x", code);
 
-	nv_error(priv, "SCHED_ERROR [ %s ]\n", en ? en->name : enunk);
+	/* this is a normal situation, not so loud */
+	nv_debug(priv, "SCHED_ERROR [ %s ]\n", en ? en->name : enunk);
 
 	switch (code) {
 	case 0x0a:
 		gk104_fifo_intr_sched_ctxsw(priv);
 		break;
 	default:
 		break;
 	}
@@ -1133,18 +1189,22 @@  gk104_fifo_init(struct nvkm_object *object)
 	/* PBDMA[n].HCE */
 	for (i = 0; i < priv->spoon_nr; i++) {
 		nv_wr32(priv, 0x040148 + (i * 0x2000), 0xffffffff); /* INTR */
 		nv_wr32(priv, 0x04014c + (i * 0x2000), 0xffffffff); /* INTREN */
 	}
 
 	nv_wr32(priv, 0x002254, 0x10000000 | priv->user.bar.offset >> 12);
 
+	/* enable interrupts */
 	nv_wr32(priv, 0x002100, 0xffffffff);
 	nv_wr32(priv, 0x002140, 0x7fffffff);
+
+	/* engine context switch timeout */
+	nv_wr32(priv, 0x002a0c, 0x80000000 | (1000 * GRFIFO_TIMEOUT_CHECK_PERIOD_MS));
 	return 0;
 }
 
 void
 gk104_fifo_dtor(struct nvkm_object *object)
 {
 	struct gk104_fifo_priv *priv = (void *)object;
 	int i;