[SRU,X,1/1] nvme/pci: Poll CQ on timeout

Message ID 20181207212811.12826-2-gpiccoli@canonical.com
State New
Series [SRU,X,1/1] nvme/pci: Poll CQ on timeout

Commit Message

Guilherme G. Piccoli Dec. 7, 2018, 9:28 p.m. UTC
From: Keith Busch <keith.busch@intel.com>

BugLink: https://launchpad.net/bugs/1807393

If an IO timeout occurs, it's helpful to know if the controller did not
post a completion or the driver missed an interrupt. While we never expect
the latter, this patch will make it possible to tell the difference so
we don't have to guess.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
(backported from 7776db1ccc123d5944a8c170c9c45f7e91d49643 upstream)
[gpiccoli: context adjustment, fixed struct member access that changed]
Signed-off-by: Guilherme G. Piccoli <gpiccoli@canonical.com>
---
 drivers/nvme/host/pci.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)
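
For reviewers skimming the thread, the change condenses to the sketch below. It is assembled from the hunks further down, keeps the names used in the Xenial tree as shown in the diff (dev->dev, nvmeq->q_lock), elides the unchanged function bodies behind comments, and is not meant to compile on its own:

/* The blk-mq poll entry point is split so the timeout handler can reuse
 * the CQ-polling logic without needing a hardware context. */
static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
{
	if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
		spin_lock_irq(&nvmeq->q_lock);
		__nvme_process_cq(nvmeq, &tag);
		spin_unlock_irq(&nvmeq->q_lock);
		/* ... unchanged: returns 1 here if the polled CQ held this tag ... */
	}
	return 0;
}

static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	return __nvme_poll(nvmeq, tag);
}

static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	/* ... */

	/* Did we miss an interrupt?  Poll the CQ before escalating. */
	if (__nvme_poll(nvmeq, req->tag)) {
		dev_warn(dev->dev,
			"I/O %d QID %d timeout, completion polled\n",
			req->tag, nvmeq->qid);
		return BLK_EH_HANDLED;	/* controller had already completed it */
	}

	/* ... unchanged: existing abort/controller-reset handling ... */
}

In practice, a timed-out request whose completion the poll finds is logged with the new "completion polled" warning and handled, instead of going through the abort/reset path, which is exactly the distinction the commit message asks for.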

Comments

Kleber Sacilotto de Souza Dec. 21, 2018, 3:23 p.m. UTC | #1
On 12/7/18 10:28 PM, Guilherme G. Piccoli wrote:
> From: Keith Busch <keith.busch@intel.com>
>
> BugLink: https://launchpad.net/bugs/1807393
>
> If an IO timeout occurs, it's helpful to know if the controller did not
> post a completion or the driver missed an interrupt. While we never expect
> the latter, this patch will make it possible to tell the difference so
> we don't have to guess.
>
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Tested-by: Johannes Thumshirn <jthumshirn@suse.de>
> Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
> (backported from 7776db1ccc123d5944a8c170c9c45f7e91d49643 upstream)
> [gpiccoli: context adjustment, fixed struct member access that changed]
> Signed-off-by: Guilherme G. Piccoli <gpiccoli@canonical.com>

Thanks for the additional info, Guilherme.


Upstream fix with low regression potential:

Acked-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>


Stefan Bader Jan. 8, 2019, 11:06 a.m. UTC | #2
On 07.12.18 22:28, Guilherme G. Piccoli wrote:
> From: Keith Busch <keith.busch@intel.com>
> 
> BugLink: https://launchpad.net/bugs/1807393
> 
> If an IO timeout occurs, it's helpful to know if the controller did not
> post a completion or the driver missed an interrupt. While we never expect
> the latter, this patch will make it possible to tell the difference so
> we don't have to guess.
> 
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Tested-by: Johannes Thumshirn <jthumshirn@suse.de>
> Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
> (backported from 7776db1ccc123d5944a8c170c9c45f7e91d49643 upstream)
> [gpiccoli: context adjustment, fixed struct member access that changed]
> Signed-off-by: Guilherme G. Piccoli <gpiccoli@canonical.com>
Acked-by: Stefan Bader <stefan.bader@canonical.com>

Patch

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3752052ae20a..5d2a7ee2f922 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -929,10 +929,8 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
 	return IRQ_NONE;
 }
 
-static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
 {
-	struct nvme_queue *nvmeq = hctx->driver_data;
-
 	if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
 		spin_lock_irq(&nvmeq->q_lock);
 		__nvme_process_cq(nvmeq, &tag);
@@ -945,6 +943,13 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
 	return 0;
 }
 
+static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+{
+	struct nvme_queue *nvmeq = hctx->driver_data;
+
+	return __nvme_poll(nvmeq, tag);
+}
+
 static void nvme_async_event_work(struct work_struct *work)
 {
 	struct nvme_dev *dev = container_of(work, struct nvme_dev, async_work);
@@ -1045,6 +1050,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	struct request *abort_req;
 	struct nvme_command cmd;
 
+	/*
+	* Did we miss an interrupt?
+	*/
+	if (__nvme_poll(nvmeq, req->tag)) {
+		dev_warn(dev->dev,
+			"I/O %d QID %d timeout, completion polled\n",
+			req->tag, nvmeq->qid);
+		return BLK_EH_HANDLED;
+	}
+
 	/*
 	 * Shutdown immediately if controller times out while starting. The
 	 * reset work will see the pci device disabled when it gets the forced