diff mbox

[2/3,Xenial,SRU] NVMe: Move error handling to failed reset handler

Message ID 1464187653-3754-2-git-send-email-tim.gardner@canonical.com
State New
Headers show

Commit Message

Tim Gardner May 25, 2016, 2:47 p.m. UTC
From: Keith Busch <keith.busch@intel.com>

BugLink: http://bugs.launchpad.net/bugs/1581034

This moves failed queue handling out of the namespace removal path and
into the reset failure path, fixing a hanging condition if the controller
fails or the link goes down during del_gendisk. Previously the driver had to see
the controller as degraded prior to calling del_gendisk to setup the
queues to fail. But, if the controller happened to fail after this,
there was no task to end outstanding requests.

On failure, all namespace states are set to dead. This causes revalidation
to set the capacity to 0, and ends all new requests with error status.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
(backported from commit 69d9a99c258eb1d6478fd9608a2070890797eed7)
Signed-off-by: Tim Gardner <tim.gardner@canonical.com>

Conflicts:
	drivers/nvme/host/pci.c
---
 drivers/nvme/host/core.c | 53 ++++++++++++++++++++++++++++++++----------------
 drivers/nvme/host/nvme.h |  2 ++
 drivers/nvme/host/pci.c  | 33 ++++++++++++++++++++++--------
 3 files changed, 63 insertions(+), 25 deletions(-)

Comments

Chris J Arges May 26, 2016, 1:05 p.m. UTC | #1
On Wed, May 25, 2016 at 08:47:32AM -0600, Tim Gardner wrote:
> From: Keith Busch <keith.busch@intel.com>
> 
> BugLink: http://bugs.launchpad.net/bugs/1581034
> 
> This moves failed queue handling out of the namespace removal path and
> into the reset failure path, fixing a hanging condition if the controller
> fails or link down during del_gendisk. Previously the driver had to see
> the controller as degraded prior to calling del_gendisk to setup the
> queues to fail. But, if the controller happened to fail after this,
> there was no task to end outstanding requests.
> 
> On failure, all namespace states are set to dead. This has capacity
> revalidate to 0, and ends all new requests with error status.
> 
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
> Signed-off-by: Jens Axboe <axboe@fb.com>
> (back ported from commit 69d9a99c258eb1d6478fd9608a2070890797eed7)
> Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
> 
> Conflicts:
> 	drivers/nvme/host/pci.c
> ---
>  drivers/nvme/host/core.c | 53 ++++++++++++++++++++++++++++++++----------------
>  drivers/nvme/host/nvme.h |  2 ++
>  drivers/nvme/host/pci.c  | 33 ++++++++++++++++++++++--------
>  3 files changed, 63 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 8a21897a..a87320c 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -556,6 +556,10 @@ static int nvme_revalidate_disk(struct gendisk *disk)
>  	u16 old_ms;
>  	unsigned short bs;
>  
> +	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
> +		set_capacity(disk, 0);
> +		return -ENODEV;
> +	}
>  	if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
>  		dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
>  				__func__, ns->ctrl->instance, ns->ns_id);
> @@ -1180,32 +1184,15 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>  
>  static void nvme_ns_remove(struct nvme_ns *ns)
>  {
> -	bool kill;
> -
>  	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
>  		return;
>  
> -	kill = nvme_io_incapable(ns->ctrl) &&
> -			!blk_queue_dying(ns->queue);
> -	if (kill) {
> -		blk_set_queue_dying(ns->queue);
> -
> -		/*
> -		 * The controller was shutdown first if we got here through
> -		 * device removal. The shutdown may requeue outstanding
> -		 * requests. These need to be aborted immediately so
> -		 * del_gendisk doesn't block indefinitely for their completion.
> -		 */
> -		blk_mq_abort_requeue_list(ns->queue);
> -	}
>  	if (ns->disk->flags & GENHD_FL_UP) {
>  		if (blk_get_integrity(ns->disk))
>  			blk_integrity_unregister(ns->disk);
>  		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
>  					&nvme_ns_attr_group);
>  		del_gendisk(ns->disk);
> -	}
> -	if (kill || !blk_queue_dying(ns->queue)) {
>  		blk_mq_abort_requeue_list(ns->queue);
>  		blk_cleanup_queue(ns->queue);
>  	}
> @@ -1405,6 +1392,38 @@ out:
>  	return ret;
>  }
>  
> +/**
> + * nvme_kill_queues(): Ends all namespace queues
> + * @ctrl: the dead controller that needs to end
> + *
> + * Call this function when the driver determines it is unable to get the
> + * controller in a state capable of servicing IO.
> + */
> +void nvme_kill_queues(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_ns *ns;
> +
> +	mutex_lock(&ctrl->namespaces_mutex);
> +	list_for_each_entry(ns, &ctrl->namespaces, list) {
> +		if (!kref_get_unless_zero(&ns->kref))
> +			continue;
> +
> +		/*
> +		 * Revalidating a dead namespace sets capacity to 0. This will
> +		 * end buffered writers dirtying pages that can't be synced.
> +		 */
> +		if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags))
> +			revalidate_disk(ns->disk);
> +
> +		blk_set_queue_dying(ns->queue);
> +		blk_mq_abort_requeue_list(ns->queue);
> +		blk_mq_start_stopped_hw_queues(ns->queue, true);
> +
> +		nvme_put_ns(ns);
> +	}
> +	mutex_unlock(&ctrl->namespaces_mutex);
> +}
> +
>  void nvme_stop_queues(struct nvme_ctrl *ctrl)
>  {
>  	struct nvme_ns *ns;
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 784fba8..4d74625 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -115,6 +115,7 @@ struct nvme_ns {
>  	unsigned long flags;
>  
>  #define NVME_NS_REMOVING 0
> +#define NVME_NS_DEAD     1
>  
>  	u64 mode_select_num_blocks;
>  	u32 mode_select_block_len;
> @@ -244,6 +245,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
>  
>  void nvme_stop_queues(struct nvme_ctrl *ctrl);
>  void nvme_start_queues(struct nvme_ctrl *ctrl);
> +void nvme_kill_queues(struct nvme_ctrl *ctrl);
>  
>  struct request *nvme_alloc_request(struct request_queue *q,
>  		struct nvme_command *cmd, unsigned int flags);
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 2b2211e..2ecd712 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -690,6 +690,14 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	blk_mq_start_request(req);
>  
>  	spin_lock_irq(&nvmeq->q_lock);
> +	if (unlikely(nvmeq->cq_vector < 0)) {
> +		if (ns && !test_bit(NVME_NS_DEAD, &ns->flags))
> +			ret = BLK_MQ_RQ_QUEUE_BUSY;
> +		else
> +			ret = BLK_MQ_RQ_QUEUE_ERROR;
> +		spin_unlock_irq(&nvmeq->q_lock);
> +		goto out;
> +	}

Are there support patches that introduce this change that would be worth
backporting too?
--chris


>  	__nvme_submit_cmd(nvmeq, &cmnd);
>  	nvme_process_cq(nvmeq);
>  	spin_unlock_irq(&nvmeq->q_lock);
> @@ -1257,6 +1265,12 @@ static struct blk_mq_ops nvme_mq_ops = {
>  static void nvme_dev_remove_admin(struct nvme_dev *dev)
>  {
>  	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
> +		/*
> +		 * If the controller was reset during removal, it's possible
> +		 * user requests may be waiting on a stopped queue. Start the
> +		 * queue to flush these to completion.
> +		 */
> +		blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true);
>  		blk_cleanup_queue(dev->ctrl.admin_q);
>  		blk_mq_free_tag_set(&dev->admin_tagset);
>  	}
> @@ -1906,6 +1920,16 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
>  	kfree(dev);
>  }
>  
> +static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
> +{
> +	dev_warn(dev->dev, "Removing after probe failure\n");
> +
> +	kref_get(&dev->ctrl.kref);
> +	nvme_dev_disable(dev, false);
> +	if (!schedule_work(&dev->remove_work))
> +		nvme_put_ctrl(&dev->ctrl);
> +}
> +
>  static void nvme_reset_work(struct work_struct *work)
>  {
>  	struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
> @@ -1985,19 +2009,12 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
>  	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
>  	struct pci_dev *pdev = to_pci_dev(dev->dev);
>  
> +	nvme_kill_queues(&dev->ctrl);
>  	if (pci_get_drvdata(pdev))
>  		pci_stop_and_remove_bus_device_locked(pdev);
>  	nvme_put_ctrl(&dev->ctrl);
>  }
>  
> -static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
> -{
> -	dev_warn(dev->dev, "Removing after probe failure\n");
> -	kref_get(&dev->ctrl.kref);
> -	if (!schedule_work(&dev->remove_work))
> -		nvme_put_ctrl(&dev->ctrl);
> -}
> -
>  static int nvme_reset(struct nvme_dev *dev)
>  {
>  	if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
> -- 
> 1.9.1
> 
> 
> -- 
> kernel-team mailing list
> kernel-team@lists.ubuntu.com
> https://lists.ubuntu.com/mailman/listinfo/kernel-team
diff mbox

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8a21897a..a87320c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -556,6 +556,10 @@  static int nvme_revalidate_disk(struct gendisk *disk)
 	u16 old_ms;
 	unsigned short bs;
 
+	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
+		set_capacity(disk, 0);
+		return -ENODEV;
+	}
 	if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
 		dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
 				__func__, ns->ctrl->instance, ns->ns_id);
@@ -1180,32 +1184,15 @@  static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 static void nvme_ns_remove(struct nvme_ns *ns)
 {
-	bool kill;
-
 	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
 		return;
 
-	kill = nvme_io_incapable(ns->ctrl) &&
-			!blk_queue_dying(ns->queue);
-	if (kill) {
-		blk_set_queue_dying(ns->queue);
-
-		/*
-		 * The controller was shutdown first if we got here through
-		 * device removal. The shutdown may requeue outstanding
-		 * requests. These need to be aborted immediately so
-		 * del_gendisk doesn't block indefinitely for their completion.
-		 */
-		blk_mq_abort_requeue_list(ns->queue);
-	}
 	if (ns->disk->flags & GENHD_FL_UP) {
 		if (blk_get_integrity(ns->disk))
 			blk_integrity_unregister(ns->disk);
 		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
 					&nvme_ns_attr_group);
 		del_gendisk(ns->disk);
-	}
-	if (kill || !blk_queue_dying(ns->queue)) {
 		blk_mq_abort_requeue_list(ns->queue);
 		blk_cleanup_queue(ns->queue);
 	}
@@ -1405,6 +1392,38 @@  out:
 	return ret;
 }
 
+/**
+ * nvme_kill_queues(): Ends all namespace queues
+ * @ctrl: the dead controller that needs to end
+ *
+ * Call this function when the driver determines it is unable to get the
+ * controller in a state capable of servicing IO.
+ */
+void nvme_kill_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	mutex_lock(&ctrl->namespaces_mutex);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		if (!kref_get_unless_zero(&ns->kref))
+			continue;
+
+		/*
+		 * Revalidating a dead namespace sets capacity to 0. This will
+		 * end buffered writers dirtying pages that can't be synced.
+		 */
+		if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+			revalidate_disk(ns->disk);
+
+		blk_set_queue_dying(ns->queue);
+		blk_mq_abort_requeue_list(ns->queue);
+		blk_mq_start_stopped_hw_queues(ns->queue, true);
+
+		nvme_put_ns(ns);
+	}
+	mutex_unlock(&ctrl->namespaces_mutex);
+}
+
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 784fba8..4d74625 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -115,6 +115,7 @@  struct nvme_ns {
 	unsigned long flags;
 
 #define NVME_NS_REMOVING 0
+#define NVME_NS_DEAD     1
 
 	u64 mode_select_num_blocks;
 	u32 mode_select_block_len;
@@ -244,6 +245,7 @@  void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
 
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
+void nvme_kill_queues(struct nvme_ctrl *ctrl);
 
 struct request *nvme_alloc_request(struct request_queue *q,
 		struct nvme_command *cmd, unsigned int flags);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2b2211e..2ecd712 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -690,6 +690,14 @@  static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	blk_mq_start_request(req);
 
 	spin_lock_irq(&nvmeq->q_lock);
+	if (unlikely(nvmeq->cq_vector < 0)) {
+		if (ns && !test_bit(NVME_NS_DEAD, &ns->flags))
+			ret = BLK_MQ_RQ_QUEUE_BUSY;
+		else
+			ret = BLK_MQ_RQ_QUEUE_ERROR;
+		spin_unlock_irq(&nvmeq->q_lock);
+		goto out;
+	}
 	__nvme_submit_cmd(nvmeq, &cmnd);
 	nvme_process_cq(nvmeq);
 	spin_unlock_irq(&nvmeq->q_lock);
@@ -1257,6 +1265,12 @@  static struct blk_mq_ops nvme_mq_ops = {
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
 {
 	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
+		/*
+		 * If the controller was reset during removal, it's possible
+		 * user requests may be waiting on a stopped queue. Start the
+		 * queue to flush these to completion.
+		 */
+		blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true);
 		blk_cleanup_queue(dev->ctrl.admin_q);
 		blk_mq_free_tag_set(&dev->admin_tagset);
 	}
@@ -1906,6 +1920,16 @@  static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 	kfree(dev);
 }
 
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
+{
+	dev_warn(dev->dev, "Removing after probe failure\n");
+
+	kref_get(&dev->ctrl.kref);
+	nvme_dev_disable(dev, false);
+	if (!schedule_work(&dev->remove_work))
+		nvme_put_ctrl(&dev->ctrl);
+}
+
 static void nvme_reset_work(struct work_struct *work)
 {
 	struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
@@ -1985,19 +2009,12 @@  static void nvme_remove_dead_ctrl_work(struct work_struct *work)
 	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
+	nvme_kill_queues(&dev->ctrl);
 	if (pci_get_drvdata(pdev))
 		pci_stop_and_remove_bus_device_locked(pdev);
 	nvme_put_ctrl(&dev->ctrl);
 }
 
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
-{
-	dev_warn(dev->dev, "Removing after probe failure\n");
-	kref_get(&dev->ctrl.kref);
-	if (!schedule_work(&dev->remove_work))
-		nvme_put_ctrl(&dev->ctrl);
-}
-
 static int nvme_reset(struct nvme_dev *dev)
 {
 	if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))