[PATCHv2,2/4] nvme-pci: Distribute io queue types after creation

Message ID 20190103225033.11249-3-keith.busch@intel.com
State Not Applicable
Delegated to: Bjorn Helgaas
Series NVMe IRQ sets fixups

Commit Message

Keith Busch Jan. 3, 2019, 10:50 p.m. UTC
The dev->io_queues types were set based on the results of the nvme set
feature "number of queues" and the IRQ allocation. This result does not
mean we're going to successfully allocate and create those IO queues,
though. A failure there will cause blk-mq to have NULL hctx's because the
map's nr_hw_queues accounts for more queues than were actually created.

Adjust the io_queue types after we've created them when we have fewer
than originally desired.

Fixes: 3b6592f70ad7b ("nvme: utilize two queue maps, one for reads and one for writes")
Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/nvme/host/pci.c | 46 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 6 deletions(-)
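
For reference, a minimal sketch of how the dev->io_queues[] counts feed the
blk-mq queue map (loosely modeled on the v5.0-era nvme_pci_map_queues();
simplified, not the exact driver code). Counts that overstate what was
actually created leave blk-mq with maps covering queues that don't exist,
which is where the NULL hctx's come from:

/*
 * Simplified sketch: each blk-mq map type takes its queue count
 * straight from dev->io_queues[], so overstated counts produce
 * hctx's with no NVMe queue behind them.
 */
static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_dev *dev = set->driver_data;
	int i, qoff;

	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
		struct blk_mq_queue_map *map = &set->map[i];

		map->nr_queues = dev->io_queues[i]; /* the counts this patch fixes up */
		map->queue_offset = qoff;
		qoff += map->nr_queues;
		blk_mq_map_queues(map);	/* simplified: IRQ-driven maps really
					   use the PCI affinity info */
	}
	return 0;
}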

Comments

Ming Lei Jan. 4, 2019, 2:31 a.m. UTC | #1
On Thu, Jan 03, 2019 at 03:50:31PM -0700, Keith Busch wrote:
> The dev->io_queues types were set based on the results of the nvme set
> feature "number of queues" and the IRQ allocation. This result does not
> mean we're going to successfully allocate and create those IO queues,
> though. A failure there will cause blk-mq to have NULL hctx's because the
> map's nr_hw_queues accounts for more queues than were actually created.
> 
> Adjust the io_queue types after we've created them when we have fewer
> than originally desired.
> 
> Fixes: 3b6592f70ad7b ("nvme: utilize two queue maps, one for reads and one for writes")
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> ---
> [ full diff snipped ]
> 

This approach should be better, given that it covers both IRQ allocation
failure and queue creation/initialization failure.

Reviewed-by: Ming Lei <ming.lei@redhat.com>

Thanks,
Ming
Ming Lei Jan. 4, 2019, 7:21 a.m. UTC | #2
On Fri, Jan 04, 2019 at 10:31:21AM +0800, Ming Lei wrote:
> On Thu, Jan 03, 2019 at 03:50:31PM -0700, Keith Busch wrote:
> > [ full patch snipped ]
> 
> This approach should be better, given that it covers both IRQ allocation
> failure and queue creation/initialization failure.
> 
> Reviewed-by: Ming Lei <ming.lei@redhat.com>

Thinking about the patch further: after pci_alloc_irq_vectors_affinity()
returns, the number of non-polled queues can't be changed at will,
because we have to make sure all CPUs are spread across each queue type,
and that mapping has already been fixed by
pci_alloc_irq_vectors_affinity().

So it looks like the approach in this patch may be wrong.

Thanks,
Ming
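
For context, this is roughly the allocation being referred to: with
PCI_IRQ_AFFINITY the PCI core computes the per-CPU spread at allocation
time from the supplied vector sets, so the per-set counts are baked in from
then on. A minimal sketch following the v5.0-era struct irq_affinity layout
(simplified from the driver's nvme_setup_irqs(); variable names illustrative):

int irq_sets[2] = { nr_write_queues, nr_read_queues };	/* illustrative names */
struct irq_affinity affd = {
	.pre_vectors	= 1,		/* admin queue vector is not spread */
	.nr_sets	= 2,
	.sets		= irq_sets,
};

/*
 * The core spreads all CPUs across the sets right here.  With sets in
 * use, min and max vector counts must match, so any reduction (and
 * hence any change to the spread) is the caller's job: free and retry.
 */
result = pci_alloc_irq_vectors_affinity(pdev,
		irq_sets[0] + irq_sets[1] + 1, irq_sets[0] + irq_sets[1] + 1,
		PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);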
Keith Busch Jan. 4, 2019, 3:53 p.m. UTC | #3
On Fri, Jan 04, 2019 at 03:21:07PM +0800, Ming Lei wrote:
> Thinking about the patch further: after pci_alloc_irq_vectors_affinity()
> returns, the number of non-polled queues can't be changed at will,
> because we have to make sure all CPUs are spread across each queue type,
> and that mapping has already been fixed by
> pci_alloc_irq_vectors_affinity().
> 
> So it looks like the approach in this patch may be wrong.

That's a bit of a problem, and not a new one. We always had to allocate
vectors before creating IRQ-driven CQs, but the vector affinity is
created before we know if the queue-pair can be created. Should the
queue creation fail, there may be CPUs that don't have a queue.

Does this mean the PCI MSI API is wrong? It seems like we'd need to
initially allocate vectors without PCI_IRQ_AFFINITY, then have the
kernel set affinity only after completing the queue-pair setup.
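
As an illustration of that idea only (a hypothetical sketch:
cpumask_for_queue() is an invented helper, and a plain affinity hint is not
equivalent to the managed affinity PCI_IRQ_AFFINITY sets up, which is part
of why this isn't a drop-in change):

/* Hypothetical "allocate first, spread later" flow. */
nr_vecs = pci_alloc_irq_vectors(pdev, 1, nr_io_queues + 1, PCI_IRQ_ALL_TYPES);

/* ... create the IO queue pairs; some may fail ... */

for (i = 0; i < nr_created; i++)
	irq_set_affinity_hint(pci_irq_vector(pdev, i + 1),
			      cpumask_for_queue(i));	/* invented helper */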
Christoph Hellwig Jan. 4, 2019, 6:17 p.m. UTC | #4
On Fri, Jan 04, 2019 at 08:53:24AM -0700, Keith Busch wrote:
> On Fri, Jan 04, 2019 at 03:21:07PM +0800, Ming Lei wrote:
> > [ snipped ]
> 
> That's a bit of a problem, and not a new one. We always had to allocate
> vectors before creating IRQ-driven CQs, but the vector affinity is
> created before we know if the queue-pair can be created. Should the
> queue creation fail, there may be CPUs that don't have a queue.
> 
> Does this mean the PCI MSI API is wrong? It seems like we'd need to
> initially allocate vectors without PCI_IRQ_AFFINITY, then have the
> kernel set affinity only after completing the queue-pair setup.

We can't just easily do that, as we want to allocate the memory for
the descriptors on the correct node.  But we can just free the
vectors and try again if we have to.
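
A sketch of that retry shape (illustrative only: helper semantics are
simplified, and the reduce-and-retry loop that eventually went upstream
differs in detail):

/*
 * Sketch: if queue creation comes up short, free the vectors and
 * re-spread affinity for the count that actually worked.
 */
do {
	result = nvme_setup_irqs(dev, nr_io_queues);	/* allocates + spreads */
	if (result <= 0)
		return -EIO;
	if (nvme_create_io_queues(dev) == 0 &&
	    dev->online_queues - 1 == dev->max_qid)
		break;			/* every requested queue was created */
	pci_free_irq_vectors(to_pci_dev(dev->dev));
	nr_io_queues = dev->online_queues - 1;	/* retry with what worked */
} while (nr_io_queues > 0);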
Keith Busch Jan. 4, 2019, 6:35 p.m. UTC | #5
On Fri, Jan 04, 2019 at 07:17:26PM +0100, Christoph Hellwig wrote:
> On Fri, Jan 04, 2019 at 08:53:24AM -0700, Keith Busch wrote:
> > [ snipped ]
> 
> We can't just easily do that, as we want to allocate the memory for
> the descriptors on the correct node.  But we can just free the
> vectors and try again if we have to.

I've come to the same realization that switching modes after allocation
can't be easily accommodated. Teardown and retry with a reduced queue
count looks like the easiest solution.
Ming Lei Jan. 6, 2019, 2:56 a.m. UTC | #6
On Fri, Jan 04, 2019 at 08:53:24AM -0700, Keith Busch wrote:
> On Fri, Jan 04, 2019 at 03:21:07PM +0800, Ming Lei wrote:
> > [ snipped ]
> 
> That's a bit of a problem, and not a new one. We always had to allocate
> vectors before creating IRQ-driven CQs, but the vector affinity is
> created before we know if the queue-pair can be created. Should the
> queue creation fail, there may be CPUs that don't have a queue.
> 
> Does this mean the PCI MSI API is wrong? It seems like we'd need to
> initially allocate vectors without PCI_IRQ_AFFINITY, then have the
> kernel set affinity only after completing the queue-pair setup.

I think this kind of two-stage API style is cleaner and less
error-prone:

- pci_alloc_irq_vectors() is only for allocating IRQ vectors
- pci_set_irq_vectors_affinity() is for spreading affinity at will


Thanks,
Ming
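
Spelled out as prototypes, the suggestion might look like the following
(purely hypothetical: pci_alloc_irq_vectors() exists today, the second
function does not):

/* Stage 1 (exists): allocate the vectors, no affinity spreading. */
int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
			  unsigned int max_vecs, unsigned int flags);

/* Stage 2 (hypothetical): spread CPUs across the already-allocated
 * vectors once the driver knows how many queues really exist. */
int pci_set_irq_vectors_affinity(struct pci_dev *dev,
				 const struct irq_affinity *affd);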

Patch

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 98332d0a80f0..1481bb6d9c42 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1733,6 +1733,30 @@  static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 	return result;
 }
 
+static void nvme_distribute_queues(struct nvme_dev *dev, unsigned int io_queues)
+{
+	unsigned int irq_queues, this_p_queues = dev->io_queues[HCTX_TYPE_POLL],
+		     this_w_queues = dev->io_queues[HCTX_TYPE_DEFAULT];
+
+	if (!io_queues) {
+		dev->io_queues[HCTX_TYPE_POLL] = 0;
+		dev->io_queues[HCTX_TYPE_DEFAULT] = 0;
+		dev->io_queues[HCTX_TYPE_READ] = 0;
+		return;
+	}
+
+	if (this_p_queues >= io_queues)
+		this_p_queues = io_queues - 1;
+	irq_queues = io_queues - this_p_queues;
+
+	if (this_w_queues > irq_queues)
+		this_w_queues = irq_queues;
+
+	dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
+	dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
+	dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues;
+}
+
 static int nvme_create_io_queues(struct nvme_dev *dev)
 {
 	unsigned i, max, rw_queues;
@@ -1761,6 +1785,13 @@  static int nvme_create_io_queues(struct nvme_dev *dev)
 			break;
 	}
 
+	/*
+	 * If we've created less than expected io queues, redistribute the
+	 * dev->io_queues[] types accordingly.
+	 */
+	if (dev->online_queues - 1 != dev->max_qid)
+		nvme_distribute_queues(dev, dev->online_queues - 1);
+
 	/*
 	 * Ignore failing Create SQ/CQ commands, we can continue with less
 	 * than the desired amount of queues, and even a controller without
@@ -2185,11 +2216,6 @@  static int nvme_setup_io_queues(struct nvme_dev *dev)
 	result = max(result - 1, 1);
 	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
 
-	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
-					dev->io_queues[HCTX_TYPE_DEFAULT],
-					dev->io_queues[HCTX_TYPE_READ],
-					dev->io_queues[HCTX_TYPE_POLL]);
-
 	/*
 	 * Should investigate if there's a performance win from allocating
 	 * more queues than interrupt vectors; it might allow the submission
@@ -2203,7 +2229,15 @@  static int nvme_setup_io_queues(struct nvme_dev *dev)
 		return result;
 	}
 	set_bit(NVMEQ_ENABLED, &adminq->flags);
-	return nvme_create_io_queues(dev);
+	result = nvme_create_io_queues(dev);
+
+	if (!result)
+		dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
+					dev->io_queues[HCTX_TYPE_DEFAULT],
+					dev->io_queues[HCTX_TYPE_READ],
+					dev->io_queues[HCTX_TYPE_POLL]);
+	return result;
+
 }
 
 static void nvme_del_queue_end(struct request *req, blk_status_t error)