diff mbox series

[PATCHv2,4/4] nvme-pci: Use PCI to handle IRQ reduce and retry

Message ID 20190103225033.11249-5-keith.busch@intel.com
State Not Applicable
Delegated to: Bjorn Helgaas
Headers show
Series NVMe IRQ sets fixups | expand

Commit Message

Keith Busch Jan. 3, 2019, 10:50 p.m. UTC
Restore error handling for vector allocation back to the PCI core.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/nvme/host/pci.c | 77 ++++++++++++++-----------------------------------
 1 file changed, 21 insertions(+), 56 deletions(-)

Comments

Ming Lei Jan. 4, 2019, 2:41 a.m. UTC | #1
On Thu, Jan 03, 2019 at 03:50:33PM -0700, Keith Busch wrote:
> Restore error handling for vector allocation back to the PCI core.
> 
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> ---
>  drivers/nvme/host/pci.c | 77 ++++++++++++++-----------------------------------
>  1 file changed, 21 insertions(+), 56 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 1481bb6d9c42..f3ef09a8e8f9 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2059,37 +2059,43 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
>  	return ret;
>  }
>  
> -static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues)
> +static void nvme_calc_io_queues(struct irq_affinity *affd, unsigned int nvecs)
>  {
> +	struct nvme_dev *dev = affd->priv;
>  	unsigned int this_w_queues = write_queues;
>  
>  	/*
>  	 * Setup read/write queue split
>  	 */
> -	if (irq_queues == 1) {
> +	if (nvecs == 1) {

The above line can be 'nvecs <= 2', cause when nvecs is 2, one is for
admin queue, another can be for DEFAULT.

>  		dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
>  		dev->io_queues[HCTX_TYPE_READ] = 0;
> -		return;
> +		goto set_sets;
>  	}
>  
>  	/*
>  	 * If 'write_queues' is set, ensure it leaves room for at least
>  	 * one read queue
>  	 */
> -	if (this_w_queues >= irq_queues)
> -		this_w_queues = irq_queues - 1;
> +	if (this_w_queues >= nvecs - 1)
> +		this_w_queues = nvecs - 1;

If we want to leave room for one read queue, 'this_w_queues' should be
set as 'nvecs - 2' given nvecs covers admin queue.

>  
>  	/*
>  	 * If 'write_queues' is set to zero, reads and writes will share
>  	 * a queue set.
>  	 */
>  	if (!this_w_queues) {
> -		dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues;
> +		dev->io_queues[HCTX_TYPE_DEFAULT] = nvecs - 1;
>  		dev->io_queues[HCTX_TYPE_READ] = 0;
>  	} else {
>  		dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
> -		dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues;
> +		dev->io_queues[HCTX_TYPE_READ] = nvecs - this_w_queues - 1;
>  	}

In above change, looks you starts to consider admin queue vector, which
is obvious one issue in current code.

So I'd suggest to fix nvme_calc_io_queues() in one standalone patch,
just what I posted, given this patch doesn't do "Restore error
handling for vector allocation back to the PCI core" only.

http://lists.infradead.org/pipermail/linux-nvme/2018-December/021879.html

Thanks,
Ming
Christoph Hellwig Jan. 4, 2019, 6:19 p.m. UTC | #2
I can't say I am a huge fan of the complex callback.  If we just made
the number of read vs write queues a factor instead of invidual
scalar numbers we could just handle this in the irq code without
the callback, and the concept might actually be understandable by
mere humans..
Keith Busch Jan. 4, 2019, 6:33 p.m. UTC | #3
On Fri, Jan 04, 2019 at 07:19:38PM +0100, Christoph Hellwig wrote:
> I can't say I am a huge fan of the complex callback.  If we just made
> the number of read vs write queues a factor instead of invidual
> scalar numbers we could just handle this in the irq code without
> the callback, and the concept might actually be understandable by
> mere humans..

Okay, we could express this as a ratio. I'll explore that path a bit
more.
diff mbox series

Patch

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 1481bb6d9c42..f3ef09a8e8f9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2059,37 +2059,43 @@  static int nvme_setup_host_mem(struct nvme_dev *dev)
 	return ret;
 }
 
-static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues)
+static void nvme_calc_io_queues(struct irq_affinity *affd, unsigned int nvecs)
 {
+	struct nvme_dev *dev = affd->priv;
 	unsigned int this_w_queues = write_queues;
 
 	/*
 	 * Setup read/write queue split
 	 */
-	if (irq_queues == 1) {
+	if (nvecs == 1) {
 		dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
 		dev->io_queues[HCTX_TYPE_READ] = 0;
-		return;
+		goto set_sets;
 	}
 
 	/*
 	 * If 'write_queues' is set, ensure it leaves room for at least
 	 * one read queue
 	 */
-	if (this_w_queues >= irq_queues)
-		this_w_queues = irq_queues - 1;
+	if (this_w_queues >= nvecs - 1)
+		this_w_queues = nvecs - 1;
 
 	/*
 	 * If 'write_queues' is set to zero, reads and writes will share
 	 * a queue set.
 	 */
 	if (!this_w_queues) {
-		dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues;
+		dev->io_queues[HCTX_TYPE_DEFAULT] = nvecs - 1;
 		dev->io_queues[HCTX_TYPE_READ] = 0;
 	} else {
 		dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
-		dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues;
+		dev->io_queues[HCTX_TYPE_READ] = nvecs - this_w_queues - 1;
 	}
+set_sets:
+	affd->sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
+	affd->sets[1] = dev->io_queues[HCTX_TYPE_READ];
+	if (!affd->sets[1])
+		affd->nr_sets = 1;
 }
 
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
@@ -2100,9 +2106,10 @@  static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 		.pre_vectors = 1,
 		.nr_sets = ARRAY_SIZE(irq_sets),
 		.sets = irq_sets,
+		.recalc_sets = nvme_calc_io_queues,
+		.priv = dev,
 	};
-	int result = 0;
-	unsigned int irq_queues, this_p_queues;
+	unsigned int nvecs, this_p_queues;
 
 	/*
 	 * Poll queues don't need interrupts, but we need at least one IO
@@ -2111,56 +2118,14 @@  static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 	this_p_queues = poll_queues;
 	if (this_p_queues >= nr_io_queues) {
 		this_p_queues = nr_io_queues - 1;
-		irq_queues = 1;
+		nvecs = 2;
 	} else {
-		irq_queues = nr_io_queues - this_p_queues;
+		nvecs = nr_io_queues - this_p_queues + 1;
 	}
 	dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
-
-	/*
-	 * For irq sets, we have to ask for minvec == maxvec. This passes
-	 * any reduction back to us, so we can adjust our queue counts and
-	 * IRQ vector needs.
-	 */
-	do {
-		nvme_calc_io_queues(dev, irq_queues);
-		irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
-		irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
-		if (!irq_sets[1])
-			affd.nr_sets = 1;
-
-		/*
-		 * If we got a failure and we're down to asking for just
-		 * 1 + 1 queues, just ask for a single vector. We'll share
-		 * that between the single IO queue and the admin queue.
-		 */
-		if (result >= 0 && irq_queues > 1)
-			irq_queues = irq_sets[0] + irq_sets[1] + 1;
-
-		result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
-				irq_queues,
-				PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
-
-		/*
-		 * Need to reduce our vec counts. If we get ENOSPC, the
-		 * platform should support mulitple vecs, we just need
-		 * to decrease our ask. If we get EINVAL, the platform
-		 * likely does not. Back down to ask for just one vector.
-		 */
-		if (result == -ENOSPC) {
-			irq_queues--;
-			if (!irq_queues)
-				return result;
-			continue;
-		} else if (result == -EINVAL) {
-			irq_queues = 1;
-			continue;
-		} else if (result <= 0)
-			return -EIO;
-		break;
-	} while (1);
-
-	return result;
+	nvme_calc_io_queues(&affd, nvecs);
+	return pci_alloc_irq_vectors_affinity(pdev, affd.pre_vectors, nvecs,
+			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
 }
 
 static int nvme_setup_io_queues(struct nvme_dev *dev)