[2/3] PCI: vmd: Align IRQ lists with child device vectors

Message ID 1573040408-3831-3-git-send-email-jonathan.derrick@intel.com
State Changes Requested
Delegated to: Lorenzo Pieralisi
Series PCI: vmd: Reducing tail latency by affining to the storage stack

Commit Message

Jon Derrick Nov. 6, 2019, 11:40 a.m. UTC
In order to provide better affinity alignment along the entire storage
stack, child device vectors can be assigned to VMD IRQ lists such that
the underlying VMD IRQ is affinitized the same as the child (NVMe)
device's vector.

This patch changes the assignment of child device vectors in IRQ lists
from a round-robin strategy to a matching-entry strategy. NVMe
affinities are deterministic in a VMD domain when these devices have the
same vector count, as limited by the VMD MSI domain or CPU count. When
one or more child devices are attached to a VMD domain, this patch
aligns the NVMe submission-side affinity with the VMD completion-side
affinity as completions pass through the VMD IRQ list.

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
---
 drivers/pci/controller/vmd.c | 57 ++++++++++++++++----------------------------
 1 file changed, 21 insertions(+), 36 deletions(-)
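
To make the strategy change concrete, here is a minimal standalone C
sketch contrasting the two assignment schemes. This is not the driver
code itself; the vector counts and function names are made up, and the
old least-used selection is approximated as round-robin:

#include <stdio.h>

#define VMD_VECTORS   33	/* hypothetical: list 0 plus 32 I/O lists */
#define CHILD_VECTORS 33	/* hypothetical child MSI-X entry count */

/* Old strategy: pick the least-used I/O list (1..N-1), which degenerates
 * to round-robin when entries are allocated one after another. */
static int old_strategy(int entry_nr)
{
	return 1 + entry_nr % (VMD_VECTORS - 1);
}

/* New strategy: child MSI-X entry N demuxes through VMD IRQ list N, so
 * the completion interrupt inherits the submitting queue's affinity. */
static int new_strategy(int entry_nr)
{
	return entry_nr >= VMD_VECTORS ? 0 : entry_nr;
}

int main(void)
{
	for (int e = 0; e < CHILD_VECTORS; e++)
		printf("entry %2d -> old list %2d, new list %2d\n",
		       e, old_strategy(e), new_strategy(e));
	return 0;
}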

Comments

Keith Busch Nov. 6, 2019, 6:06 p.m. UTC | #1
On Wed, Nov 06, 2019 at 04:40:07AM -0700, Jon Derrick wrote:
> In order to provide better affinity alignment along the entire storage
> stack, child device vectors can be assigned to VMD IRQ lists such that
> the underlying VMD IRQ is affinitized the same as the child (NVMe)
> device's vector.
> 
> This patch changes the assignment of child device vectors in IRQ lists
> from a round-robin strategy to a matching-entry strategy. NVMe
> affinities are deterministic in a VMD domain when these devices have the
> same vector count, as limited by the VMD MSI domain or CPU count. When
> one or more child devices are attached to a VMD domain, this patch
> aligns the NVMe submission-side affinity with the VMD completion-side
> affinity as completions pass through the VMD IRQ list.

This really only works if the child devices have the same IRQ count as
the VMD device. If the VMD device has more interrupts than the child
devices, this will end up overloading the lower VMD interrupts without
even using the higher ones.
Jon Derrick Nov. 6, 2019, 8:14 p.m. UTC | #2
On Thu, 2019-11-07 at 03:06 +0900, Keith Busch wrote:
> On Wed, Nov 06, 2019 at 04:40:07AM -0700, Jon Derrick wrote:
> > In order to provide better affinity alignment along the entire storage
> > stack, child device vectors can be assigned to VMD IRQ lists such that
> > the underlying VMD IRQ is affinitized the same as the child (NVMe)
> > device's vector.
> > 
> > This patch changes the assignment of child device vectors in IRQ lists
> > from a round-robin strategy to a matching-entry strategy. NVMe
> > affinities are deterministic in a VMD domain when these devices have the
> > same vector count, as limited by the VMD MSI domain or CPU count. When
> > one or more child devices are attached to a VMD domain, this patch
> > aligns the NVMe submission-side affinity with the VMD completion-side
> > affinity as completions pass through the VMD IRQ list.
> 
> This really only works if the child devices have the same IRQ count as
> the VMD device. If the VMD device has more interrupts than the child
> devices, this will end up overloading the lower VMD interrupts without
> even using the higher ones.

Correct. The child NVMe device would need to have at least as many
vectors as the 32 I/O vectors VMD offers. We could do something dynamic
to determine when to use matching-affinities vs round-robin, but as
this is a hotpluggable domain, it seems fragile to be changing
interrupts in such a way.

I haven't actually seen an NVMe device with fewer than 32 vectors, and
overloading VMD vectors seems to be the least of the performance
concerns with such a device. This configuration will result in
essentially the same issue we're facing today with poorly affined VMD
IRQ lists.

For the future VMD implementation offering 63 I/O vectors, yes, this
will be a concern, and all I can really suggest is to use drives with
more vectors until I can determine a good way to handle this.
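
Purely as a sketch of the dynamic selection mentioned in the reply
above (nothing like this is in the patch; pick_irq_list() and its
parameters are hypothetical), a hybrid could match entries 1:1 when the
child has enough vectors and fall back to spreading otherwise, avoiding
the overload Keith describes:

#include <stdio.h>

/*
 * Hypothetical hybrid: match entries when the child device has at least
 * as many vectors as VMD; otherwise spread across the I/O lists so the
 * lower lists are not overloaded while the higher ones sit idle.
 */
static int pick_irq_list(int entry_nr, int child_vectors, int vmd_vectors)
{
	if (child_vectors >= vmd_vectors)
		return entry_nr % vmd_vectors;		/* matching-entry */
	return 1 + entry_nr % (vmd_vectors - 1);	/* round-robin */
}

int main(void)
{
	/* Child with 16 vectors on a hypothetical 63-I/O-vector VMD. */
	for (int e = 0; e < 16; e++)
		printf("entry %2d -> list %2d\n", e, pick_irq_list(e, 16, 64));
	return 0;
}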

Patch

diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index ebe7ff6..7aca925 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -75,13 +75,10 @@  struct vmd_irq {
  * struct vmd_irq_list - list of driver requested IRQs mapping to a VMD vector
  * @irq_list:	the list of irq's the VMD one demuxes to.
  * @srcu:	SRCU struct for local synchronization.
- * @count:	number of child IRQs assigned to this vector; used to track
- *		sharing.
  */
 struct vmd_irq_list {
 	struct list_head	irq_list;
 	struct srcu_struct	srcu;
-	unsigned int		count;
 	unsigned int		index;
 };
 
@@ -184,37 +181,32 @@  static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info,
 	return 0;
 }
 
-/*
- * XXX: We can be even smarter selecting the best IRQ once we solve the
- * affinity problem.
- */
 static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *desc)
 {
-	int i, best = 1;
-	unsigned long flags;
-
-	if (vmd->msix_count == 1)
-		return vmd->irqs[0];
-
-	/*
-	 * White list for fast-interrupt handlers. All others will share the
-	 * "slow" interrupt vector.
-	 */
-	switch (msi_desc_to_pci_dev(desc)->class) {
-	case PCI_CLASS_STORAGE_EXPRESS:
-		break;
-	default:
-		return vmd->irqs[0];
+	int entry_nr = desc->msi_attrib.entry_nr;
+
+	if (vmd->msix_count == 1) {
+		entry_nr = 0;
+	} else {
+
+		/*
+		 * White list for fast-interrupt handlers. All others will
+		 * share the "slow" interrupt vector.
+		 */
+		switch (msi_desc_to_pci_dev(desc)->class) {
+		case PCI_CLASS_STORAGE_EXPRESS:
+			break;
+		default:
+			entry_nr = 0;
+		}
 	}
 
-	raw_spin_lock_irqsave(&list_lock, flags);
-	for (i = 1; i < vmd->msix_count; i++)
-		if (vmd->irqs[i]->count < vmd->irqs[best]->count)
-			best = i;
-	vmd->irqs[best]->count++;
-	raw_spin_unlock_irqrestore(&list_lock, flags);
+	if (entry_nr >= vmd->msix_count)
+		entry_nr = 0;
 
-	return vmd->irqs[best];
+	dev_dbg(desc->dev, "Entry %d using VMD IRQ list %d/%d\n",
+		desc->msi_attrib.entry_nr, entry_nr, vmd->msix_count - 1);
+	return vmd->irqs[entry_nr];
 }
 
 static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info,
@@ -243,15 +235,8 @@  static void vmd_msi_free(struct irq_domain *domain,
 			struct msi_domain_info *info, unsigned int virq)
 {
 	struct vmd_irq *vmdirq = irq_get_chip_data(virq);
-	unsigned long flags;
 
 	synchronize_srcu(&vmdirq->irq->srcu);
-
-	/* XXX: Potential optimization to rebalance */
-	raw_spin_lock_irqsave(&list_lock, flags);
-	vmdirq->irq->count--;
-	raw_spin_unlock_irqrestore(&list_lock, flags);
-
 	kfree(vmdirq);
 }
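
As a quick sanity check of the entry_nr clamp in vmd_next_irq() above:
valid indices into vmd->irqs[] run 0..msix_count-1, so the fallback test
must use '>=' rather than '>'. A tiny standalone C model (the vector
count is hypothetical):

#include <assert.h>
#include <stdio.h>

#define MSIX_COUNT 32	/* hypothetical vmd->msix_count */

/* Valid list indices are 0..MSIX_COUNT-1; anything at or beyond the
 * count must fall back to list 0, hence the '>=' comparison. */
static int clamp_entry(int entry_nr)
{
	if (entry_nr >= MSIX_COUNT)
		entry_nr = 0;
	return entry_nr;
}

int main(void)
{
	assert(clamp_entry(0) == 0);
	assert(clamp_entry(MSIX_COUNT - 1) == MSIX_COUNT - 1);
	assert(clamp_entry(MSIX_COUNT) == 0);	/* '>' would allow an overrun */
	printf("clamp OK\n");
	return 0;
}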