diff mbox series

[PATCHv2] PCI: vmd: Use affinity to chain child device interrupts

Message ID 20180206202225.1124-1-keith.busch@intel.com
State Not Applicable
Delegated to: Lorenzo Pieralisi
Headers show
Series [PATCHv2] PCI: vmd: Use affinity to chain child device interrupts | expand

Commit Message

Keith Busch Feb. 6, 2018, 8:22 p.m. UTC
Performance for devices in VMD domains suffer in NUMA environments if
we're not respecting the desired IRQ CPU affinity. This patch fixes
that by creating managed affinity irq vectors for the VMD device, and
then drivers registering their chained interrupts will be assigned the
h/w irq that most closely matches its desired IRQ affinity. A tie is
awarded to the lesser used vector.

Note, this only works for drivers that allocate their vectors with
PCI_IRQ_AFFINITY. All other drivers will be assigned the least used
vector without consideration for affinity.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Acked-by: Jon Derrick <jonathan.derrick@intel.com>
---
v1->v2:

  Added Jon's 'ack'.

  Update changelog subject.

 drivers/pci/host/vmd.c | 80 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 65 insertions(+), 15 deletions(-)

Comments

Lorenzo Pieralisi March 13, 2018, 5:10 p.m. UTC | #1
Hi Keith,

On Tue, Feb 06, 2018 at 01:22:25PM -0700, Keith Busch wrote:
> Performance for devices in VMD domains suffer in NUMA environments if
> we're not respecting the desired IRQ CPU affinity. This patch fixes
> that by creating managed affinity irq vectors for the VMD device, and
> then drivers registering their chained interrupts will be assigned the
> h/w irq that most closely matches its desired IRQ affinity. A tie is
> awarded to the lesser used vector.
> 
> Note, this only works for drivers that allocate their vectors with
> PCI_IRQ_AFFINITY. All other drivers will be assigned the least used
> vector without consideration for affinity.
> 
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> Acked-by: Jon Derrick <jonathan.derrick@intel.com>
> ---
> v1->v2:
> 
>   Added Jon's 'ack'.
> 
>   Update changelog subject.
> 
>  drivers/pci/host/vmd.c | 80 ++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 65 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/pci/host/vmd.c b/drivers/pci/host/vmd.c
> index 930a8fa08bd6..ac84676e79a4 100644
> --- a/drivers/pci/host/vmd.c
> +++ b/drivers/pci/host/vmd.c

[...]

> @@ -233,9 +266,11 @@ static int vmd_msi_prepare(struct irq_domain *domain, struct device *dev,
>  	struct pci_dev *pdev = to_pci_dev(dev);
>  	struct vmd_dev *vmd = vmd_from_bus(pdev->bus);
>  
> -	if (nvec > vmd->msix_count)
> +	if (nvec > vmd->msix_count) {
> +		if (vmd->msix_count > 1)
> +			return vmd->msix_count - 1;
>  		return vmd->msix_count;

I am about to apply this patch but I do not understand what's this hunk
is there for, to me vmd_msi_prepare() should just return an error in
this code path unless I am getting this wrong.

Thanks,
Lorenzo

> -
> +	}
>  	memset(arg, 0, sizeof(*arg));
>  	return 0;
>  }
> @@ -663,6 +698,14 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
>  	struct vmd_dev *vmd;
>  	int i, err;
>  
> +	/*
> +	 * The first vector is reserved for special use, so start affinity at
> +	 * the second vector.
> +	 */
> +	struct irq_affinity affd = {
> +		.pre_vectors = 1,
> +	};
> +
>  	if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20))
>  		return -ENOMEM;
>  
> @@ -688,8 +731,15 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
>  	if (vmd->msix_count < 0)
>  		return -ENODEV;
>  
> -	vmd->msix_count = pci_alloc_irq_vectors(dev, 1, vmd->msix_count,
> -					PCI_IRQ_MSIX);
> +	/*
> +	 * Reserve remaining vectors that IRQ affinity won't be able to assign.
> +	 */
> +	if ((vmd->msix_count - 1) > cpumask_weight(cpu_present_mask))
> +		affd.post_vectors = vmd->msix_count -
> +					cpumask_weight(cpu_present_mask) - 1;
> +
> +	vmd->msix_count = pci_alloc_irq_vectors_affinity(dev, 1, vmd->msix_count,
> +					PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, &affd);
>  	if (vmd->msix_count < 0)
>  		return vmd->msix_count;
>  
> -- 
> 2.14.3
>
Keith Busch March 13, 2018, 8:08 p.m. UTC | #2
On Tue, Mar 13, 2018 at 05:10:39PM +0000, Lorenzo Pieralisi wrote:
> On Tue, Feb 06, 2018 at 01:22:25PM -0700, Keith Busch wrote:
> > @@ -233,9 +266,11 @@ static int vmd_msi_prepare(struct irq_domain *domain, struct device *dev,
> >  	struct pci_dev *pdev = to_pci_dev(dev);
> >  	struct vmd_dev *vmd = vmd_from_bus(pdev->bus);
> >  
> > -	if (nvec > vmd->msix_count)
> > +	if (nvec > vmd->msix_count) {
> > +		if (vmd->msix_count > 1)
> > +			return vmd->msix_count - 1;
> >  		return vmd->msix_count;
> 
> I am about to apply this patch but I do not understand what's this hunk
> is there for, to me vmd_msi_prepare() should just return an error in
> this code path unless I am getting this wrong.

Hi Lorenzo,

It's not really an error if a driver requests more vectors than
can be allocated. The return here ultimately propogates back to
__pci_enable_msix, which returns 0 for sucess, < 0 for error, and >
0 if the requested count was too high.

The change above is fixing an off-by-one. It is really a bug fix on its
own, but it wasn't really harmful without the affinity awareness this
patch adds, where not having it starts to negatively affect performance.

Thanks,
Keith
Keith Busch March 13, 2018, 8:18 p.m. UTC | #3
On Tue, Mar 13, 2018 at 02:08:40PM -0600, Keith Busch wrote:
> On Tue, Mar 13, 2018 at 05:10:39PM +0000, Lorenzo Pieralisi wrote:
> > On Tue, Feb 06, 2018 at 01:22:25PM -0700, Keith Busch wrote:
> > > @@ -233,9 +266,11 @@ static int vmd_msi_prepare(struct irq_domain *domain, struct device *dev,
> > >  	struct pci_dev *pdev = to_pci_dev(dev);
> > >  	struct vmd_dev *vmd = vmd_from_bus(pdev->bus);
> > >  
> > > -	if (nvec > vmd->msix_count)
> > > +	if (nvec > vmd->msix_count) {
> > > +		if (vmd->msix_count > 1)
> > > +			return vmd->msix_count - 1;
> > >  		return vmd->msix_count;
> > 
> > I am about to apply this patch but I do not understand what's this hunk
> > is there for, to me vmd_msi_prepare() should just return an error in
> > this code path unless I am getting this wrong.
> 
> The change above is fixing an off-by-one.

And since you brought my attention to it, I see the 'if' is off by one.
Let me prepare a v3, and I've one other minor optimization to add as
well.

Thanks,
Keith
Keith Busch March 13, 2018, 11:24 p.m. UTC | #4
On Tue, Mar 13, 2018 at 02:18:07PM -0600, Keith Busch wrote:
> 
> And since you brought my attention to it, I see the 'if' is off by one.
> Let me prepare a v3, and I've one other minor optimization to add as
> well.

Ugh, I may need to request withdrawing this patch entirely. This will
completely break if the VMD interrupt vector's SMP affinity is a subset
of the child driver's, and the CPUs in the intersection of those masks
are set offline after driver initialization.

That's looking like a pretty tricky problem to fix, all for (somewhat)
obscure kernel feature... I can't see how without a new hot cpu notifier.

Let me request a pause on this one and I'll get back ASAP.
Lorenzo Pieralisi March 14, 2018, 11:05 a.m. UTC | #5
On Tue, Mar 13, 2018 at 05:24:40PM -0600, Keith Busch wrote:
> On Tue, Mar 13, 2018 at 02:18:07PM -0600, Keith Busch wrote:
> > 
> > And since you brought my attention to it, I see the 'if' is off by one.
> > Let me prepare a v3, and I've one other minor optimization to add as
> > well.
> 
> Ugh, I may need to request withdrawing this patch entirely. This will
> completely break if the VMD interrupt vector's SMP affinity is a subset
> of the child driver's, and the CPUs in the intersection of those masks
> are set offline after driver initialization.
> 
> That's looking like a pretty tricky problem to fix, all for (somewhat)
> obscure kernel feature... I can't see how without a new hot cpu notifier.
> 
> Let me request a pause on this one and I'll get back ASAP.

OK, I will mark it as "Not Applicable" and will wait for an updated
version then.

Thanks,
Lorenzo
diff mbox series

Patch

diff --git a/drivers/pci/host/vmd.c b/drivers/pci/host/vmd.c
index 930a8fa08bd6..ac84676e79a4 100644
--- a/drivers/pci/host/vmd.c
+++ b/drivers/pci/host/vmd.c
@@ -166,10 +166,6 @@  static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info,
 	return 0;
 }
 
-/*
- * XXX: We can be even smarter selecting the best IRQ once we solve the
- * affinity problem.
- */
 static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *desc)
 {
 	int i, best = 1;
@@ -188,24 +184,61 @@  static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *d
 	return &vmd->irqs[best];
 }
 
+static struct vmd_irq_list *vmd_next_affinity_irq(struct vmd_dev *vmd,  const struct cpumask *dest)
+{
+	struct vmd_irq_list *irq = NULL;
+	const struct cpumask *vmd_mask;
+	unsigned long flags, match;
+	int i, best = 0;
+
+	if (!dest || vmd->msix_count < 2)
+		return NULL;
+
+	raw_spin_lock_irqsave(&list_lock, flags);
+	for (i = 1; i < vmd->msix_count; i++) {
+		struct cpumask tmp;
+
+		vmd_mask = pci_irq_get_affinity(vmd->dev, i);
+		cpumask_and(&tmp, vmd_mask, dest);
+		match = cpumask_weight(&tmp);
+		if (match >= best) {
+			if (match == best && irq &&
+			    (vmd->irqs[i].count >= irq->count))
+				continue;
+			irq = &vmd->irqs[i];
+			best = match;
+		}
+	}
+	if (irq)
+		irq->count++;
+	raw_spin_unlock_irqrestore(&list_lock, flags);
+
+	return irq;
+}
+
 static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info,
 			unsigned int virq, irq_hw_number_t hwirq,
 			msi_alloc_info_t *arg)
 {
-	struct msi_desc *desc = arg->desc;
-	struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(desc)->bus);
+	struct msi_desc *msidesc = arg->desc;
+	struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(msidesc)->bus);
 	struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL);
-	unsigned int index, vector;
+	struct irq_desc *desc = irq_to_desc(virq);
+	unsigned int vector;
 
 	if (!vmdirq)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&vmdirq->node);
-	vmdirq->irq = vmd_next_irq(vmd, desc);
-	vmdirq->virq = virq;
-	index = index_from_irqs(vmd, vmdirq->irq);
-	vector = pci_irq_vector(vmd->dev, index);
 
+	if (desc && irqd_affinity_is_managed(&desc->irq_data))
+		vmdirq->irq = vmd_next_affinity_irq(vmd,
+					desc->irq_common_data.affinity);
+	if (vmdirq->irq == NULL)
+		vmdirq->irq = vmd_next_irq(vmd, msidesc);
+
+	vmdirq->virq = virq;
+	vector = pci_irq_vector(vmd->dev, index_from_irqs(vmd, vmdirq->irq));
 	irq_domain_set_info(domain, virq, vector, info->chip, vmdirq,
 			    handle_untracked_irq, vmd, NULL);
 	return 0;
@@ -233,9 +266,11 @@  static int vmd_msi_prepare(struct irq_domain *domain, struct device *dev,
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct vmd_dev *vmd = vmd_from_bus(pdev->bus);
 
-	if (nvec > vmd->msix_count)
+	if (nvec > vmd->msix_count) {
+		if (vmd->msix_count > 1)
+			return vmd->msix_count - 1;
 		return vmd->msix_count;
-
+	}
 	memset(arg, 0, sizeof(*arg));
 	return 0;
 }
@@ -663,6 +698,14 @@  static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	struct vmd_dev *vmd;
 	int i, err;
 
+	/*
+	 * The first vector is reserved for special use, so start affinity at
+	 * the second vector.
+	 */
+	struct irq_affinity affd = {
+		.pre_vectors = 1,
+	};
+
 	if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20))
 		return -ENOMEM;
 
@@ -688,8 +731,15 @@  static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	if (vmd->msix_count < 0)
 		return -ENODEV;
 
-	vmd->msix_count = pci_alloc_irq_vectors(dev, 1, vmd->msix_count,
-					PCI_IRQ_MSIX);
+	/*
+	 * Reserve remaining vectors that IRQ affinity won't be able to assign.
+	 */
+	if ((vmd->msix_count - 1) > cpumask_weight(cpu_present_mask))
+		affd.post_vectors = vmd->msix_count -
+					cpumask_weight(cpu_present_mask) - 1;
+
+	vmd->msix_count = pci_alloc_irq_vectors_affinity(dev, 1, vmd->msix_count,
+					PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, &affd);
 	if (vmd->msix_count < 0)
 		return vmd->msix_count;