diff mbox series

[v2,8/8] powerpc/xive: Map one IPI interrupt per node

Message ID 20210303174857.1760393-9-clg@kaod.org
State Superseded
Headers show
Series powerpc/xive: Map one IPI interrupt per node | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (626a6c3d2e20da80aaa710104f34ea6037b28b33)
snowpatch_ozlabs/build-ppc64le success Build succeeded
snowpatch_ozlabs/build-ppc64be success Build succeeded
snowpatch_ozlabs/build-ppc64e success Build succeeded
snowpatch_ozlabs/build-pmac32 success Build succeeded
snowpatch_ozlabs/checkpatch success total: 0 errors, 0 warnings, 0 checks, 77 lines checked
snowpatch_ozlabs/needsstable success Patch has no Fixes tags

Commit Message

Cédric Le Goater March 3, 2021, 5:48 p.m. UTC
ipistorm [*] can be used to benchmark the raw interrupt rate of an
interrupt controller by measuring the number of IPIs a system can
sustain. When applied to the XIVE interrupt controller of POWER9 and
POWER10 systems, a significant drop of the interrupt rate can be
observed when crossing the second node boundary.

This is due to the fact that a single IPI interrupt is used for all
CPUs of the system. The structure is shared and the cache line updates
impact greatly the traffic between nodes and the overall IPI
performance.

As a workaround, the impact can be reduced by deactivating the IRQ
lockup detector ("noirqdebug") which does a lot of accounting in the
Linux IRQ descriptor structure and is responsible for most of the
performance penalty.

As a fix, this proposal allocates an IPI interrupt per node, to be
shared by all CPUs of that node. It solves the scaling issue, the IRQ
lockup detector still has an impact but the XIVE interrupt rate scales
linearly. It also improves the "noirqdebug" case as showed in the
tables below.

 * P9 DD2.2 - 2s * 64 threads

                                               "noirqdebug"
                        Mint/s                    Mint/s
 chips  cpus      IPI/sys   IPI/chip       IPI/chip    IPI/sys
 --------------------------------------------------------------
 1      0-15     4.984023   4.875405       4.996536   5.048892
        0-31    10.879164  10.544040      10.757632  11.037859
        0-47    15.345301  14.688764      14.926520  15.310053
        0-63    17.064907  17.066812      17.613416  17.874511
 2      0-79    11.768764  21.650749      22.689120  22.566508
        0-95    10.616812  26.878789      28.434703  28.320324
        0-111   10.151693  31.397803      31.771773  32.388122
        0-127    9.948502  33.139336      34.875716  35.224548

 * P10 DD1 - 4s (not homogeneous) 352 threads

                                               "noirqdebug"
                        Mint/s                    Mint/s
 chips  cpus      IPI/sys   IPI/chip       IPI/chip    IPI/sys
 --------------------------------------------------------------
 1      0-15     2.409402   2.364108       2.383303   2.395091
        0-31     6.028325   6.046075       6.089999   6.073750
        0-47     8.655178   8.644531       8.712830   8.724702
        0-63    11.629652  11.735953      12.088203  12.055979
        0-79    14.392321  14.729959      14.986701  14.973073
        0-95    12.604158  13.004034      17.528748  17.568095
 2      0-111    9.767753  13.719831      19.968606  20.024218
        0-127    6.744566  16.418854      22.898066  22.995110
        0-143    6.005699  19.174421      25.425622  25.417541
        0-159    5.649719  21.938836      27.952662  28.059603
        0-175    5.441410  24.109484      31.133915  31.127996
 3      0-191    5.318341  24.405322      33.999221  33.775354
        0-207    5.191382  26.449769      36.050161  35.867307
        0-223    5.102790  29.356943      39.544135  39.508169
        0-239    5.035295  31.933051      42.135075  42.071975
        0-255    4.969209  34.477367      44.655395  44.757074
 4      0-271    4.907652  35.887016      47.080545  47.318537
        0-287    4.839581  38.076137      50.464307  50.636219
        0-303    4.786031  40.881319      53.478684  53.310759
        0-319    4.743750  43.448424      56.388102  55.973969
        0-335    4.709936  45.623532      59.400930  58.926857
        0-351    4.681413  45.646151      62.035804  61.830057

[*] https://github.com/antonblanchard/ipistorm

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 arch/powerpc/sysdev/xive/xive-internal.h |  2 --
 arch/powerpc/sysdev/xive/common.c        | 39 ++++++++++++++++++------
 2 files changed, 30 insertions(+), 11 deletions(-)

Comments

Greg Kurz March 9, 2021, 1:23 p.m. UTC | #1
On Wed, 3 Mar 2021 18:48:57 +0100
Cédric Le Goater <clg@kaod.org> wrote:

> ipistorm [*] can be used to benchmark the raw interrupt rate of an
> interrupt controller by measuring the number of IPIs a system can
> sustain. When applied to the XIVE interrupt controller of POWER9 and
> POWER10 systems, a significant drop of the interrupt rate can be
> observed when crossing the second node boundary.
> 
> This is due to the fact that a single IPI interrupt is used for all
> CPUs of the system. The structure is shared and the cache line updates
> impact greatly the traffic between nodes and the overall IPI
> performance.
> 
> As a workaround, the impact can be reduced by deactivating the IRQ
> lockup detector ("noirqdebug") which does a lot of accounting in the
> Linux IRQ descriptor structure and is responsible for most of the
> performance penalty.
> 
> As a fix, this proposal allocates an IPI interrupt per node, to be
> shared by all CPUs of that node. It solves the scaling issue, the IRQ
> lockup detector still has an impact but the XIVE interrupt rate scales
> linearly. It also improves the "noirqdebug" case as showed in the
> tables below.
> 
>  * P9 DD2.2 - 2s * 64 threads
> 
>                                                "noirqdebug"
>                         Mint/s                    Mint/s
>  chips  cpus      IPI/sys   IPI/chip       IPI/chip    IPI/sys
>  --------------------------------------------------------------
>  1      0-15     4.984023   4.875405       4.996536   5.048892
>         0-31    10.879164  10.544040      10.757632  11.037859
>         0-47    15.345301  14.688764      14.926520  15.310053
>         0-63    17.064907  17.066812      17.613416  17.874511
>  2      0-79    11.768764  21.650749      22.689120  22.566508
>         0-95    10.616812  26.878789      28.434703  28.320324
>         0-111   10.151693  31.397803      31.771773  32.388122
>         0-127    9.948502  33.139336      34.875716  35.224548
> 
>  * P10 DD1 - 4s (not homogeneous) 352 threads
> 
>                                                "noirqdebug"
>                         Mint/s                    Mint/s
>  chips  cpus      IPI/sys   IPI/chip       IPI/chip    IPI/sys
>  --------------------------------------------------------------
>  1      0-15     2.409402   2.364108       2.383303   2.395091
>         0-31     6.028325   6.046075       6.089999   6.073750
>         0-47     8.655178   8.644531       8.712830   8.724702
>         0-63    11.629652  11.735953      12.088203  12.055979
>         0-79    14.392321  14.729959      14.986701  14.973073
>         0-95    12.604158  13.004034      17.528748  17.568095
>  2      0-111    9.767753  13.719831      19.968606  20.024218
>         0-127    6.744566  16.418854      22.898066  22.995110
>         0-143    6.005699  19.174421      25.425622  25.417541
>         0-159    5.649719  21.938836      27.952662  28.059603
>         0-175    5.441410  24.109484      31.133915  31.127996
>  3      0-191    5.318341  24.405322      33.999221  33.775354
>         0-207    5.191382  26.449769      36.050161  35.867307
>         0-223    5.102790  29.356943      39.544135  39.508169
>         0-239    5.035295  31.933051      42.135075  42.071975
>         0-255    4.969209  34.477367      44.655395  44.757074
>  4      0-271    4.907652  35.887016      47.080545  47.318537
>         0-287    4.839581  38.076137      50.464307  50.636219
>         0-303    4.786031  40.881319      53.478684  53.310759
>         0-319    4.743750  43.448424      56.388102  55.973969
>         0-335    4.709936  45.623532      59.400930  58.926857
>         0-351    4.681413  45.646151      62.035804  61.830057
> 
> [*] https://github.com/antonblanchard/ipistorm
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  arch/powerpc/sysdev/xive/xive-internal.h |  2 --
>  arch/powerpc/sysdev/xive/common.c        | 39 ++++++++++++++++++------
>  2 files changed, 30 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
> index 9cf57c722faa..b3a456fdd3a5 100644
> --- a/arch/powerpc/sysdev/xive/xive-internal.h
> +++ b/arch/powerpc/sysdev/xive/xive-internal.h
> @@ -5,8 +5,6 @@
>  #ifndef __XIVE_INTERNAL_H
>  #define __XIVE_INTERNAL_H
>  
> -#define XIVE_IPI_HW_IRQ		0 /* interrupt source # for IPIs */
> -
>  /*
>   * A "disabled" interrupt should never fire, to catch problems
>   * we set its logical number to this
> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
> index 8eefd152b947..c27f7bb0494b 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -65,8 +65,16 @@ static struct irq_domain *xive_irq_domain;
>  #ifdef CONFIG_SMP
>  static struct irq_domain *xive_ipi_irq_domain;
>  
> -/* The IPIs all use the same logical irq number */
> -static u32 xive_ipi_irq;
> +/* The IPIs use the same logical irq number when on the same chip */
> +static struct xive_ipi_desc {
> +	unsigned int irq;
> +	char name[8]; /* enough bytes to fit IPI-XXX */

So this assumes that the node number that node is <= 999 ? This
is certainly the case for now since CONFIG_NODES_SHIFT is 8
on ppc64 but starting with 10, you'd have truncated names.
What about deriving the size of name[] from CONFIG_NODES_SHIFT ?

Apart from that, LGTM. Probably not worth to respin just for
this.

I also could give a try in a KVM guest.

Topology passed to QEMU:

  -smp 8,maxcpus=8,cores=2,threads=2,sockets=2 \
  -numa node,nodeid=0,cpus=0-4 \
  -numa node,nodeid=1,cpus=4-7

Topology observed in guest with lstopo :

  Package L#0
    NUMANode L#0 (P#0 30GB)
    L1d L#0 (32KB) + L1i L#0 (32KB) + Core L#0
      PU L#0 (P#0)
      PU L#1 (P#1)
    L1d L#1 (32KB) + L1i L#1 (32KB) + Core L#1
      PU L#2 (P#2)
      PU L#3 (P#3)
  Package L#1
    NUMANode L#1 (P#1 32GB)
    L1d L#2 (32KB) + L1i L#2 (32KB) + Core L#2
      PU L#4 (P#4)
      PU L#5 (P#5)
    L1d L#3 (32KB) + L1i L#3 (32KB) + Core L#3
      PU L#6 (P#6)
      PU L#7 (P#7)

Interrupts in guest:

$ cat /proc/interrupts 
           CPU0       CPU1       CPU2       CPU3       CPU4       CPU5       CPU6       CPU7       
 16:       1023        871       1042        749          0          0          0          0  XIVE-IPI   0 Edge      IPI-0
 17:          0          0          0          0       2123       1019       1263       1288  XIVE-IPI   1 Edge      IPI-1

IPIs are mapped to the appropriate nodes, and the numbers indicate
that everything is working as expected.

Reviewed-and-tested-by: Greg Kurz <groug@kaod.org>

> +} *xive_ipis;
> +
> +static unsigned int xive_ipi_cpu_to_irq(unsigned int cpu)
> +{
> +	return xive_ipis[cpu_to_node(cpu)].irq;
> +}
>  #endif
>  
>  /* Xive state for each CPU */
> @@ -1106,25 +1114,36 @@ static const struct irq_domain_ops xive_ipi_irq_domain_ops = {
>  
>  static void __init xive_request_ipi(void)
>  {
> -	unsigned int virq;
> +	unsigned int node;
>  
> -	xive_ipi_irq_domain = irq_domain_add_linear(NULL, 1,
> +	xive_ipi_irq_domain = irq_domain_add_linear(NULL, nr_node_ids,
>  						    &xive_ipi_irq_domain_ops, NULL);
>  	if (WARN_ON(xive_ipi_irq_domain == NULL))
>  		return;
>  
> -	/* Initialize it */
> -	virq = irq_create_mapping(xive_ipi_irq_domain, XIVE_IPI_HW_IRQ);
> -	xive_ipi_irq = virq;
> +	xive_ipis = kcalloc(nr_node_ids, sizeof(*xive_ipis), GFP_KERNEL | __GFP_NOFAIL);
> +	for_each_node(node) {
> +		struct xive_ipi_desc *xid = &xive_ipis[node];
> +		irq_hw_number_t node_ipi_hwirq = node;
> +
> +		/*
> +		 * Map one IPI interrupt per node for all cpus of that node.
> +		 * Since the HW interrupt number doesn't have any meaning,
> +		 * simply use the node number.
> +		 */
> +		xid->irq = irq_create_mapping(xive_ipi_irq_domain, node_ipi_hwirq);
> +		snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
>  
> -	WARN_ON(request_irq(virq, xive_muxed_ipi_action,
> -			    IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
> +		WARN_ON(request_irq(xid->irq, xive_muxed_ipi_action,
> +				    IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL));
> +	}
>  }
>  
>  static int xive_setup_cpu_ipi(unsigned int cpu)
>  {
>  	struct xive_cpu *xc;
>  	int rc;
> +	unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
>  
>  	pr_debug("Setting up IPI for CPU %d\n", cpu);
>  
> @@ -1165,6 +1184,8 @@ static int xive_setup_cpu_ipi(unsigned int cpu)
>  
>  static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc)
>  {
> +	unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
> +
>  	/* Disable the IPI and free the IRQ data */
>  
>  	/* Already cleaned up ? */
Cédric Le Goater March 9, 2021, 3:52 p.m. UTC | #2
On 3/9/21 2:23 PM, Greg Kurz wrote:
> On Wed, 3 Mar 2021 18:48:57 +0100
> Cédric Le Goater <clg@kaod.org> wrote:
> 
>> ipistorm [*] can be used to benchmark the raw interrupt rate of an
>> interrupt controller by measuring the number of IPIs a system can
>> sustain. When applied to the XIVE interrupt controller of POWER9 and
>> POWER10 systems, a significant drop of the interrupt rate can be
>> observed when crossing the second node boundary.
>>
>> This is due to the fact that a single IPI interrupt is used for all
>> CPUs of the system. The structure is shared and the cache line updates
>> impact greatly the traffic between nodes and the overall IPI
>> performance.
>>
>> As a workaround, the impact can be reduced by deactivating the IRQ
>> lockup detector ("noirqdebug") which does a lot of accounting in the
>> Linux IRQ descriptor structure and is responsible for most of the
>> performance penalty.
>>
>> As a fix, this proposal allocates an IPI interrupt per node, to be
>> shared by all CPUs of that node. It solves the scaling issue, the IRQ
>> lockup detector still has an impact but the XIVE interrupt rate scales
>> linearly. It also improves the "noirqdebug" case as showed in the
>> tables below.
>>
>>  * P9 DD2.2 - 2s * 64 threads
>>
>>                                                "noirqdebug"
>>                         Mint/s                    Mint/s
>>  chips  cpus      IPI/sys   IPI/chip       IPI/chip    IPI/sys
>>  --------------------------------------------------------------
>>  1      0-15     4.984023   4.875405       4.996536   5.048892
>>         0-31    10.879164  10.544040      10.757632  11.037859
>>         0-47    15.345301  14.688764      14.926520  15.310053
>>         0-63    17.064907  17.066812      17.613416  17.874511
>>  2      0-79    11.768764  21.650749      22.689120  22.566508
>>         0-95    10.616812  26.878789      28.434703  28.320324
>>         0-111   10.151693  31.397803      31.771773  32.388122
>>         0-127    9.948502  33.139336      34.875716  35.224548
>>
>>  * P10 DD1 - 4s (not homogeneous) 352 threads
>>
>>                                                "noirqdebug"
>>                         Mint/s                    Mint/s
>>  chips  cpus      IPI/sys   IPI/chip       IPI/chip    IPI/sys
>>  --------------------------------------------------------------
>>  1      0-15     2.409402   2.364108       2.383303   2.395091
>>         0-31     6.028325   6.046075       6.089999   6.073750
>>         0-47     8.655178   8.644531       8.712830   8.724702
>>         0-63    11.629652  11.735953      12.088203  12.055979
>>         0-79    14.392321  14.729959      14.986701  14.973073
>>         0-95    12.604158  13.004034      17.528748  17.568095
>>  2      0-111    9.767753  13.719831      19.968606  20.024218
>>         0-127    6.744566  16.418854      22.898066  22.995110
>>         0-143    6.005699  19.174421      25.425622  25.417541
>>         0-159    5.649719  21.938836      27.952662  28.059603
>>         0-175    5.441410  24.109484      31.133915  31.127996
>>  3      0-191    5.318341  24.405322      33.999221  33.775354
>>         0-207    5.191382  26.449769      36.050161  35.867307
>>         0-223    5.102790  29.356943      39.544135  39.508169
>>         0-239    5.035295  31.933051      42.135075  42.071975
>>         0-255    4.969209  34.477367      44.655395  44.757074
>>  4      0-271    4.907652  35.887016      47.080545  47.318537
>>         0-287    4.839581  38.076137      50.464307  50.636219
>>         0-303    4.786031  40.881319      53.478684  53.310759
>>         0-319    4.743750  43.448424      56.388102  55.973969
>>         0-335    4.709936  45.623532      59.400930  58.926857
>>         0-351    4.681413  45.646151      62.035804  61.830057
>>
>> [*] https://github.com/antonblanchard/ipistorm
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  arch/powerpc/sysdev/xive/xive-internal.h |  2 --
>>  arch/powerpc/sysdev/xive/common.c        | 39 ++++++++++++++++++------
>>  2 files changed, 30 insertions(+), 11 deletions(-)
>>
>> diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
>> index 9cf57c722faa..b3a456fdd3a5 100644
>> --- a/arch/powerpc/sysdev/xive/xive-internal.h
>> +++ b/arch/powerpc/sysdev/xive/xive-internal.h
>> @@ -5,8 +5,6 @@
>>  #ifndef __XIVE_INTERNAL_H
>>  #define __XIVE_INTERNAL_H
>>  
>> -#define XIVE_IPI_HW_IRQ		0 /* interrupt source # for IPIs */
>> -
>>  /*
>>   * A "disabled" interrupt should never fire, to catch problems
>>   * we set its logical number to this
>> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
>> index 8eefd152b947..c27f7bb0494b 100644
>> --- a/arch/powerpc/sysdev/xive/common.c
>> +++ b/arch/powerpc/sysdev/xive/common.c
>> @@ -65,8 +65,16 @@ static struct irq_domain *xive_irq_domain;
>>  #ifdef CONFIG_SMP
>>  static struct irq_domain *xive_ipi_irq_domain;
>>  
>> -/* The IPIs all use the same logical irq number */
>> -static u32 xive_ipi_irq;
>> +/* The IPIs use the same logical irq number when on the same chip */
>> +static struct xive_ipi_desc {
>> +	unsigned int irq;
>> +	char name[8]; /* enough bytes to fit IPI-XXX */
> 
> So this assumes that the node number that node is <= 999 ? This
> is certainly the case for now since CONFIG_NODES_SHIFT is 8
> on ppc64 but starting with 10, you'd have truncated names.

It should be harmless though. I agree this is a useless optimization.

> What about deriving the size of name[] from CONFIG_NODES_SHIFT ?

Yes.
 
> Apart from that, LGTM. Probably not worth to respin just for
> this.
> 
> I also could give a try in a KVM guest.
> 
> Topology passed to QEMU:
> 
>   -smp 8,maxcpus=8,cores=2,threads=2,sockets=2 \
>   -numa node,nodeid=0,cpus=0-4 \
>   -numa node,nodeid=1,cpus=4-7
> 
> Topology observed in guest with lstopo :
> 
>   Package L#0
>     NUMANode L#0 (P#0 30GB)
>     L1d L#0 (32KB) + L1i L#0 (32KB) + Core L#0
>       PU L#0 (P#0)
>       PU L#1 (P#1)
>     L1d L#1 (32KB) + L1i L#1 (32KB) + Core L#1
>       PU L#2 (P#2)
>       PU L#3 (P#3)
>   Package L#1
>     NUMANode L#1 (P#1 32GB)
>     L1d L#2 (32KB) + L1i L#2 (32KB) + Core L#2
>       PU L#4 (P#4)
>       PU L#5 (P#5)
>     L1d L#3 (32KB) + L1i L#3 (32KB) + Core L#3
>       PU L#6 (P#6)
>       PU L#7 (P#7)
> 
> Interrupts in guest:
> 
> $ cat /proc/interrupts 
>            CPU0       CPU1       CPU2       CPU3       CPU4       CPU5       CPU6       CPU7       
>  16:       1023        871       1042        749          0          0          0          0  XIVE-IPI   0 Edge      IPI-0
>  17:          0          0          0          0       2123       1019       1263       1288  XIVE-IPI   1 Edge      IPI-1
> 
> IPIs are mapped to the appropriate nodes, and the numbers indicate
> that everything is working as expected.

You should see the same on 2 socket PowerNV QEMU machine.
 
> Reviewed-and-tested-by: Greg Kurz <groug@kaod.org>

Thanks,

C. 

> 
>> +} *xive_ipis;
>> +
>> +static unsigned int xive_ipi_cpu_to_irq(unsigned int cpu)
>> +{
>> +	return xive_ipis[cpu_to_node(cpu)].irq;
>> +}
>>  #endif
>>  
>>  /* Xive state for each CPU */
>> @@ -1106,25 +1114,36 @@ static const struct irq_domain_ops xive_ipi_irq_domain_ops = {
>>  
>>  static void __init xive_request_ipi(void)
>>  {
>> -	unsigned int virq;
>> +	unsigned int node;
>>  
>> -	xive_ipi_irq_domain = irq_domain_add_linear(NULL, 1,
>> +	xive_ipi_irq_domain = irq_domain_add_linear(NULL, nr_node_ids,
>>  						    &xive_ipi_irq_domain_ops, NULL);
>>  	if (WARN_ON(xive_ipi_irq_domain == NULL))
>>  		return;
>>  
>> -	/* Initialize it */
>> -	virq = irq_create_mapping(xive_ipi_irq_domain, XIVE_IPI_HW_IRQ);
>> -	xive_ipi_irq = virq;
>> +	xive_ipis = kcalloc(nr_node_ids, sizeof(*xive_ipis), GFP_KERNEL | __GFP_NOFAIL);
>> +	for_each_node(node) {
>> +		struct xive_ipi_desc *xid = &xive_ipis[node];
>> +		irq_hw_number_t node_ipi_hwirq = node;
>> +
>> +		/*
>> +		 * Map one IPI interrupt per node for all cpus of that node.
>> +		 * Since the HW interrupt number doesn't have any meaning,
>> +		 * simply use the node number.
>> +		 */
>> +		xid->irq = irq_create_mapping(xive_ipi_irq_domain, node_ipi_hwirq);
>> +		snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
>>  
>> -	WARN_ON(request_irq(virq, xive_muxed_ipi_action,
>> -			    IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
>> +		WARN_ON(request_irq(xid->irq, xive_muxed_ipi_action,
>> +				    IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL));
>> +	}
>>  }
>>  
>>  static int xive_setup_cpu_ipi(unsigned int cpu)
>>  {
>>  	struct xive_cpu *xc;
>>  	int rc;
>> +	unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
>>  
>>  	pr_debug("Setting up IPI for CPU %d\n", cpu);
>>  
>> @@ -1165,6 +1184,8 @@ static int xive_setup_cpu_ipi(unsigned int cpu)
>>  
>>  static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc)
>>  {
>> +	unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
>> +
>>  	/* Disable the IPI and free the IRQ data */
>>  
>>  	/* Already cleaned up ? */
>
Cédric Le Goater March 30, 2021, 4:18 p.m. UTC | #3
On 3/3/21 6:48 PM, Cédric Le Goater wrote:
> ipistorm [*] can be used to benchmark the raw interrupt rate of an
> interrupt controller by measuring the number of IPIs a system can
> sustain. When applied to the XIVE interrupt controller of POWER9 and
> POWER10 systems, a significant drop of the interrupt rate can be
> observed when crossing the second node boundary.
> 
> This is due to the fact that a single IPI interrupt is used for all
> CPUs of the system. The structure is shared and the cache line updates
> impact greatly the traffic between nodes and the overall IPI
> performance.
> 
> As a workaround, the impact can be reduced by deactivating the IRQ
> lockup detector ("noirqdebug") which does a lot of accounting in the
> Linux IRQ descriptor structure and is responsible for most of the
> performance penalty.
> 
> As a fix, this proposal allocates an IPI interrupt per node, to be
> shared by all CPUs of that node. It solves the scaling issue, the IRQ
> lockup detector still has an impact but the XIVE interrupt rate scales
> linearly. It also improves the "noirqdebug" case as showed in the
> tables below.
> 
>  * P9 DD2.2 - 2s * 64 threads
> 
>                                                "noirqdebug"
>                         Mint/s                    Mint/s
>  chips  cpus      IPI/sys   IPI/chip       IPI/chip    IPI/sys
>  --------------------------------------------------------------
>  1      0-15     4.984023   4.875405       4.996536   5.048892
>         0-31    10.879164  10.544040      10.757632  11.037859
>         0-47    15.345301  14.688764      14.926520  15.310053
>         0-63    17.064907  17.066812      17.613416  17.874511
>  2      0-79    11.768764  21.650749      22.689120  22.566508
>         0-95    10.616812  26.878789      28.434703  28.320324
>         0-111   10.151693  31.397803      31.771773  32.388122
>         0-127    9.948502  33.139336      34.875716  35.224548
> 
>  * P10 DD1 - 4s (not homogeneous) 352 threads
> 
>                                                "noirqdebug"
>                         Mint/s                    Mint/s
>  chips  cpus      IPI/sys   IPI/chip       IPI/chip    IPI/sys
>  --------------------------------------------------------------
>  1      0-15     2.409402   2.364108       2.383303   2.395091
>         0-31     6.028325   6.046075       6.089999   6.073750
>         0-47     8.655178   8.644531       8.712830   8.724702
>         0-63    11.629652  11.735953      12.088203  12.055979
>         0-79    14.392321  14.729959      14.986701  14.973073
>         0-95    12.604158  13.004034      17.528748  17.568095
>  2      0-111    9.767753  13.719831      19.968606  20.024218
>         0-127    6.744566  16.418854      22.898066  22.995110
>         0-143    6.005699  19.174421      25.425622  25.417541
>         0-159    5.649719  21.938836      27.952662  28.059603
>         0-175    5.441410  24.109484      31.133915  31.127996
>  3      0-191    5.318341  24.405322      33.999221  33.775354
>         0-207    5.191382  26.449769      36.050161  35.867307
>         0-223    5.102790  29.356943      39.544135  39.508169
>         0-239    5.035295  31.933051      42.135075  42.071975
>         0-255    4.969209  34.477367      44.655395  44.757074
>  4      0-271    4.907652  35.887016      47.080545  47.318537
>         0-287    4.839581  38.076137      50.464307  50.636219
>         0-303    4.786031  40.881319      53.478684  53.310759
>         0-319    4.743750  43.448424      56.388102  55.973969
>         0-335    4.709936  45.623532      59.400930  58.926857
>         0-351    4.681413  45.646151      62.035804  61.830057
> 
> [*] https://github.com/antonblanchard/ipistorm
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  arch/powerpc/sysdev/xive/xive-internal.h |  2 --
>  arch/powerpc/sysdev/xive/common.c        | 39 ++++++++++++++++++------
>  2 files changed, 30 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
> index 9cf57c722faa..b3a456fdd3a5 100644
> --- a/arch/powerpc/sysdev/xive/xive-internal.h
> +++ b/arch/powerpc/sysdev/xive/xive-internal.h
> @@ -5,8 +5,6 @@
>  #ifndef __XIVE_INTERNAL_H
>  #define __XIVE_INTERNAL_H
>  
> -#define XIVE_IPI_HW_IRQ		0 /* interrupt source # for IPIs */
> -
>  /*
>   * A "disabled" interrupt should never fire, to catch problems
>   * we set its logical number to this
> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
> index 8eefd152b947..c27f7bb0494b 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -65,8 +65,16 @@ static struct irq_domain *xive_irq_domain;
>  #ifdef CONFIG_SMP
>  static struct irq_domain *xive_ipi_irq_domain;
>  
> -/* The IPIs all use the same logical irq number */
> -static u32 xive_ipi_irq;
> +/* The IPIs use the same logical irq number when on the same chip */
> +static struct xive_ipi_desc {
> +	unsigned int irq;
> +	char name[8]; /* enough bytes to fit IPI-XXX */
> +} *xive_ipis;
> +
> +static unsigned int xive_ipi_cpu_to_irq(unsigned int cpu)
> +{
> +	return xive_ipis[cpu_to_node(cpu)].irq;

This should be using early_cpu_to_node() for hotplugged CPU, else the CPU IPI 
will be mapped on default node 0. Still works but this is not what we want. 

> +}
>  #endif
>  
>  /* Xive state for each CPU */
> @@ -1106,25 +1114,36 @@ static const struct irq_domain_ops xive_ipi_irq_domain_ops = {
>  
>  static void __init xive_request_ipi(void)
>  {
> -	unsigned int virq;
> +	unsigned int node;
>  
> -	xive_ipi_irq_domain = irq_domain_add_linear(NULL, 1,
> +	xive_ipi_irq_domain = irq_domain_add_linear(NULL, nr_node_ids,
>  						    &xive_ipi_irq_domain_ops, NULL);
>  	if (WARN_ON(xive_ipi_irq_domain == NULL))
>  		return;
>  
> -	/* Initialize it */
> -	virq = irq_create_mapping(xive_ipi_irq_domain, XIVE_IPI_HW_IRQ);
> -	xive_ipi_irq = virq;
> +	xive_ipis = kcalloc(nr_node_ids, sizeof(*xive_ipis), GFP_KERNEL | __GFP_NOFAIL);
> +	for_each_node(node) {
> +		struct xive_ipi_desc *xid = &xive_ipis[node];
> +		irq_hw_number_t node_ipi_hwirq = node;

There, we need to filter cpu-less nodes. There is no need to allocate IPIs
for these.

> +		/*
> +		 * Map one IPI interrupt per node for all cpus of that node.
> +		 * Since the HW interrupt number doesn't have any meaning,
> +		 * simply use the node number.
> +		 */
> +		xid->irq = irq_create_mapping(xive_ipi_irq_domain, node_ipi_hwirq);
> +		snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);

and this mapping needs some modernization. If we use a domain allocator, the 
IPI irq descriptor will be allocated on the node it is serving which is even 
better for cache performance.

I will send a v3 with these changes.

C.

> -	WARN_ON(request_irq(virq, xive_muxed_ipi_action,
> -			    IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
> +		WARN_ON(request_irq(xid->irq, xive_muxed_ipi_action,
> +				    IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL));
> +	}
>  }
>  
>  static int xive_setup_cpu_ipi(unsigned int cpu)
>  {
>  	struct xive_cpu *xc;
>  	int rc;
> +	unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
>  
>  	pr_debug("Setting up IPI for CPU %d\n", cpu);
>  
> @@ -1165,6 +1184,8 @@ static int xive_setup_cpu_ipi(unsigned int cpu)
>  
>  static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc)
>  {
> +	unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
> +
>  	/* Disable the IPI and free the IRQ data */
>  
>  	/* Already cleaned up ? */
>
diff mbox series

Patch

diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
index 9cf57c722faa..b3a456fdd3a5 100644
--- a/arch/powerpc/sysdev/xive/xive-internal.h
+++ b/arch/powerpc/sysdev/xive/xive-internal.h
@@ -5,8 +5,6 @@ 
 #ifndef __XIVE_INTERNAL_H
 #define __XIVE_INTERNAL_H
 
-#define XIVE_IPI_HW_IRQ		0 /* interrupt source # for IPIs */
-
 /*
  * A "disabled" interrupt should never fire, to catch problems
  * we set its logical number to this
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 8eefd152b947..c27f7bb0494b 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -65,8 +65,16 @@  static struct irq_domain *xive_irq_domain;
 #ifdef CONFIG_SMP
 static struct irq_domain *xive_ipi_irq_domain;
 
-/* The IPIs all use the same logical irq number */
-static u32 xive_ipi_irq;
+/* The IPIs use the same logical irq number when on the same chip */
+static struct xive_ipi_desc {
+	unsigned int irq;
+	char name[8]; /* enough bytes to fit IPI-XXX */
+} *xive_ipis;
+
+static unsigned int xive_ipi_cpu_to_irq(unsigned int cpu)
+{
+	return xive_ipis[cpu_to_node(cpu)].irq;
+}
 #endif
 
 /* Xive state for each CPU */
@@ -1106,25 +1114,36 @@  static const struct irq_domain_ops xive_ipi_irq_domain_ops = {
 
 static void __init xive_request_ipi(void)
 {
-	unsigned int virq;
+	unsigned int node;
 
-	xive_ipi_irq_domain = irq_domain_add_linear(NULL, 1,
+	xive_ipi_irq_domain = irq_domain_add_linear(NULL, nr_node_ids,
 						    &xive_ipi_irq_domain_ops, NULL);
 	if (WARN_ON(xive_ipi_irq_domain == NULL))
 		return;
 
-	/* Initialize it */
-	virq = irq_create_mapping(xive_ipi_irq_domain, XIVE_IPI_HW_IRQ);
-	xive_ipi_irq = virq;
+	xive_ipis = kcalloc(nr_node_ids, sizeof(*xive_ipis), GFP_KERNEL | __GFP_NOFAIL);
+	for_each_node(node) {
+		struct xive_ipi_desc *xid = &xive_ipis[node];
+		irq_hw_number_t node_ipi_hwirq = node;
+
+		/*
+		 * Map one IPI interrupt per node for all cpus of that node.
+		 * Since the HW interrupt number doesn't have any meaning,
+		 * simply use the node number.
+		 */
+		xid->irq = irq_create_mapping(xive_ipi_irq_domain, node_ipi_hwirq);
+		snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
 
-	WARN_ON(request_irq(virq, xive_muxed_ipi_action,
-			    IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
+		WARN_ON(request_irq(xid->irq, xive_muxed_ipi_action,
+				    IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL));
+	}
 }
 
 static int xive_setup_cpu_ipi(unsigned int cpu)
 {
 	struct xive_cpu *xc;
 	int rc;
+	unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
 
 	pr_debug("Setting up IPI for CPU %d\n", cpu);
 
@@ -1165,6 +1184,8 @@  static int xive_setup_cpu_ipi(unsigned int cpu)
 
 static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc)
 {
+	unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
+
 	/* Disable the IPI and free the IRQ data */
 
 	/* Already cleaned up ? */