diff mbox

[qemu] spapr_pci: Add numa node id

Message ID 1469606618-26915-1-git-send-email-aik@ozlabs.ru
State New
Headers show

Commit Message

Alexey Kardashevskiy July 27, 2016, 8:03 a.m. UTC
This adds a numa id property to a PHB to allow linking passed PCI device
to CPU/memory. It is up to the management stack to do CPU/memory pinning
to the node with the actual PCI device.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 hw/ppc/spapr_pci.c          | 13 +++++++++++++
 include/hw/pci-host/spapr.h |  2 ++
 2 files changed, 15 insertions(+)

Comments

Alexey Kardashevskiy Aug. 10, 2016, 5:42 a.m. UTC | #1
On 27/07/16 18:03, Alexey Kardashevskiy wrote:
> This adds a numa id property to a PHB to allow linking passed PCI device
> to CPU/memory. It is up to the management stack to do CPU/memory pinning
> to the node with the actual PCI device.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Ping?


> ---
>  hw/ppc/spapr_pci.c          | 13 +++++++++++++
>  include/hw/pci-host/spapr.h |  2 ++
>  2 files changed, 15 insertions(+)
> 
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 949c44f..af5394a 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -47,6 +47,7 @@
>  #include "sysemu/device_tree.h"
>  #include "sysemu/kvm.h"
>  #include "sysemu/hostmem.h"
> +#include "sysemu/numa.h"
>  
>  #include "hw/vfio/vfio.h"
>  
> @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = {
>      DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
>      DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
>                         (1ULL << 12) | (1ULL << 16)),
> +    DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>          cpu_to_be32(1),
>          cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
>      };
> +    uint32_t associativity[] = {cpu_to_be32(0x4),
> +                                cpu_to_be32(0x0),
> +                                cpu_to_be32(0x0),
> +                                cpu_to_be32(0x0),
> +                                cpu_to_be32(phb->numa_node)};
>      sPAPRTCETable *tcet;
>      PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
>      sPAPRFDT s_fdt;
> @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>                           &ddw_extensions, sizeof(ddw_extensions)));
>      }
>  
> +    /* Advertise NUMA via ibm,associativity */
> +    if (nb_numa_nodes > 1) {
> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity,
> +                         sizeof(associativity)));
> +    }
> +
>      /* Build the interrupt-map, this must matches what is done
>       * in pci_spapr_map_irq
>       */
> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> index 5adc603..53c4b2d 100644
> --- a/include/hw/pci-host/spapr.h
> +++ b/include/hw/pci-host/spapr.h
> @@ -75,6 +75,8 @@ struct sPAPRPHBState {
>      bool ddw_enabled;
>      uint64_t page_size_mask;
>      uint64_t dma64_win_addr;
> +
> +    uint32_t numa_node;
>  };
>  
>  #define SPAPR_PCI_MAX_INDEX          255
>
Michael Roth Sept. 13, 2016, 11:29 p.m. UTC | #2
Quoting Alexey Kardashevskiy (2016-07-27 03:03:38)
> This adds a numa id property to a PHB to allow linking passed PCI device
> to CPU/memory. It is up to the management stack to do CPU/memory pinning
> to the node with the actual PCI device.

It looks like x86 relies on PCIBus->numa_node() method (via
pci_bus_numa_node()) to encode similar PCIBus affinities
into ACPI tables, and currently exposes it via
-device pci-[-express]-expander-bus,numa_node=X.

Maybe we should implement it using this existing
PCIBus->numa_node() interface?

We'd still have to expose numa_node as a spapr-pci-host-bridge
device option though. Not sure if there's a more common way
to expose it that might be easier for libvirt to discover. As it
stands we'd need to add spapr-pci-host-bridge to a libvirt
whitelist that currently only covers pci-expander-bus.

Cc'ing Shiva who was looking into the libvirt side.

One comment below:

> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  hw/ppc/spapr_pci.c          | 13 +++++++++++++
>  include/hw/pci-host/spapr.h |  2 ++
>  2 files changed, 15 insertions(+)
> 
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 949c44f..af5394a 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -47,6 +47,7 @@
>  #include "sysemu/device_tree.h"
>  #include "sysemu/kvm.h"
>  #include "sysemu/hostmem.h"
> +#include "sysemu/numa.h"
> 
>  #include "hw/vfio/vfio.h"
> 
> @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = {
>      DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
>      DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
>                         (1ULL << 12) | (1ULL << 16)),
> +    DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1),
>      DEFINE_PROP_END_OF_LIST(),
>  };
> 
> @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>          cpu_to_be32(1),
>          cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
>      };
> +    uint32_t associativity[] = {cpu_to_be32(0x4),
> +                                cpu_to_be32(0x0),
> +                                cpu_to_be32(0x0),
> +                                cpu_to_be32(0x0),
> +                                cpu_to_be32(phb->numa_node)};
>      sPAPRTCETable *tcet;
>      PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
>      sPAPRFDT s_fdt;
> @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>                           &ddw_extensions, sizeof(ddw_extensions)));
>      }
> 
> +    /* Advertise NUMA via ibm,associativity */
> +    if (nb_numa_nodes > 1) {
> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity,
> +                         sizeof(associativity)));
> +    }

LoPAPR 15.3 seems to suggest that ibm,associativity-reference-points is
required alongside ibm,associativity for each DT node it appears in,
and since we hardcode "Form 1" affinity it should be done similarly as
the entry we place in the top-level DT node.

> +
>      /* Build the interrupt-map, this must matches what is done
>       * in pci_spapr_map_irq
>       */
> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> index 5adc603..53c4b2d 100644
> --- a/include/hw/pci-host/spapr.h
> +++ b/include/hw/pci-host/spapr.h
> @@ -75,6 +75,8 @@ struct sPAPRPHBState {
>      bool ddw_enabled;
>      uint64_t page_size_mask;
>      uint64_t dma64_win_addr;
> +
> +    uint32_t numa_node;
>  };
> 
>  #define SPAPR_PCI_MAX_INDEX          255
> -- 
> 2.5.0.rc3
>
Alexey Kardashevskiy Sept. 14, 2016, 9:39 a.m. UTC | #3
On 14/09/16 09:29, Michael Roth wrote:
> Quoting Alexey Kardashevskiy (2016-07-27 03:03:38)
>> This adds a numa id property to a PHB to allow linking passed PCI device
>> to CPU/memory. It is up to the management stack to do CPU/memory pinning
>> to the node with the actual PCI device.
> 
> It looks like x86 relies on PCIBus->numa_node() method (via
> pci_bus_numa_node()) to encode similar PCIBus affinities
> into ACPI tables, and currently exposes it via
> -device pci-[-express]-expander-bus,numa_node=X.



Well, until we allow DMA windows per PCI bus (not per PHB as it is now),
this won't make much sense for us (unless I am missing something here).


> Maybe we should implement it using this existing
> PCIBus->numa_node() interface?
> 
> We'd still have to expose numa_node as a spapr-pci-host-bridge
> device option though. Not sure if there's a more common way
> to expose it that might be easier for libvirt to discover. As it
> stands we'd need to add spapr-pci-host-bridge to a libvirt
> whitelist that currently only covers pci-expander-bus.
> 
> Cc'ing Shiva who was looking into the libvirt side.
> 
> One comment below:
> 
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  hw/ppc/spapr_pci.c          | 13 +++++++++++++
>>  include/hw/pci-host/spapr.h |  2 ++
>>  2 files changed, 15 insertions(+)
>>
>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
>> index 949c44f..af5394a 100644
>> --- a/hw/ppc/spapr_pci.c
>> +++ b/hw/ppc/spapr_pci.c
>> @@ -47,6 +47,7 @@
>>  #include "sysemu/device_tree.h"
>>  #include "sysemu/kvm.h"
>>  #include "sysemu/hostmem.h"
>> +#include "sysemu/numa.h"
>>
>>  #include "hw/vfio/vfio.h"
>>
>> @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = {
>>      DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
>>      DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
>>                         (1ULL << 12) | (1ULL << 16)),
>> +    DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1),
>>      DEFINE_PROP_END_OF_LIST(),
>>  };
>>
>> @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>>          cpu_to_be32(1),
>>          cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
>>      };
>> +    uint32_t associativity[] = {cpu_to_be32(0x4),
>> +                                cpu_to_be32(0x0),
>> +                                cpu_to_be32(0x0),
>> +                                cpu_to_be32(0x0),
>> +                                cpu_to_be32(phb->numa_node)};
>>      sPAPRTCETable *tcet;
>>      PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
>>      sPAPRFDT s_fdt;
>> @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>>                           &ddw_extensions, sizeof(ddw_extensions)));
>>      }
>>
>> +    /* Advertise NUMA via ibm,associativity */
>> +    if (nb_numa_nodes > 1) {
>> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity,
>> +                         sizeof(associativity)));
>> +    }
> 
> LoPAPR 15.3 seems to suggest that ibm,associativity-reference-points is
> required alongside ibm,associativity for each DT node it appears in,
> and since we hardcode "Form 1" affinity it should be done similarly as
> the entry we place in the top-level DT node.


Hm, okay, I'll add it. There is a question to Bharata - why do we have 4s
in spapr_create_fdt_skel()'s refpoints? Just a random pick?

vec5[5]==0x80 means we are doing "Form1" and these 4s are far/near distances?


>> +
>>      /* Build the interrupt-map, this must matches what is done
>>       * in pci_spapr_map_irq
>>       */
>> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
>> index 5adc603..53c4b2d 100644
>> --- a/include/hw/pci-host/spapr.h
>> +++ b/include/hw/pci-host/spapr.h
>> @@ -75,6 +75,8 @@ struct sPAPRPHBState {
>>      bool ddw_enabled;
>>      uint64_t page_size_mask;
>>      uint64_t dma64_win_addr;
>> +
>> +    uint32_t numa_node;
>>  };
>>
>>  #define SPAPR_PCI_MAX_INDEX          255
>> -- 
>> 2.5.0.rc3
>>
>
Michael Roth Sept. 14, 2016, 12:03 p.m. UTC | #4
Quoting Alexey Kardashevskiy (2016-09-14 04:39:10)
> On 14/09/16 09:29, Michael Roth wrote:
> > Quoting Alexey Kardashevskiy (2016-07-27 03:03:38)
> >> This adds a numa id property to a PHB to allow linking passed PCI device
> >> to CPU/memory. It is up to the management stack to do CPU/memory pinning
> >> to the node with the actual PCI device.
> > 
> > It looks like x86 relies on PCIBus->numa_node() method (via
> > pci_bus_numa_node()) to encode similar PCIBus affinities
> > into ACPI tables, and currently exposes it via
> > -device pci-[-express]-expander-bus,numa_node=X.
> 
> 
> 
> Well, until we allow DMA windows per PCI bus (not per PHB as it is now),
> this won't make much sense for us (unless I am missing something here).

I didn't consider that it's not a bus-level setting; I think
you're right that re-using the interface to both store/retrieve doesn't
make much sense in that case.

My thought was that since pci_bus_numa_node() could in theory come
to be relied upon by common PCI code, we should use it as well. But
even if it doesn't make sense for us to use it, wouldn't it make sense to
still set PCIBus->numa_node (in addition to the PHB-wide value) in the
off-chance that common code does come to rely on pci_bus_numa_node()?

> 
> 
> > Maybe we should implement it using this existing
> > PCIBus->numa_node() interface?
> > 
> > We'd still have to expose numa_node as a spapr-pci-host-bridge
> > device option though. Not sure if there's a more common way
> > to expose it that might be easier for libvirt to discover. As it
> > stands we'd need to add spapr-pci-host-bridge to a libvirt
> > whitelist that currently only covers pci-expander-bus.
> > 
> > Cc'ing Shiva who was looking into the libvirt side.
> > 
> > One comment below:
> > 
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >>  hw/ppc/spapr_pci.c          | 13 +++++++++++++
> >>  include/hw/pci-host/spapr.h |  2 ++
> >>  2 files changed, 15 insertions(+)
> >>
> >> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> >> index 949c44f..af5394a 100644
> >> --- a/hw/ppc/spapr_pci.c
> >> +++ b/hw/ppc/spapr_pci.c
> >> @@ -47,6 +47,7 @@
> >>  #include "sysemu/device_tree.h"
> >>  #include "sysemu/kvm.h"
> >>  #include "sysemu/hostmem.h"
> >> +#include "sysemu/numa.h"
> >>
> >>  #include "hw/vfio/vfio.h"
> >>
> >> @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = {
> >>      DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
> >>      DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
> >>                         (1ULL << 12) | (1ULL << 16)),
> >> +    DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1),
> >>      DEFINE_PROP_END_OF_LIST(),
> >>  };
> >>
> >> @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >>          cpu_to_be32(1),
> >>          cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
> >>      };
> >> +    uint32_t associativity[] = {cpu_to_be32(0x4),
> >> +                                cpu_to_be32(0x0),
> >> +                                cpu_to_be32(0x0),
> >> +                                cpu_to_be32(0x0),
> >> +                                cpu_to_be32(phb->numa_node)};
> >>      sPAPRTCETable *tcet;
> >>      PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
> >>      sPAPRFDT s_fdt;
> >> @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >>                           &ddw_extensions, sizeof(ddw_extensions)));
> >>      }
> >>
> >> +    /* Advertise NUMA via ibm,associativity */
> >> +    if (nb_numa_nodes > 1) {
> >> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity,
> >> +                         sizeof(associativity)));
> >> +    }
> > 
> > LoPAPR 15.3 seems to suggest that ibm,associativity-reference-points is
> > required alongside ibm,associativity for each DT node it appears in,
> > and since we hardcode "Form 1" affinity it should be done similarly as
> > the entry we place in the top-level DT node.
> 
> 
> Hm, okay, I'll add it. There is a question to Bharata - why do we have 4s
> in spapr_create_fdt_skel()'s refpoints? Just a random pick?
> 
> vec5[5]==0x80 means we are doing "Form1" and these 4s are far/near distances?
> 
> 
> >> +
> >>      /* Build the interrupt-map, this must matches what is done
> >>       * in pci_spapr_map_irq
> >>       */
> >> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> >> index 5adc603..53c4b2d 100644
> >> --- a/include/hw/pci-host/spapr.h
> >> +++ b/include/hw/pci-host/spapr.h
> >> @@ -75,6 +75,8 @@ struct sPAPRPHBState {
> >>      bool ddw_enabled;
> >>      uint64_t page_size_mask;
> >>      uint64_t dma64_win_addr;
> >> +
> >> +    uint32_t numa_node;
> >>  };
> >>
> >>  #define SPAPR_PCI_MAX_INDEX          255
> >> -- 
> >> 2.5.0.rc3
> >>
> > 
> 
> 
> -- 
> Alexey
>
Bharata B Rao Sept. 19, 2016, 3:39 a.m. UTC | #5
On Wed, Sep 14, 2016 at 07:39:10PM +1000, Alexey Kardashevskiy wrote:
> On 14/09/16 09:29, Michael Roth wrote:
> > Quoting Alexey Kardashevskiy (2016-07-27 03:03:38)
> >> This adds a numa id property to a PHB to allow linking passed PCI device
> >> to CPU/memory. It is up to the management stack to do CPU/memory pinning
> >> to the node with the actual PCI device.
> > 
> > It looks like x86 relies on PCIBus->numa_node() method (via
> > pci_bus_numa_node()) to encode similar PCIBus affinities
> > into ACPI tables, and currently exposes it via
> > -device pci-[-express]-expander-bus,numa_node=X.
> 
> 
> 
> Well, until we allow DMA windows per PCI bus (not per PHB as it is now),
> this won't make much sense for us (unless I am missing something here).
> 
> 
> > Maybe we should implement it using this existing
> > PCIBus->numa_node() interface?
> > 
> > We'd still have to expose numa_node as a spapr-pci-host-bridge
> > device option though. Not sure if there's a more common way
> > to expose it that might be easier for libvirt to discover. As it
> > stands we'd need to add spapr-pci-host-bridge to a libvirt
> > whitelist that currently only covers pci-expander-bus.
> > 
> > Cc'ing Shiva who was looking into the libvirt side.
> > 
> > One comment below:
> > 
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >>  hw/ppc/spapr_pci.c          | 13 +++++++++++++
> >>  include/hw/pci-host/spapr.h |  2 ++
> >>  2 files changed, 15 insertions(+)
> >>
> >> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> >> index 949c44f..af5394a 100644
> >> --- a/hw/ppc/spapr_pci.c
> >> +++ b/hw/ppc/spapr_pci.c
> >> @@ -47,6 +47,7 @@
> >>  #include "sysemu/device_tree.h"
> >>  #include "sysemu/kvm.h"
> >>  #include "sysemu/hostmem.h"
> >> +#include "sysemu/numa.h"
> >>
> >>  #include "hw/vfio/vfio.h"
> >>
> >> @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = {
> >>      DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
> >>      DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
> >>                         (1ULL << 12) | (1ULL << 16)),
> >> +    DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1),
> >>      DEFINE_PROP_END_OF_LIST(),
> >>  };
> >>
> >> @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >>          cpu_to_be32(1),
> >>          cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
> >>      };
> >> +    uint32_t associativity[] = {cpu_to_be32(0x4),
> >> +                                cpu_to_be32(0x0),
> >> +                                cpu_to_be32(0x0),
> >> +                                cpu_to_be32(0x0),
> >> +                                cpu_to_be32(phb->numa_node)};
> >>      sPAPRTCETable *tcet;
> >>      PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
> >>      sPAPRFDT s_fdt;
> >> @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >>                           &ddw_extensions, sizeof(ddw_extensions)));
> >>      }
> >>
> >> +    /* Advertise NUMA via ibm,associativity */
> >> +    if (nb_numa_nodes > 1) {
> >> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity,
> >> +                         sizeof(associativity)));
> >> +    }
> > 
> > LoPAPR 15.3 seems to suggest that ibm,associativity-reference-points is
> > required alongside ibm,associativity for each DT node it appears in,
> > and since we hardcode "Form 1" affinity it should be done similarly as
> > the entry we place in the top-level DT node.
> 
> 
> Hm, okay, I'll add it. There is a question to Bharata - why do we have 4s
> in spapr_create_fdt_skel()'s refpoints? Just a random pick?

I remember basing it on what I saw in an LPAR.

> 
> vec5[5]==0x80 means we are doing "Form1" and these 4s are far/near distances?

This comment from https://github.com/open-power/skiboot/blob/master/core/affinity.c should explain things:

/*
 *
 * We currently construct our associativity properties as such:
 *
 * - For "chip" devices (bridges, memory, ...), 4 entries:
 *
 *     - CCM node ID
 *     - HW card ID
 *     - HW module ID
 *     - Chip ID
 *
 *   The information is constructed based on the chip ID which (unlike
 *   pHyp) is our HW chip ID (aka "XSCOM" chip ID). We use it to retrieve
 *   the other properties from the corresponding chip/xscom node in the
 *   device-tree. If those properties are absent, 0 is used.
 *
 * - For "core" devices, we add a 5th entry:
 *
 *     - Core ID
 *
 *   Here too, we do not use the "cooked" HW processor ID from HDAT but
 *   instead use the real HW core ID which is basically the interrupt
 *   server number of thread 0 on that core.
 *
 *
 * The ibm,associativity-reference-points property is currently set to
 * 4,4 indicating that the chip ID is our only reference point. This
 * should be extended to encompass the node IDs eventually.
 */

Regards,
Bharata.
David Gibson Sept. 22, 2016, 4:48 a.m. UTC | #6
On Wed, Sep 14, 2016 at 07:39:10PM +1000, Alexey Kardashevskiy wrote:
> On 14/09/16 09:29, Michael Roth wrote:
> > Quoting Alexey Kardashevskiy (2016-07-27 03:03:38)
> >> This adds a numa id property to a PHB to allow linking passed PCI device
> >> to CPU/memory. It is up to the management stack to do CPU/memory pinning
> >> to the node with the actual PCI device.
> > 
> > It looks like x86 relies on PCIBus->numa_node() method (via
> > pci_bus_numa_node()) to encode similar PCIBus affinities
> > into ACPI tables, and currently exposes it via
> > -device pci-[-express]-expander-bus,numa_node=X.
> 
> Well, until we allow DMA windows per PCI bus (not per PHB as it is now),
> this won't make much sense for us (unless I am missing something
> here).

I can't actually see any sane way we could have NUMA associativity of
PCI at any finer granularity than the host bridge level.  I suspect
the only reason it works that way on x86 is due to some of the weird
stuff PC does to make what are effectively different host bridges
appear as different buses in a single logical domain.

> 
> > Maybe we should implement it using this existing
> > PCIBus->numa_node() interface?
> > 
> > We'd still have to expose numa_node as a spapr-pci-host-bridge
> > device option though. Not sure if there's a more common way
> > to expose it that might be easier for libvirt to discover. As it
> > stands we'd need to add spapr-pci-host-bridge to a libvirt
> > whitelist that currently only covers pci-expander-bus.
> > 
> > Cc'ing Shiva who was looking into the libvirt side.

I think we should change the actual name of the option to "numa_node"
to match the option used on the pxb, though.

> > 
> > One comment below:
> > 
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >>  hw/ppc/spapr_pci.c          | 13 +++++++++++++
> >>  include/hw/pci-host/spapr.h |  2 ++
> >>  2 files changed, 15 insertions(+)
> >>
> >> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> >> index 949c44f..af5394a 100644
> >> --- a/hw/ppc/spapr_pci.c
> >> +++ b/hw/ppc/spapr_pci.c
> >> @@ -47,6 +47,7 @@
> >>  #include "sysemu/device_tree.h"
> >>  #include "sysemu/kvm.h"
> >>  #include "sysemu/hostmem.h"
> >> +#include "sysemu/numa.h"
> >>
> >>  #include "hw/vfio/vfio.h"
> >>
> >> @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = {
> >>      DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
> >>      DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
> >>                         (1ULL << 12) | (1ULL << 16)),
> >> +    DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1),
> >>      DEFINE_PROP_END_OF_LIST(),
> >>  };
> >>
> >> @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >>          cpu_to_be32(1),
> >>          cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
> >>      };
> >> +    uint32_t associativity[] = {cpu_to_be32(0x4),
> >> +                                cpu_to_be32(0x0),
> >> +                                cpu_to_be32(0x0),
> >> +                                cpu_to_be32(0x0),
> >> +                                cpu_to_be32(phb->numa_node)};
> >>      sPAPRTCETable *tcet;
> >>      PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
> >>      sPAPRFDT s_fdt;
> >> @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >>                           &ddw_extensions, sizeof(ddw_extensions)));
> >>      }
> >>
> >> +    /* Advertise NUMA via ibm,associativity */
> >> +    if (nb_numa_nodes > 1) {
> >> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity,
> >> +                         sizeof(associativity)));
> >> +    }
> > 
> > LoPAPR 15.3 seems to suggest that ibm,associativity-reference-points is
> > required alongside ibm,associativity for each DT node it appears in,
> > and since we hardcode "Form 1" affinity it should be done similarly as
> > the entry we place in the top-level DT node.
> 
> 
> Hm, okay, I'll add it. There is a question to Bharata - why do we have 4s
> in spapr_create_fdt_skel()'s refpoints? Just a random pick?
> 
> vec5[5]==0x80 means we are doing "Form1" and these 4s are far/near distances?
> 
> 
> >> +
> >>      /* Build the interrupt-map, this must matches what is done
> >>       * in pci_spapr_map_irq
> >>       */
> >> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> >> index 5adc603..53c4b2d 100644
> >> --- a/include/hw/pci-host/spapr.h
> >> +++ b/include/hw/pci-host/spapr.h
> >> @@ -75,6 +75,8 @@ struct sPAPRPHBState {
> >>      bool ddw_enabled;
> >>      uint64_t page_size_mask;
> >>      uint64_t dma64_win_addr;
> >> +
> >> +    uint32_t numa_node;
> >>  };
> >>
> >>  #define SPAPR_PCI_MAX_INDEX          255
> >>
> > 
> 
>
David Gibson Sept. 22, 2016, 4:49 a.m. UTC | #7
On Wed, Sep 14, 2016 at 07:03:50AM -0500, Michael Roth wrote:
> Quoting Alexey Kardashevskiy (2016-09-14 04:39:10)
> > On 14/09/16 09:29, Michael Roth wrote:
> > > Quoting Alexey Kardashevskiy (2016-07-27 03:03:38)
> > >> This adds a numa id property to a PHB to allow linking passed PCI device
> > >> to CPU/memory. It is up to the management stack to do CPU/memory pinning
> > >> to the node with the actual PCI device.
> > > 
> > > It looks like x86 relies on PCIBus->numa_node() method (via
> > > pci_bus_numa_node()) to encode similar PCIBus affinities
> > > into ACPI tables, and currently exposes it via
> > > -device pci-[-express]-expander-bus,numa_node=X.
> > 
> > 
> > 
> > Well, until we allow DMA windows per PCI bus (not per PHB as it is now),
> > this won't make much sense for us (unless I am missing something here).
> 
> I didn't consider that it's not a bus-level setting; I think
> you're right that re-using the interface to both store/retrieve doesn't
> make much sense in that case.
> 
> My thought that was that since pci_bus_numa_node() could in theory come
> to be relied upon by common PCI code, that we should use it as well. But
> even if it doesn't make sense for us to use it, wouldn't it make sense to
> still set PCIBus->numa_node (in addition to the PHB-wide value) in the
> off-chance that common code does come to rely on
> pci_bus_numa_node()?

Yes, it would be a good idea to set the PCIBus node value to inherit
the one that's set for the host bridge, just in case any generic code
looks at it in future.

> 
> > 
> > 
> > > Maybe we should implement it using this existing
> > > PCIBus->numa_node() interface?
> > > 
> > > We'd still have to expose numa_node as a spapr-pci-host-bridge
> > > device option though. Not sure if there's a more common way
> > > to expose it that might be easier for libvirt to discover. As it
> > > stands we'd need to add spapr-pci-host-bridge to a libvirt
> > > whitelist that currently only covers pci-expander-bus.
> > > 
> > > Cc'ing Shiva who was looking into the libvirt side.
> > > 
> > > One comment below:
> > > 
> > >>
> > >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > >> ---
> > >>  hw/ppc/spapr_pci.c          | 13 +++++++++++++
> > >>  include/hw/pci-host/spapr.h |  2 ++
> > >>  2 files changed, 15 insertions(+)
> > >>
> > >> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> > >> index 949c44f..af5394a 100644
> > >> --- a/hw/ppc/spapr_pci.c
> > >> +++ b/hw/ppc/spapr_pci.c
> > >> @@ -47,6 +47,7 @@
> > >>  #include "sysemu/device_tree.h"
> > >>  #include "sysemu/kvm.h"
> > >>  #include "sysemu/hostmem.h"
> > >> +#include "sysemu/numa.h"
> > >>
> > >>  #include "hw/vfio/vfio.h"
> > >>
> > >> @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = {
> > >>      DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
> > >>      DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
> > >>                         (1ULL << 12) | (1ULL << 16)),
> > >> +    DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1),
> > >>      DEFINE_PROP_END_OF_LIST(),
> > >>  };
> > >>
> > >> @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> > >>          cpu_to_be32(1),
> > >>          cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
> > >>      };
> > >> +    uint32_t associativity[] = {cpu_to_be32(0x4),
> > >> +                                cpu_to_be32(0x0),
> > >> +                                cpu_to_be32(0x0),
> > >> +                                cpu_to_be32(0x0),
> > >> +                                cpu_to_be32(phb->numa_node)};
> > >>      sPAPRTCETable *tcet;
> > >>      PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
> > >>      sPAPRFDT s_fdt;
> > >> @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> > >>                           &ddw_extensions, sizeof(ddw_extensions)));
> > >>      }
> > >>
> > >> +    /* Advertise NUMA via ibm,associativity */
> > >> +    if (nb_numa_nodes > 1) {
> > >> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity,
> > >> +                         sizeof(associativity)));
> > >> +    }
> > > 
> > > LoPAPR 15.3 seems to suggest that ibm,associativity-reference-points is
> > > required alongside ibm,associativity for each DT node it appears in,
> > > and since we hardcode "Form 1" affinity it should be done similarly as
> > > the entry we place in the top-level DT node.
> > 
> > 
> > Hm, okay, I'll add it. There is a question to Bharata - why do we have 4s
> > in spapr_create_fdt_skel()'s refpoints? Just a random pick?
> > 
> > vec5[5]==0x80 means we are doing "Form1" and these 4s are far/near distances?
> > 
> > 
> > >> +
> > >>      /* Build the interrupt-map, this must matches what is done
> > >>       * in pci_spapr_map_irq
> > >>       */
> > >> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> > >> index 5adc603..53c4b2d 100644
> > >> --- a/include/hw/pci-host/spapr.h
> > >> +++ b/include/hw/pci-host/spapr.h
> > >> @@ -75,6 +75,8 @@ struct sPAPRPHBState {
> > >>      bool ddw_enabled;
> > >>      uint64_t page_size_mask;
> > >>      uint64_t dma64_win_addr;
> > >> +
> > >> +    uint32_t numa_node;
> > >>  };
> > >>
> > >>  #define SPAPR_PCI_MAX_INDEX          255
> > >>
> > > 
> > 
> > 
>
David Gibson Sept. 23, 2016, 1:58 a.m. UTC | #8
On Wed, Jul 27, 2016 at 06:03:38PM +1000, Alexey Kardashevskiy wrote:
> This adds a numa id property to a PHB to allow linking passed PCI device
> to CPU/memory. It is up to the management stack to do CPU/memory pinning
> to the node with the actual PCI device.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

I've applied this to ppc-for-2.8, renaming the property to "numa_node"
to match the similar option for pxb.

> ---
>  hw/ppc/spapr_pci.c          | 13 +++++++++++++
>  include/hw/pci-host/spapr.h |  2 ++
>  2 files changed, 15 insertions(+)
> 
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 949c44f..af5394a 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -47,6 +47,7 @@
>  #include "sysemu/device_tree.h"
>  #include "sysemu/kvm.h"
>  #include "sysemu/hostmem.h"
> +#include "sysemu/numa.h"
>  
>  #include "hw/vfio/vfio.h"
>  
> @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = {
>      DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
>      DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
>                         (1ULL << 12) | (1ULL << 16)),
> +    DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>          cpu_to_be32(1),
>          cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
>      };
> +    uint32_t associativity[] = {cpu_to_be32(0x4),
> +                                cpu_to_be32(0x0),
> +                                cpu_to_be32(0x0),
> +                                cpu_to_be32(0x0),
> +                                cpu_to_be32(phb->numa_node)};
>      sPAPRTCETable *tcet;
>      PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
>      sPAPRFDT s_fdt;
> @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>                           &ddw_extensions, sizeof(ddw_extensions)));
>      }
>  
> +    /* Advertise NUMA via ibm,associativity */
> +    if (nb_numa_nodes > 1) {
> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity,
> +                         sizeof(associativity)));
> +    }
> +
>      /* Build the interrupt-map, this must matches what is done
>       * in pci_spapr_map_irq
>       */
> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> index 5adc603..53c4b2d 100644
> --- a/include/hw/pci-host/spapr.h
> +++ b/include/hw/pci-host/spapr.h
> @@ -75,6 +75,8 @@ struct sPAPRPHBState {
>      bool ddw_enabled;
>      uint64_t page_size_mask;
>      uint64_t dma64_win_addr;
> +
> +    uint32_t numa_node;
>  };
>  
>  #define SPAPR_PCI_MAX_INDEX          255
David Gibson Sept. 23, 2016, 1:59 a.m. UTC | #9
On Thu, Sep 22, 2016 at 02:49:34PM +1000, David Gibson wrote:
> On Wed, Sep 14, 2016 at 07:03:50AM -0500, Michael Roth wrote:
> > Quoting Alexey Kardashevskiy (2016-09-14 04:39:10)
> > > On 14/09/16 09:29, Michael Roth wrote:
> > > > Quoting Alexey Kardashevskiy (2016-07-27 03:03:38)
> > > >> This adds a numa id property to a PHB to allow linking passed PCI device
> > > >> to CPU/memory. It is up to the management stack to do CPU/memory pinning
> > > >> to the node with the actual PCI device.
> > > > 
> > > > It looks like x86 relies on PCIBus->numa_node() method (via
> > > > pci_bus_numa_node()) to encode similar PCIBus affinities
> > > > into ACPI tables, and currently exposes it via
> > > > -device pci-[-express]-expander-bus,numa_node=X.
> > > 
> > > 
> > > 
> > > Well, until we allow DMA windows per PCI bus (not per PHB as it is now),
> > > this won't make much sense for us (unless I am missing something here).
> > 
> > I didn't consider that it's not a bus-level setting; I think
> > you're right that re-using the interface to both store/retrieve doesn't
> > make much sense in that case.
> > 
> > My thought that was that since pci_bus_numa_node() could in theory come
> > to be relied upon by common PCI code, that we should use it as well. But
> > even if it doesn't make sense for us to use it, wouldn't it make sense to
> > still set PCIBus->numa_node (in addition to the PHB-wide value) in the
> > off-chance that common code does come to rely on
> > pci_bus_numa_node()?
> 
> Yes, it would be a good idea to set the PCIBus node value to inherit
> the one that's set for the host bridge, just in case any generic code
> looks at it in future.

But that can be a followup patch, I've applied this to ppc-for-2.8
now.
diff mbox

Patch

diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 949c44f..af5394a 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -47,6 +47,7 @@ 
 #include "sysemu/device_tree.h"
 #include "sysemu/kvm.h"
 #include "sysemu/hostmem.h"
+#include "sysemu/numa.h"
 
 #include "hw/vfio/vfio.h"
 
@@ -1544,6 +1545,7 @@  static Property spapr_phb_properties[] = {
     DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
     DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
                        (1ULL << 12) | (1ULL << 16)),
+    DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -1805,6 +1807,11 @@  int spapr_populate_pci_dt(sPAPRPHBState *phb,
         cpu_to_be32(1),
         cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
     };
+    uint32_t associativity[] = {cpu_to_be32(0x4),
+                                cpu_to_be32(0x0),
+                                cpu_to_be32(0x0),
+                                cpu_to_be32(0x0),
+                                cpu_to_be32(phb->numa_node)};
     sPAPRTCETable *tcet;
     PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
     sPAPRFDT s_fdt;
@@ -1837,6 +1844,12 @@  int spapr_populate_pci_dt(sPAPRPHBState *phb,
                          &ddw_extensions, sizeof(ddw_extensions)));
     }
 
+    /* Advertise NUMA via ibm,associativity */
+    if (nb_numa_nodes > 1) {
+        _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity,
+                         sizeof(associativity)));
+    }
+
     /* Build the interrupt-map, this must matches what is done
      * in pci_spapr_map_irq
      */
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index 5adc603..53c4b2d 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -75,6 +75,8 @@  struct sPAPRPHBState {
     bool ddw_enabled;
     uint64_t page_size_mask;
     uint64_t dma64_win_addr;
+
+    uint32_t numa_node;
 };
 
 #define SPAPR_PCI_MAX_INDEX          255