Message ID | 1469606618-26915-1-git-send-email-aik@ozlabs.ru |
---|---|
State | New |
Headers | show |
On 27/07/16 18:03, Alexey Kardashevskiy wrote: > This adds a numa id property to a PHB to allow linking passed PCI device > to CPU/memory. It is up to the management stack to do CPU/memory pinning > to the node with the actual PCI device. > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Ping? > --- > hw/ppc/spapr_pci.c | 13 +++++++++++++ > include/hw/pci-host/spapr.h | 2 ++ > 2 files changed, 15 insertions(+) > > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > index 949c44f..af5394a 100644 > --- a/hw/ppc/spapr_pci.c > +++ b/hw/ppc/spapr_pci.c > @@ -47,6 +47,7 @@ > #include "sysemu/device_tree.h" > #include "sysemu/kvm.h" > #include "sysemu/hostmem.h" > +#include "sysemu/numa.h" > > #include "hw/vfio/vfio.h" > > @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = { > DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true), > DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask, > (1ULL << 12) | (1ULL << 16)), > + DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1), > DEFINE_PROP_END_OF_LIST(), > }; > > @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > cpu_to_be32(1), > cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) > }; > + uint32_t associativity[] = {cpu_to_be32(0x4), > + cpu_to_be32(0x0), > + cpu_to_be32(0x0), > + cpu_to_be32(0x0), > + cpu_to_be32(phb->numa_node)}; > sPAPRTCETable *tcet; > PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; > sPAPRFDT s_fdt; > @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > &ddw_extensions, sizeof(ddw_extensions))); > } > > + /* Advertise NUMA via ibm,associativity */ > + if (nb_numa_nodes > 1) { > + _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity, > + sizeof(associativity))); > + } > + > /* Build the interrupt-map, this must matches what is done > * in pci_spapr_map_irq > */ > diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h > index 5adc603..53c4b2d 100644 > --- a/include/hw/pci-host/spapr.h > +++ b/include/hw/pci-host/spapr.h > @@ -75,6 +75,8 @@ struct sPAPRPHBState { > bool ddw_enabled; > uint64_t page_size_mask; > uint64_t dma64_win_addr; > + > + uint32_t numa_node; > }; > > #define SPAPR_PCI_MAX_INDEX 255 >
Quoting Alexey Kardashevskiy (2016-07-27 03:03:38) > This adds a numa id property to a PHB to allow linking passed PCI device > to CPU/memory. It is up to the management stack to do CPU/memory pinning > to the node with the actual PCI device. It looks like x86 relies on PCIBus->numa_node() method (via pci_bus_numa_node()) to encode similar PCIBus affinities into ACPI tables, and currently exposes it via -device pci-[-express]-expander-bus,numa_node=X. Maybe we should implement it using this existing PCIBus->numa_node() interface? We'd still have to expose numa_node as a spapr-pci-host-bridge device option though. Not sure if there's a more common way to expose it that might be easier for libvirt to discover. As it stands we'd need to add spapr-pci-host-bridge to a libvirt whitelist that currently only covers pci-expander-bus. Cc'ing Shiva who was looking into the libvirt side. One comment below: > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > --- > hw/ppc/spapr_pci.c | 13 +++++++++++++ > include/hw/pci-host/spapr.h | 2 ++ > 2 files changed, 15 insertions(+) > > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > index 949c44f..af5394a 100644 > --- a/hw/ppc/spapr_pci.c > +++ b/hw/ppc/spapr_pci.c > @@ -47,6 +47,7 @@ > #include "sysemu/device_tree.h" > #include "sysemu/kvm.h" > #include "sysemu/hostmem.h" > +#include "sysemu/numa.h" > > #include "hw/vfio/vfio.h" > > @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = { > DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true), > DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask, > (1ULL << 12) | (1ULL << 16)), > + DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1), > DEFINE_PROP_END_OF_LIST(), > }; > > @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > cpu_to_be32(1), > cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) > }; > + uint32_t associativity[] = {cpu_to_be32(0x4), > + cpu_to_be32(0x0), > + cpu_to_be32(0x0), > + cpu_to_be32(0x0), > + cpu_to_be32(phb->numa_node)}; > sPAPRTCETable *tcet; > PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; > sPAPRFDT s_fdt; > @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > &ddw_extensions, sizeof(ddw_extensions))); > } > > + /* Advertise NUMA via ibm,associativity */ > + if (nb_numa_nodes > 1) { > + _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity, > + sizeof(associativity))); > + } LoPAPR 15.3 seems to suggest that ibm,associativity-reference-points is required alongside ibm,associativity for each DT node it appears in, and since we hardcode "Form 1" affinity it should be done similarly as the entry we place in the top-level DT node. > + > /* Build the interrupt-map, this must matches what is done > * in pci_spapr_map_irq > */ > diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h > index 5adc603..53c4b2d 100644 > --- a/include/hw/pci-host/spapr.h > +++ b/include/hw/pci-host/spapr.h > @@ -75,6 +75,8 @@ struct sPAPRPHBState { > bool ddw_enabled; > uint64_t page_size_mask; > uint64_t dma64_win_addr; > + > + uint32_t numa_node; > }; > > #define SPAPR_PCI_MAX_INDEX 255 > -- > 2.5.0.rc3 >
On 14/09/16 09:29, Michael Roth wrote: > Quoting Alexey Kardashevskiy (2016-07-27 03:03:38) >> This adds a numa id property to a PHB to allow linking passed PCI device >> to CPU/memory. It is up to the management stack to do CPU/memory pinning >> to the node with the actual PCI device. > > It looks like x86 relies on PCIBus->numa_node() method (via > pci_bus_numa_node()) to encode similar PCIBus affinities > into ACPI tables, and currently exposes it via > -device pci-[-express]-expander-bus,numa_node=X. Well, until we allow DMA windows per PCI bus (not per PHB as it is now), this won't make much sense for us (unless I am missing something here). > Maybe we should implement it using this existing > PCIBus->numa_node() interface? > > We'd still have to expose numa_node as a spapr-pci-host-bridge > device option though. Not sure if there's a more common way > to expose it that might be easier for libvirt to discover. As it > stands we'd need to add spapr-pci-host-bridge to a libvirt > whitelist that currently only covers pci-expander-bus. > > Cc'ing Shiva who was looking into the libvirt side. > > One comment below: > >> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> >> --- >> hw/ppc/spapr_pci.c | 13 +++++++++++++ >> include/hw/pci-host/spapr.h | 2 ++ >> 2 files changed, 15 insertions(+) >> >> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c >> index 949c44f..af5394a 100644 >> --- a/hw/ppc/spapr_pci.c >> +++ b/hw/ppc/spapr_pci.c >> @@ -47,6 +47,7 @@ >> #include "sysemu/device_tree.h" >> #include "sysemu/kvm.h" >> #include "sysemu/hostmem.h" >> +#include "sysemu/numa.h" >> >> #include "hw/vfio/vfio.h" >> >> @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = { >> DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true), >> DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask, >> (1ULL << 12) | (1ULL << 16)), >> + DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1), >> DEFINE_PROP_END_OF_LIST(), >> }; >> >> @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, >> cpu_to_be32(1), >> cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) >> }; >> + uint32_t associativity[] = {cpu_to_be32(0x4), >> + cpu_to_be32(0x0), >> + cpu_to_be32(0x0), >> + cpu_to_be32(0x0), >> + cpu_to_be32(phb->numa_node)}; >> sPAPRTCETable *tcet; >> PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; >> sPAPRFDT s_fdt; >> @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, >> &ddw_extensions, sizeof(ddw_extensions))); >> } >> >> + /* Advertise NUMA via ibm,associativity */ >> + if (nb_numa_nodes > 1) { >> + _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity, >> + sizeof(associativity))); >> + } > > LoPAPR 15.3 seems to suggest that ibm,associativity-reference-points is > required alongside ibm,associativity for each DT node it appears in, > and since we hardcode "Form 1" affinity it should be done similarly as > the entry we place in the top-level DT node. Hm, okay, I'll add it. There is a question to Bharata - why do we have 4s in spapr_create_fdt_skel()'s refpoints? Just a random pick? vec5[5]==0x80 means we are doing "Form1" and these 4s are far/near distances? >> + >> /* Build the interrupt-map, this must matches what is done >> * in pci_spapr_map_irq >> */ >> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h >> index 5adc603..53c4b2d 100644 >> --- a/include/hw/pci-host/spapr.h >> +++ b/include/hw/pci-host/spapr.h >> @@ -75,6 +75,8 @@ struct sPAPRPHBState { >> bool ddw_enabled; >> uint64_t page_size_mask; >> uint64_t dma64_win_addr; >> + >> + uint32_t numa_node; >> }; >> >> #define SPAPR_PCI_MAX_INDEX 255 >> -- >> 2.5.0.rc3 >> >
Quoting Alexey Kardashevskiy (2016-09-14 04:39:10) > On 14/09/16 09:29, Michael Roth wrote: > > Quoting Alexey Kardashevskiy (2016-07-27 03:03:38) > >> This adds a numa id property to a PHB to allow linking passed PCI device > >> to CPU/memory. It is up to the management stack to do CPU/memory pinning > >> to the node with the actual PCI device. > > > > It looks like x86 relies on PCIBus->numa_node() method (via > > pci_bus_numa_node()) to encode similar PCIBus affinities > > into ACPI tables, and currently exposes it via > > -device pci-[-express]-expander-bus,numa_node=X. > > > > Well, until we allow DMA windows per PCI bus (not per PHB as it is now), > this won't make much sense for us (unless I am missing something here). I didn't consider that it's not a bus-level setting; I think you're right that re-using the interface to both store/retrieve doesn't make much sense in that case. My thought that was that since pci_bus_numa_node() could in theory come to be relied upon by common PCI code, that we should use it as well. But even if it doesn't make sense for us to use it, wouldn't it make sense to still set PCIBus->numa_node (in addition to the PHB-wide value) in the off-chance that common code does come to rely on pci_bus_numa_node()? > > > > Maybe we should implement it using this existing > > PCIBus->numa_node() interface? > > > > We'd still have to expose numa_node as a spapr-pci-host-bridge > > device option though. Not sure if there's a more common way > > to expose it that might be easier for libvirt to discover. As it > > stands we'd need to add spapr-pci-host-bridge to a libvirt > > whitelist that currently only covers pci-expander-bus. > > > > Cc'ing Shiva who was looking into the libvirt side. > > > > One comment below: > > > >> > >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > >> --- > >> hw/ppc/spapr_pci.c | 13 +++++++++++++ > >> include/hw/pci-host/spapr.h | 2 ++ > >> 2 files changed, 15 insertions(+) > >> > >> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > >> index 949c44f..af5394a 100644 > >> --- a/hw/ppc/spapr_pci.c > >> +++ b/hw/ppc/spapr_pci.c > >> @@ -47,6 +47,7 @@ > >> #include "sysemu/device_tree.h" > >> #include "sysemu/kvm.h" > >> #include "sysemu/hostmem.h" > >> +#include "sysemu/numa.h" > >> > >> #include "hw/vfio/vfio.h" > >> > >> @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = { > >> DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true), > >> DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask, > >> (1ULL << 12) | (1ULL << 16)), > >> + DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1), > >> DEFINE_PROP_END_OF_LIST(), > >> }; > >> > >> @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > >> cpu_to_be32(1), > >> cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) > >> }; > >> + uint32_t associativity[] = {cpu_to_be32(0x4), > >> + cpu_to_be32(0x0), > >> + cpu_to_be32(0x0), > >> + cpu_to_be32(0x0), > >> + cpu_to_be32(phb->numa_node)}; > >> sPAPRTCETable *tcet; > >> PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; > >> sPAPRFDT s_fdt; > >> @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > >> &ddw_extensions, sizeof(ddw_extensions))); > >> } > >> > >> + /* Advertise NUMA via ibm,associativity */ > >> + if (nb_numa_nodes > 1) { > >> + _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity, > >> + sizeof(associativity))); > >> + } > > > > LoPAPR 15.3 seems to suggest that ibm,associativity-reference-points is > > required alongside ibm,associativity for each DT node it appears in, > > and since we hardcode "Form 1" affinity it should be done similarly as > > the entry we place in the top-level DT node. > > > Hm, okay, I'll add it. There is a question to Bharata - why do we have 4s > in spapr_create_fdt_skel()'s refpoints? Just a random pick? > > vec5[5]==0x80 means we are doing "Form1" and these 4s are far/near distances? > > > >> + > >> /* Build the interrupt-map, this must matches what is done > >> * in pci_spapr_map_irq > >> */ > >> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h > >> index 5adc603..53c4b2d 100644 > >> --- a/include/hw/pci-host/spapr.h > >> +++ b/include/hw/pci-host/spapr.h > >> @@ -75,6 +75,8 @@ struct sPAPRPHBState { > >> bool ddw_enabled; > >> uint64_t page_size_mask; > >> uint64_t dma64_win_addr; > >> + > >> + uint32_t numa_node; > >> }; > >> > >> #define SPAPR_PCI_MAX_INDEX 255 > >> -- > >> 2.5.0.rc3 > >> > > > > > -- > Alexey >
On Wed, Sep 14, 2016 at 07:39:10PM +1000, Alexey Kardashevskiy wrote: > On 14/09/16 09:29, Michael Roth wrote: > > Quoting Alexey Kardashevskiy (2016-07-27 03:03:38) > >> This adds a numa id property to a PHB to allow linking passed PCI device > >> to CPU/memory. It is up to the management stack to do CPU/memory pinning > >> to the node with the actual PCI device. > > > > It looks like x86 relies on PCIBus->numa_node() method (via > > pci_bus_numa_node()) to encode similar PCIBus affinities > > into ACPI tables, and currently exposes it via > > -device pci-[-express]-expander-bus,numa_node=X. > > > > Well, until we allow DMA windows per PCI bus (not per PHB as it is now), > this won't make much sense for us (unless I am missing something here). > > > > Maybe we should implement it using this existing > > PCIBus->numa_node() interface? > > > > We'd still have to expose numa_node as a spapr-pci-host-bridge > > device option though. Not sure if there's a more common way > > to expose it that might be easier for libvirt to discover. As it > > stands we'd need to add spapr-pci-host-bridge to a libvirt > > whitelist that currently only covers pci-expander-bus. > > > > Cc'ing Shiva who was looking into the libvirt side. > > > > One comment below: > > > >> > >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > >> --- > >> hw/ppc/spapr_pci.c | 13 +++++++++++++ > >> include/hw/pci-host/spapr.h | 2 ++ > >> 2 files changed, 15 insertions(+) > >> > >> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > >> index 949c44f..af5394a 100644 > >> --- a/hw/ppc/spapr_pci.c > >> +++ b/hw/ppc/spapr_pci.c > >> @@ -47,6 +47,7 @@ > >> #include "sysemu/device_tree.h" > >> #include "sysemu/kvm.h" > >> #include "sysemu/hostmem.h" > >> +#include "sysemu/numa.h" > >> > >> #include "hw/vfio/vfio.h" > >> > >> @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = { > >> DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true), > >> DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask, > >> (1ULL << 12) | (1ULL << 16)), > >> + DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1), > >> DEFINE_PROP_END_OF_LIST(), > >> }; > >> > >> @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > >> cpu_to_be32(1), > >> cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) > >> }; > >> + uint32_t associativity[] = {cpu_to_be32(0x4), > >> + cpu_to_be32(0x0), > >> + cpu_to_be32(0x0), > >> + cpu_to_be32(0x0), > >> + cpu_to_be32(phb->numa_node)}; > >> sPAPRTCETable *tcet; > >> PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; > >> sPAPRFDT s_fdt; > >> @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > >> &ddw_extensions, sizeof(ddw_extensions))); > >> } > >> > >> + /* Advertise NUMA via ibm,associativity */ > >> + if (nb_numa_nodes > 1) { > >> + _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity, > >> + sizeof(associativity))); > >> + } > > > > LoPAPR 15.3 seems to suggest that ibm,associativity-reference-points is > > required alongside ibm,associativity for each DT node it appears in, > > and since we hardcode "Form 1" affinity it should be done similarly as > > the entry we place in the top-level DT node. > > > Hm, okay, I'll add it. There is a question to Bharata - why do we have 4s > in spapr_create_fdt_skel()'s refpoints? Just a random pick? I remember basing it on what I saw in an LPAR> > > vec5[5]==0x80 means we are doing "Form1" and these 4s are far/near distances? This comment from https://github.com/open-power/skiboot/blob/master/core/affinity.c should explain things: /* * * We currently construct our associativity properties as such: * * - For "chip" devices (bridges, memory, ...), 4 entries: * * - CCM node ID * - HW card ID * - HW module ID * - Chip ID * * The information is constructed based on the chip ID which (unlike * pHyp) is our HW chip ID (aka "XSCOM" chip ID). We use it to retrieve * the other properties from the corresponding chip/xscom node in the * device-tree. If those properties are absent, 0 is used. * * - For "core" devices, we add a 5th entry: * * - Core ID * * Here too, we do not use the "cooked" HW processor ID from HDAT but * instead use the real HW core ID which is basically the interrupt * server number of thread 0 on that core. * * * The ibm,associativity-reference-points property is currently set to * 4,4 indicating that the chip ID is our only reference point. This * should be extended to encompass the node IDs eventually. */ Regards, Bharata.
On Wed, Sep 14, 2016 at 07:39:10PM +1000, Alexey Kardashevskiy wrote: > On 14/09/16 09:29, Michael Roth wrote: > > Quoting Alexey Kardashevskiy (2016-07-27 03:03:38) > >> This adds a numa id property to a PHB to allow linking passed PCI device > >> to CPU/memory. It is up to the management stack to do CPU/memory pinning > >> to the node with the actual PCI device. > > > > It looks like x86 relies on PCIBus->numa_node() method (via > > pci_bus_numa_node()) to encode similar PCIBus affinities > > into ACPI tables, and currently exposes it via > > -device pci-[-express]-expander-bus,numa_node=X. > > Well, until we allow DMA windows per PCI bus (not per PHB as it is now), > this won't make much sense for us (unless I am missing something > here). I can't actually see any sane way we could have NUMA associativity of PCI at any finer granularity than the host bridge level. I suspect the only reason it works that way on x86 is due to some of the weird stuff PC does to make what are effectively different host bridges appear as different buses in a single logical domain. > > > Maybe we should implement it using this existing > > PCIBus->numa_node() interface? > > > > We'd still have to expose numa_node as a spapr-pci-host-bridge > > device option though. Not sure if there's a more common way > > to expose it that might be easier for libvirt to discover. As it > > stands we'd need to add spapr-pci-host-bridge to a libvirt > > whitelist that currently only covers pci-expander-bus. > > > > Cc'ing Shiva who was looking into the libvirt side. I think we should change the actual name of the option to "numa_node" to match the option used on the pxb, though. > > > > One comment below: > > > >> > >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > >> --- > >> hw/ppc/spapr_pci.c | 13 +++++++++++++ > >> include/hw/pci-host/spapr.h | 2 ++ > >> 2 files changed, 15 insertions(+) > >> > >> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > >> index 949c44f..af5394a 100644 > >> --- a/hw/ppc/spapr_pci.c > >> +++ b/hw/ppc/spapr_pci.c > >> @@ -47,6 +47,7 @@ > >> #include "sysemu/device_tree.h" > >> #include "sysemu/kvm.h" > >> #include "sysemu/hostmem.h" > >> +#include "sysemu/numa.h" > >> > >> #include "hw/vfio/vfio.h" > >> > >> @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = { > >> DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true), > >> DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask, > >> (1ULL << 12) | (1ULL << 16)), > >> + DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1), > >> DEFINE_PROP_END_OF_LIST(), > >> }; > >> > >> @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > >> cpu_to_be32(1), > >> cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) > >> }; > >> + uint32_t associativity[] = {cpu_to_be32(0x4), > >> + cpu_to_be32(0x0), > >> + cpu_to_be32(0x0), > >> + cpu_to_be32(0x0), > >> + cpu_to_be32(phb->numa_node)}; > >> sPAPRTCETable *tcet; > >> PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; > >> sPAPRFDT s_fdt; > >> @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > >> &ddw_extensions, sizeof(ddw_extensions))); > >> } > >> > >> + /* Advertise NUMA via ibm,associativity */ > >> + if (nb_numa_nodes > 1) { > >> + _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity, > >> + sizeof(associativity))); > >> + } > > > > LoPAPR 15.3 seems to suggest that ibm,associativity-reference-points is > > required alongside ibm,associativity for each DT node it appears in, > > and since we hardcode "Form 1" affinity it should be done similarly as > > the entry we place in the top-level DT node. > > > Hm, okay, I'll add it. There is a question to Bharata - why do we have 4s > in spapr_create_fdt_skel()'s refpoints? Just a random pick? > > vec5[5]==0x80 means we are doing "Form1" and these 4s are far/near distances? > > > >> + > >> /* Build the interrupt-map, this must matches what is done > >> * in pci_spapr_map_irq > >> */ > >> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h > >> index 5adc603..53c4b2d 100644 > >> --- a/include/hw/pci-host/spapr.h > >> +++ b/include/hw/pci-host/spapr.h > >> @@ -75,6 +75,8 @@ struct sPAPRPHBState { > >> bool ddw_enabled; > >> uint64_t page_size_mask; > >> uint64_t dma64_win_addr; > >> + > >> + uint32_t numa_node; > >> }; > >> > >> #define SPAPR_PCI_MAX_INDEX 255 > >> > > > >
On Wed, Sep 14, 2016 at 07:03:50AM -0500, Michael Roth wrote: > Quoting Alexey Kardashevskiy (2016-09-14 04:39:10) > > On 14/09/16 09:29, Michael Roth wrote: > > > Quoting Alexey Kardashevskiy (2016-07-27 03:03:38) > > >> This adds a numa id property to a PHB to allow linking passed PCI device > > >> to CPU/memory. It is up to the management stack to do CPU/memory pinning > > >> to the node with the actual PCI device. > > > > > > It looks like x86 relies on PCIBus->numa_node() method (via > > > pci_bus_numa_node()) to encode similar PCIBus affinities > > > into ACPI tables, and currently exposes it via > > > -device pci-[-express]-expander-bus,numa_node=X. > > > > > > > > Well, until we allow DMA windows per PCI bus (not per PHB as it is now), > > this won't make much sense for us (unless I am missing something here). > > I didn't consider that it's not a bus-level setting; I think > you're right that re-using the interface to both store/retrieve doesn't > make much sense in that case. > > My thought that was that since pci_bus_numa_node() could in theory come > to be relied upon by common PCI code, that we should use it as well. But > even if it doesn't make sense for us to use it, wouldn't it make sense to > still set PCIBus->numa_node (in addition to the PHB-wide value) in the > off-chance that common code does come to rely on > pci_bus_numa_node()? Yes, it would be a good idea to set the PCIBus node value to inherit the one that's set for the host bridge, just in case any generic code looks at it in future. > > > > > > > > Maybe we should implement it using this existing > > > PCIBus->numa_node() interface? > > > > > > We'd still have to expose numa_node as a spapr-pci-host-bridge > > > device option though. Not sure if there's a more common way > > > to expose it that might be easier for libvirt to discover. As it > > > stands we'd need to add spapr-pci-host-bridge to a libvirt > > > whitelist that currently only covers pci-expander-bus. > > > > > > Cc'ing Shiva who was looking into the libvirt side. > > > > > > One comment below: > > > > > >> > > >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > > >> --- > > >> hw/ppc/spapr_pci.c | 13 +++++++++++++ > > >> include/hw/pci-host/spapr.h | 2 ++ > > >> 2 files changed, 15 insertions(+) > > >> > > >> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > > >> index 949c44f..af5394a 100644 > > >> --- a/hw/ppc/spapr_pci.c > > >> +++ b/hw/ppc/spapr_pci.c > > >> @@ -47,6 +47,7 @@ > > >> #include "sysemu/device_tree.h" > > >> #include "sysemu/kvm.h" > > >> #include "sysemu/hostmem.h" > > >> +#include "sysemu/numa.h" > > >> > > >> #include "hw/vfio/vfio.h" > > >> > > >> @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = { > > >> DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true), > > >> DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask, > > >> (1ULL << 12) | (1ULL << 16)), > > >> + DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1), > > >> DEFINE_PROP_END_OF_LIST(), > > >> }; > > >> > > >> @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > > >> cpu_to_be32(1), > > >> cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) > > >> }; > > >> + uint32_t associativity[] = {cpu_to_be32(0x4), > > >> + cpu_to_be32(0x0), > > >> + cpu_to_be32(0x0), > > >> + cpu_to_be32(0x0), > > >> + cpu_to_be32(phb->numa_node)}; > > >> sPAPRTCETable *tcet; > > >> PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; > > >> sPAPRFDT s_fdt; > > >> @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > > >> &ddw_extensions, sizeof(ddw_extensions))); > > >> } > > >> > > >> + /* Advertise NUMA via ibm,associativity */ > > >> + if (nb_numa_nodes > 1) { > > >> + _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity, > > >> + sizeof(associativity))); > > >> + } > > > > > > LoPAPR 15.3 seems to suggest that ibm,associativity-reference-points is > > > required alongside ibm,associativity for each DT node it appears in, > > > and since we hardcode "Form 1" affinity it should be done similarly as > > > the entry we place in the top-level DT node. > > > > > > Hm, okay, I'll add it. There is a question to Bharata - why do we have 4s > > in spapr_create_fdt_skel()'s refpoints? Just a random pick? > > > > vec5[5]==0x80 means we are doing "Form1" and these 4s are far/near distances? > > > > > > >> + > > >> /* Build the interrupt-map, this must matches what is done > > >> * in pci_spapr_map_irq > > >> */ > > >> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h > > >> index 5adc603..53c4b2d 100644 > > >> --- a/include/hw/pci-host/spapr.h > > >> +++ b/include/hw/pci-host/spapr.h > > >> @@ -75,6 +75,8 @@ struct sPAPRPHBState { > > >> bool ddw_enabled; > > >> uint64_t page_size_mask; > > >> uint64_t dma64_win_addr; > > >> + > > >> + uint32_t numa_node; > > >> }; > > >> > > >> #define SPAPR_PCI_MAX_INDEX 255 > > >> > > > > > > > >
On Wed, Jul 27, 2016 at 06:03:38PM +1000, Alexey Kardashevskiy wrote: > This adds a numa id property to a PHB to allow linking passed PCI device > to CPU/memory. It is up to the management stack to do CPU/memory pinning > to the node with the actual PCI device. > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> I've applied this to ppc-for-2.8, renaming the property to "numa_node" to match the similar option for pxb. > --- > hw/ppc/spapr_pci.c | 13 +++++++++++++ > include/hw/pci-host/spapr.h | 2 ++ > 2 files changed, 15 insertions(+) > > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > index 949c44f..af5394a 100644 > --- a/hw/ppc/spapr_pci.c > +++ b/hw/ppc/spapr_pci.c > @@ -47,6 +47,7 @@ > #include "sysemu/device_tree.h" > #include "sysemu/kvm.h" > #include "sysemu/hostmem.h" > +#include "sysemu/numa.h" > > #include "hw/vfio/vfio.h" > > @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = { > DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true), > DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask, > (1ULL << 12) | (1ULL << 16)), > + DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1), > DEFINE_PROP_END_OF_LIST(), > }; > > @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > cpu_to_be32(1), > cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) > }; > + uint32_t associativity[] = {cpu_to_be32(0x4), > + cpu_to_be32(0x0), > + cpu_to_be32(0x0), > + cpu_to_be32(0x0), > + cpu_to_be32(phb->numa_node)}; > sPAPRTCETable *tcet; > PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; > sPAPRFDT s_fdt; > @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > &ddw_extensions, sizeof(ddw_extensions))); > } > > + /* Advertise NUMA via ibm,associativity */ > + if (nb_numa_nodes > 1) { > + _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity, > + sizeof(associativity))); > + } > + > /* Build the interrupt-map, this must matches what is done > * in pci_spapr_map_irq > */ > diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h > index 5adc603..53c4b2d 100644 > --- a/include/hw/pci-host/spapr.h > +++ b/include/hw/pci-host/spapr.h > @@ -75,6 +75,8 @@ struct sPAPRPHBState { > bool ddw_enabled; > uint64_t page_size_mask; > uint64_t dma64_win_addr; > + > + uint32_t numa_node; > }; > > #define SPAPR_PCI_MAX_INDEX 255
On Thu, Sep 22, 2016 at 02:49:34PM +1000, David Gibson wrote: > On Wed, Sep 14, 2016 at 07:03:50AM -0500, Michael Roth wrote: > > Quoting Alexey Kardashevskiy (2016-09-14 04:39:10) > > > On 14/09/16 09:29, Michael Roth wrote: > > > > Quoting Alexey Kardashevskiy (2016-07-27 03:03:38) > > > >> This adds a numa id property to a PHB to allow linking passed PCI device > > > >> to CPU/memory. It is up to the management stack to do CPU/memory pinning > > > >> to the node with the actual PCI device. > > > > > > > > It looks like x86 relies on PCIBus->numa_node() method (via > > > > pci_bus_numa_node()) to encode similar PCIBus affinities > > > > into ACPI tables, and currently exposes it via > > > > -device pci-[-express]-expander-bus,numa_node=X. > > > > > > > > > > > > Well, until we allow DMA windows per PCI bus (not per PHB as it is now), > > > this won't make much sense for us (unless I am missing something here). > > > > I didn't consider that it's not a bus-level setting; I think > > you're right that re-using the interface to both store/retrieve doesn't > > make much sense in that case. > > > > My thought that was that since pci_bus_numa_node() could in theory come > > to be relied upon by common PCI code, that we should use it as well. But > > even if it doesn't make sense for us to use it, wouldn't it make sense to > > still set PCIBus->numa_node (in addition to the PHB-wide value) in the > > off-chance that common code does come to rely on > > pci_bus_numa_node()? > > Yes, it would be a good idea to set the PCIBus node value to inherit > the one that's set for the host bridge, just in case any generic code > looks at it in future. But that can be a followup patch, I've applied this to ppc-for-2.8 now.
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 949c44f..af5394a 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -47,6 +47,7 @@ #include "sysemu/device_tree.h" #include "sysemu/kvm.h" #include "sysemu/hostmem.h" +#include "sysemu/numa.h" #include "hw/vfio/vfio.h" @@ -1544,6 +1545,7 @@ static Property spapr_phb_properties[] = { DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true), DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask, (1ULL << 12) | (1ULL << 16)), + DEFINE_PROP_UINT32("node", sPAPRPHBState, numa_node, -1), DEFINE_PROP_END_OF_LIST(), }; @@ -1805,6 +1807,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, cpu_to_be32(1), cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) }; + uint32_t associativity[] = {cpu_to_be32(0x4), + cpu_to_be32(0x0), + cpu_to_be32(0x0), + cpu_to_be32(0x0), + cpu_to_be32(phb->numa_node)}; sPAPRTCETable *tcet; PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; sPAPRFDT s_fdt; @@ -1837,6 +1844,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, &ddw_extensions, sizeof(ddw_extensions))); } + /* Advertise NUMA via ibm,associativity */ + if (nb_numa_nodes > 1) { + _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity, + sizeof(associativity))); + } + /* Build the interrupt-map, this must matches what is done * in pci_spapr_map_irq */ diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h index 5adc603..53c4b2d 100644 --- a/include/hw/pci-host/spapr.h +++ b/include/hw/pci-host/spapr.h @@ -75,6 +75,8 @@ struct sPAPRPHBState { bool ddw_enabled; uint64_t page_size_mask; uint64_t dma64_win_addr; + + uint32_t numa_node; }; #define SPAPR_PCI_MAX_INDEX 255
This adds a numa id property to a PHB to allow linking passed PCI device to CPU/memory. It is up to the management stack to do CPU/memory pinning to the node with the actual PCI device. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> --- hw/ppc/spapr_pci.c | 13 +++++++++++++ include/hw/pci-host/spapr.h | 2 ++ 2 files changed, 15 insertions(+)