Message ID: 1526801333-30613-4-git-send-email-whois.zihan.yang@gmail.com
State: New
Series: pci_expander_brdige: Put pxb host bridge into separate pci domain
On 05/20/2018 10:28 AM, Zihan Yang wrote: > Currently only q35 host bridge us allocated space in MCFG table. To put pxb host > into sepratate pci domain, each of them should have its own configuration space > int MCFG table > > Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com> > --- > hw/i386/acpi-build.c | 83 +++++++++++++++++++++++++++++++++++++++------------- > 1 file changed, 62 insertions(+), 21 deletions(-) > > diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c > index 9bc6d97..808d815 100644 > --- a/hw/i386/acpi-build.c > +++ b/hw/i386/acpi-build.c > @@ -89,6 +89,7 @@ > typedef struct AcpiMcfgInfo { > uint64_t mcfg_base; > uint32_t mcfg_size; > + struct AcpiMcfgInfo *next; > } AcpiMcfgInfo; > > typedef struct AcpiPmInfo { > @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info) > { > AcpiTableMcfg *mcfg; > const char *sig; > - int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]); > + int len, count = 0; > + AcpiMcfgInfo *cfg = info; > > + while (cfg) { > + ++count; > + cfg = cfg->next; > + } > + len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]); > mcfg = acpi_data_push(table_data, len); > - mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base); > - /* Only a single allocation so no need to play with segments */ > - mcfg->allocation[0].pci_segment = cpu_to_le16(0); > - mcfg->allocation[0].start_bus_number = 0; > - mcfg->allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->mcfg_size - 1); > > /* MCFG is used for ECAM which can be enabled or disabled by guest. > * To avoid table size changes (which create migration issues), > @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info) > } else { > sig = "MCFG"; > } > + > + count = 0; > + while (info) { > + mcfg[count].allocation[0].address = cpu_to_le64(info->mcfg_base); > + /* Only a single allocation so no need to play with segments */ > + mcfg[count].allocation[0].pci_segment = cpu_to_le16(count); > + mcfg[count].allocation[0].start_bus_number = 0; > + mcfg[count++].allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->mcfg_size - 1); An interesting point is if we want to limit the MMFCG size for PXBs, as we may not be interested to use all the buses in a specific domain. For each bus we require some address space that remains unused. 
> + info = info->next; > + } > + > build_header(linker, table_data, (void *)mcfg, sig, len, 1, NULL, NULL); > } > > @@ -2602,26 +2615,52 @@ struct AcpiBuildState { > MemoryRegion *linker_mr; > } AcpiBuildState; > > -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg) > +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg) > +{ > + AcpiMcfgInfo *tmp; > + while (mcfg) { > + tmp = mcfg->next; > + g_free(mcfg); > + mcfg = tmp; > + } > +} > + > +static AcpiMcfgInfo *acpi_get_mcfg(void) > { > Object *pci_host; > QObject *o; > + AcpiMcfgInfo *head = NULL, *tail, *mcfg; > > pci_host = acpi_get_i386_pci_host(); > g_assert(pci_host); > > - o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, NULL); > - if (!o) { > - return false; > + while (pci_host) { > + mcfg = g_new0(AcpiMcfgInfo, 1); > + mcfg->next = NULL; > + if (!head) { > + tail = head = mcfg; > + } else { > + tail->next = mcfg; > + tail = mcfg; > + } > + > + o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, NULL); > + if (!o) { > + cleanup_mcfg(head); > + g_free(mcfg); > + return NULL; > + } > + mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o)); > + qobject_unref(o); > + > + o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_SIZE, NULL); I'll let Igor to comment on the APCI bits, but the idea of querying each PCI host for the MMFCG details and put it into a different table sounds good to me. [Adding Laszlo for his insights] Looking forward to see the SeaBIOS patches :) Thanks! Marcel > + assert(o); > + mcfg->mcfg_size = qnum_get_uint(qobject_to(QNum, o)); > + qobject_unref(o); > + > + pci_host = OBJECT(QTAILQ_NEXT(PCI_HOST_BRIDGE(pci_host), next)); > } > - mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o)); > - qobject_unref(o); > - > - o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_SIZE, NULL); > - assert(o); > - mcfg->mcfg_size = qnum_get_uint(qobject_to(QNum, o)); > - qobject_unref(o); > - return true; > + return head; > } > > static > @@ -2633,7 +2672,7 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) > unsigned facs, dsdt, rsdt, fadt; > AcpiPmInfo pm; > AcpiMiscInfo misc; > - AcpiMcfgInfo mcfg; > + AcpiMcfgInfo *mcfg; > Range pci_hole, pci_hole64; > uint8_t *u; > size_t aml_len = 0; > @@ -2714,10 +2753,12 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) > build_slit(tables_blob, tables->linker); > } > } > - if (acpi_get_mcfg(&mcfg)) { > + if ((mcfg = acpi_get_mcfg()) != NULL) { > acpi_add_table(table_offsets, tables_blob); > - build_mcfg_q35(tables_blob, tables->linker, &mcfg); > + build_mcfg_q35(tables_blob, tables->linker, mcfg); > } > + cleanup_mcfg(mcfg); > + > if (x86_iommu_get_default()) { > IommuType IOMMUType = x86_iommu_get_type(); > if (IOMMUType == TYPE_AMD) {
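For reference, the layout the quoted loop fills in is fixed by the ACPI / PCI Firmware specifications: a single 36-byte ACPI table header, 8 reserved bytes, and then a flat array of 16-byte per-segment allocation entries, one per ECAM window. A minimal sketch of that layout follows; the struct and helper names are descriptive placeholders, not QEMU's own identifiers, and the length helper merely mirrors the computation in the quoted patch.

#include <stddef.h>
#include <stdint.h>

/* One per-segment entry in the ACPI MCFG table (16 bytes each). */
struct mcfg_allocation {
    uint64_t base_address;      /* ECAM base for this PCI segment (domain) */
    uint16_t pci_segment;       /* segment number, 0 for the default host  */
    uint8_t  start_bus_number;  /* first bus decoded by this ECAM window   */
    uint8_t  end_bus_number;    /* last bus decoded by this ECAM window    */
    uint32_t reserved;
} __attribute__((packed));

/* MCFG = 36-byte standard ACPI header + 8 reserved bytes + N entries. */
static inline size_t mcfg_table_len(size_t num_segments)
{
    return 36 + 8 + num_segments * sizeof(struct mcfg_allocation);
}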
> An interesting point is if we want to limit the MMFCG size for PXBs, as we may not be
> interested to use all the buses in a specific domain.

OK, perhaps providing an option for the user to specify the desired bus numbers?

> For each bus we require some address space that remains unused.

Does this mean we should reserve some slots in a bus, or reduce
the number of buses used in each domain?

> [Adding Laszlo for his insights]
> Looking forward to see the SeaBIOS patches :)

I appreciate your reviews, I'll write the rest part as soon as possible.

Thanks
On 05/21/18 13:53, Marcel Apfelbaum wrote: > > > On 05/20/2018 10:28 AM, Zihan Yang wrote: >> Currently only q35 host bridge us allocated space in MCFG table. To >> put pxb host >> into sepratate pci domain, each of them should have its own >> configuration space >> int MCFG table >> >> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com> >> --- >> hw/i386/acpi-build.c | 83 >> +++++++++++++++++++++++++++++++++++++++------------- >> 1 file changed, 62 insertions(+), 21 deletions(-) >> >> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c >> index 9bc6d97..808d815 100644 >> --- a/hw/i386/acpi-build.c >> +++ b/hw/i386/acpi-build.c >> @@ -89,6 +89,7 @@ >> typedef struct AcpiMcfgInfo { >> uint64_t mcfg_base; >> uint32_t mcfg_size; >> + struct AcpiMcfgInfo *next; >> } AcpiMcfgInfo; >> typedef struct AcpiPmInfo { >> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker >> *linker, AcpiMcfgInfo *info) >> { >> AcpiTableMcfg *mcfg; >> const char *sig; >> - int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]); >> + int len, count = 0; >> + AcpiMcfgInfo *cfg = info; >> + while (cfg) { >> + ++count; >> + cfg = cfg->next; >> + } >> + len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]); >> mcfg = acpi_data_push(table_data, len); >> - mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base); >> - /* Only a single allocation so no need to play with segments */ >> - mcfg->allocation[0].pci_segment = cpu_to_le16(0); >> - mcfg->allocation[0].start_bus_number = 0; >> - mcfg->allocation[0].end_bus_number = >> PCIE_MMCFG_BUS(info->mcfg_size - 1); >> /* MCFG is used for ECAM which can be enabled or disabled by >> guest. >> * To avoid table size changes (which create migration issues), >> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker >> *linker, AcpiMcfgInfo *info) >> } else { >> sig = "MCFG"; >> } >> + >> + count = 0; >> + while (info) { >> + mcfg[count].allocation[0].address = >> cpu_to_le64(info->mcfg_base); >> + /* Only a single allocation so no need to play with segments */ >> + mcfg[count].allocation[0].pci_segment = cpu_to_le16(count); >> + mcfg[count].allocation[0].start_bus_number = 0; >> + mcfg[count++].allocation[0].end_bus_number = >> PCIE_MMCFG_BUS(info->mcfg_size - 1); > > An interesting point is if we want to limit the MMFCG size for PXBs, as > we may not be > interested to use all the buses in a specific domain. For each bus we > require some > address space that remains unused. 
> >> + info = info->next; >> + } >> + >> build_header(linker, table_data, (void *)mcfg, sig, len, 1, >> NULL, NULL); >> } >> @@ -2602,26 +2615,52 @@ struct AcpiBuildState { >> MemoryRegion *linker_mr; >> } AcpiBuildState; >> -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg) >> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg) >> +{ >> + AcpiMcfgInfo *tmp; >> + while (mcfg) { >> + tmp = mcfg->next; >> + g_free(mcfg); >> + mcfg = tmp; >> + } >> +} >> + >> +static AcpiMcfgInfo *acpi_get_mcfg(void) >> { >> Object *pci_host; >> QObject *o; >> + AcpiMcfgInfo *head = NULL, *tail, *mcfg; >> pci_host = acpi_get_i386_pci_host(); >> g_assert(pci_host); >> - o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, >> NULL); >> - if (!o) { >> - return false; >> + while (pci_host) { >> + mcfg = g_new0(AcpiMcfgInfo, 1); >> + mcfg->next = NULL; >> + if (!head) { >> + tail = head = mcfg; >> + } else { >> + tail->next = mcfg; >> + tail = mcfg; >> + } >> + >> + o = object_property_get_qobject(pci_host, >> PCIE_HOST_MCFG_BASE, NULL); >> + if (!o) { >> + cleanup_mcfg(head); >> + g_free(mcfg); >> + return NULL; >> + } >> + mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o)); >> + qobject_unref(o); >> + >> + o = object_property_get_qobject(pci_host, >> PCIE_HOST_MCFG_SIZE, NULL); > > I'll let Igor to comment on the APCI bits, but the idea of querying each > PCI host > for the MMFCG details and put it into a different table sounds good to me. > > [Adding Laszlo for his insights] Thanks for the CC -- I don't have many (positive) insights here to offer, I'm afraid. First, the patch set (including the blurb) doesn't seem to explain *why* multiple host bridges / ECAM areas are a good idea. Second, supporting such would take very intrusive patches / large feature development for OVMF (and that's not just for OvmfPkg and ArmVirtPkg platform code, but also core MdeModulePkg code). Obviously I'm not saying this feature should not be implemented for QEMU + SeaBIOS, just that don't expect edk2 support for it anytime soon. Without a convincing use case, even the review impact seems hard to justify. Thanks, Laszlo
On 05/22/2018 09:03 AM, Zihan Yang wrote:
> > An interesting point is if we want to limit the MMFCG size for PXBs,
> > as we may not be interested to use all the buses in a specific domain.
>
> OK, perhaps providing an option for the user to specify the desired
> bus numbers?

Right, specifying the number of buses that can be used (e.g: max_busses)
will decrease a lot the amount of address space we have to reserve.

> > For each bus we require some address space that remains unused.
>
> Does this mean we should reserve some slots in a bus, or reduce
> the number of buses used in each domain?

Limit the number of buses per PCI domain (excluding PCI domain 0, of course)

> > [Adding Laszlo for his insights]
> > Looking forward to see the SeaBIOS patches :)
>
> I appreciate your reviews, I'll write the rest part as soon as possible.

Thanks,
Marcel

> Thanks
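The address-space cost referred to here is easy to quantify: every bus number covered by an ECAM window costs 32 devices x 8 functions x 4 KiB = 1 MiB of MMCFG, whether or not anything sits on that bus. A small sketch, assuming a hypothetical max_busses-style cap per pxb-pcie domain (the property name is only illustrative):

#include <stdint.h>

/* ECAM cost per bus number: 32 devices x 8 functions x 4 KiB = 1 MiB. */
#define ECAM_BYTES_PER_BUS  (32u * 8u * 4096u)

/* Address space a pxb-pcie domain would have to reserve if it were
 * capped at 'max_busses' bus numbers (hypothetical property).         */
static inline uint64_t pxb_ecam_reservation(unsigned max_busses)
{
    return (uint64_t)max_busses * ECAM_BYTES_PER_BUS;
}

/* e.g. max_busses = 8   ->   8 MiB of MMCFG per extra domain,
 *      max_busses = 256 -> 256 MiB, a full q35-style window.          */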
Hi Laszlo, On 05/22/2018 12:52 PM, Laszlo Ersek wrote: > On 05/21/18 13:53, Marcel Apfelbaum wrote: >> >> On 05/20/2018 10:28 AM, Zihan Yang wrote: >>> Currently only q35 host bridge us allocated space in MCFG table. To >>> put pxb host >>> into sepratate pci domain, each of them should have its own >>> configuration space >>> int MCFG table >>> >>> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com> >>> --- >>> hw/i386/acpi-build.c | 83 >>> +++++++++++++++++++++++++++++++++++++++------------- >>> 1 file changed, 62 insertions(+), 21 deletions(-) >>> >>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c >>> index 9bc6d97..808d815 100644 >>> --- a/hw/i386/acpi-build.c >>> +++ b/hw/i386/acpi-build.c >>> @@ -89,6 +89,7 @@ >>> typedef struct AcpiMcfgInfo { >>> uint64_t mcfg_base; >>> uint32_t mcfg_size; >>> + struct AcpiMcfgInfo *next; >>> } AcpiMcfgInfo; >>> typedef struct AcpiPmInfo { >>> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker >>> *linker, AcpiMcfgInfo *info) >>> { >>> AcpiTableMcfg *mcfg; >>> const char *sig; >>> - int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]); >>> + int len, count = 0; >>> + AcpiMcfgInfo *cfg = info; >>> + while (cfg) { >>> + ++count; >>> + cfg = cfg->next; >>> + } >>> + len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]); >>> mcfg = acpi_data_push(table_data, len); >>> - mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base); >>> - /* Only a single allocation so no need to play with segments */ >>> - mcfg->allocation[0].pci_segment = cpu_to_le16(0); >>> - mcfg->allocation[0].start_bus_number = 0; >>> - mcfg->allocation[0].end_bus_number = >>> PCIE_MMCFG_BUS(info->mcfg_size - 1); >>> /* MCFG is used for ECAM which can be enabled or disabled by >>> guest. >>> * To avoid table size changes (which create migration issues), >>> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker >>> *linker, AcpiMcfgInfo *info) >>> } else { >>> sig = "MCFG"; >>> } >>> + >>> + count = 0; >>> + while (info) { >>> + mcfg[count].allocation[0].address = >>> cpu_to_le64(info->mcfg_base); >>> + /* Only a single allocation so no need to play with segments */ >>> + mcfg[count].allocation[0].pci_segment = cpu_to_le16(count); >>> + mcfg[count].allocation[0].start_bus_number = 0; >>> + mcfg[count++].allocation[0].end_bus_number = >>> PCIE_MMCFG_BUS(info->mcfg_size - 1); >> An interesting point is if we want to limit the MMFCG size for PXBs, as >> we may not be >> interested to use all the buses in a specific domain. For each bus we >> require some >> address space that remains unused. 
>> >>> + info = info->next; >>> + } >>> + >>> build_header(linker, table_data, (void *)mcfg, sig, len, 1, >>> NULL, NULL); >>> } >>> @@ -2602,26 +2615,52 @@ struct AcpiBuildState { >>> MemoryRegion *linker_mr; >>> } AcpiBuildState; >>> -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg) >>> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg) >>> +{ >>> + AcpiMcfgInfo *tmp; >>> + while (mcfg) { >>> + tmp = mcfg->next; >>> + g_free(mcfg); >>> + mcfg = tmp; >>> + } >>> +} >>> + >>> +static AcpiMcfgInfo *acpi_get_mcfg(void) >>> { >>> Object *pci_host; >>> QObject *o; >>> + AcpiMcfgInfo *head = NULL, *tail, *mcfg; >>> pci_host = acpi_get_i386_pci_host(); >>> g_assert(pci_host); >>> - o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, >>> NULL); >>> - if (!o) { >>> - return false; >>> + while (pci_host) { >>> + mcfg = g_new0(AcpiMcfgInfo, 1); >>> + mcfg->next = NULL; >>> + if (!head) { >>> + tail = head = mcfg; >>> + } else { >>> + tail->next = mcfg; >>> + tail = mcfg; >>> + } >>> + >>> + o = object_property_get_qobject(pci_host, >>> PCIE_HOST_MCFG_BASE, NULL); >>> + if (!o) { >>> + cleanup_mcfg(head); >>> + g_free(mcfg); >>> + return NULL; >>> + } >>> + mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o)); >>> + qobject_unref(o); >>> + >>> + o = object_property_get_qobject(pci_host, >>> PCIE_HOST_MCFG_SIZE, NULL); >> I'll let Igor to comment on the APCI bits, but the idea of querying each >> PCI host >> for the MMFCG details and put it into a different table sounds good to me. >> >> [Adding Laszlo for his insights] > Thanks for the CC -- I don't have many (positive) insights here to > offer, I'm afraid. First, the patch set (including the blurb) doesn't > seem to explain *why* multiple host bridges / ECAM areas are a good > idea. The issue we want to solve is the hard limitation of 256 PCIe devices that can be used in a Q35 machine. We can use "PCI functions" to increase the number, but then we loose the hot-plugging granularity. Currently pxb-pcie host bridges share the same PCI domain (0) bus numbers, so adding more of them will not result in more usable devices (each PCIe device resides on its own bus), but putting each of them on a different domain removes that limitation. Another possible use (there is a good chance I am wrong, adding Alex to correct me), may be the "modeling" of a multi-function assigned device. Currently all the functions of an assigneddevice will be placed on different PCIe Root Ports "loosing" the connection between them. However, if we put all the functions of the same assigned device in the same PCI domain we may be able to "re-connect" them. > Second, supporting such would take very intrusive patches / large > feature development for OVMF (and that's not just for OvmfPkg and > ArmVirtPkg platform code, but also core MdeModulePkg code). > > Obviously I'm not saying this feature should not be implemented for QEMU > + SeaBIOS, just that don't expect edk2 support for it anytime soon. > Without a convincing use case, even the review impact seems hard to justify. I understand, thank you for the feedback, the point was to be sure we don't decide something that *can't be done* in OVMF. Thanks, Marcel > Thanks, > Laszlo
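As background for the 256-device limit discussed above: ECAM encodes bus, device and function numbers in fixed bit fields of the config-space address, so a single window (and therefore a single PCI segment) can never address more than 256 buses, and each hot-pluggable PCIe endpoint behind a root port consumes one of them. A separate segment gets its own ECAM base and a fresh 0-255 bus range. A minimal sketch of the standard encoding (names are illustrative):

#include <stdint.h>

/* ECAM config-space address for (segment, bus, device, function, offset).
 * Bus numbers only have to be unique within a segment, which is why a
 * per-pxb PCI domain lifts the global 256-bus limit.                      */
static inline uint64_t ecam_addr(uint64_t segment_ecam_base,
                                 uint8_t bus, uint8_t dev, uint8_t fn,
                                 uint16_t offset)
{
    return segment_ecam_base
         | ((uint64_t)bus << 20)            /* up to 256 buses per window */
         | ((uint64_t)(dev & 0x1f) << 15)   /* 32 devices per bus         */
         | ((uint64_t)(fn  & 0x07) << 12)   /* 8 functions per device     */
         | (offset & 0xfff);                /* 4 KiB config space each    */
}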
On 05/22/18 21:01, Marcel Apfelbaum wrote: > Hi Laszlo, > > On 05/22/2018 12:52 PM, Laszlo Ersek wrote: >> On 05/21/18 13:53, Marcel Apfelbaum wrote: >>> >>> On 05/20/2018 10:28 AM, Zihan Yang wrote: >>>> Currently only q35 host bridge us allocated space in MCFG table. To >>>> put pxb host >>>> into sepratate pci domain, each of them should have its own >>>> configuration space >>>> int MCFG table >>>> >>>> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com> >>>> --- >>>> hw/i386/acpi-build.c | 83 >>>> +++++++++++++++++++++++++++++++++++++++------------- >>>> 1 file changed, 62 insertions(+), 21 deletions(-) >>>> >>>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c >>>> index 9bc6d97..808d815 100644 >>>> --- a/hw/i386/acpi-build.c >>>> +++ b/hw/i386/acpi-build.c >>>> @@ -89,6 +89,7 @@ >>>> typedef struct AcpiMcfgInfo { >>>> uint64_t mcfg_base; >>>> uint32_t mcfg_size; >>>> + struct AcpiMcfgInfo *next; >>>> } AcpiMcfgInfo; >>>> typedef struct AcpiPmInfo { >>>> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker >>>> *linker, AcpiMcfgInfo *info) >>>> { >>>> AcpiTableMcfg *mcfg; >>>> const char *sig; >>>> - int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]); >>>> + int len, count = 0; >>>> + AcpiMcfgInfo *cfg = info; >>>> + while (cfg) { >>>> + ++count; >>>> + cfg = cfg->next; >>>> + } >>>> + len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]); >>>> mcfg = acpi_data_push(table_data, len); >>>> - mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base); >>>> - /* Only a single allocation so no need to play with segments */ >>>> - mcfg->allocation[0].pci_segment = cpu_to_le16(0); >>>> - mcfg->allocation[0].start_bus_number = 0; >>>> - mcfg->allocation[0].end_bus_number = >>>> PCIE_MMCFG_BUS(info->mcfg_size - 1); >>>> /* MCFG is used for ECAM which can be enabled or disabled by >>>> guest. >>>> * To avoid table size changes (which create migration issues), >>>> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker >>>> *linker, AcpiMcfgInfo *info) >>>> } else { >>>> sig = "MCFG"; >>>> } >>>> + >>>> + count = 0; >>>> + while (info) { >>>> + mcfg[count].allocation[0].address = >>>> cpu_to_le64(info->mcfg_base); >>>> + /* Only a single allocation so no need to play with >>>> segments */ >>>> + mcfg[count].allocation[0].pci_segment = cpu_to_le16(count); >>>> + mcfg[count].allocation[0].start_bus_number = 0; >>>> + mcfg[count++].allocation[0].end_bus_number = >>>> PCIE_MMCFG_BUS(info->mcfg_size - 1); >>> An interesting point is if we want to limit the MMFCG size for PXBs, as >>> we may not be >>> interested to use all the buses in a specific domain. For each bus we >>> require some >>> address space that remains unused. 
>>> >>>> + info = info->next; >>>> + } >>>> + >>>> build_header(linker, table_data, (void *)mcfg, sig, len, 1, >>>> NULL, NULL); >>>> } >>>> @@ -2602,26 +2615,52 @@ struct AcpiBuildState { >>>> MemoryRegion *linker_mr; >>>> } AcpiBuildState; >>>> -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg) >>>> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg) >>>> +{ >>>> + AcpiMcfgInfo *tmp; >>>> + while (mcfg) { >>>> + tmp = mcfg->next; >>>> + g_free(mcfg); >>>> + mcfg = tmp; >>>> + } >>>> +} >>>> + >>>> +static AcpiMcfgInfo *acpi_get_mcfg(void) >>>> { >>>> Object *pci_host; >>>> QObject *o; >>>> + AcpiMcfgInfo *head = NULL, *tail, *mcfg; >>>> pci_host = acpi_get_i386_pci_host(); >>>> g_assert(pci_host); >>>> - o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, >>>> NULL); >>>> - if (!o) { >>>> - return false; >>>> + while (pci_host) { >>>> + mcfg = g_new0(AcpiMcfgInfo, 1); >>>> + mcfg->next = NULL; >>>> + if (!head) { >>>> + tail = head = mcfg; >>>> + } else { >>>> + tail->next = mcfg; >>>> + tail = mcfg; >>>> + } >>>> + >>>> + o = object_property_get_qobject(pci_host, >>>> PCIE_HOST_MCFG_BASE, NULL); >>>> + if (!o) { >>>> + cleanup_mcfg(head); >>>> + g_free(mcfg); >>>> + return NULL; >>>> + } >>>> + mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o)); >>>> + qobject_unref(o); >>>> + >>>> + o = object_property_get_qobject(pci_host, >>>> PCIE_HOST_MCFG_SIZE, NULL); >>> I'll let Igor to comment on the APCI bits, but the idea of querying each >>> PCI host >>> for the MMFCG details and put it into a different table sounds good >>> to me. >>> >>> [Adding Laszlo for his insights] >> Thanks for the CC -- I don't have many (positive) insights here to >> offer, I'm afraid. First, the patch set (including the blurb) doesn't >> seem to explain *why* multiple host bridges / ECAM areas are a good >> idea. > > The issue we want to solve is the hard limitation of 256 PCIe devices > that can be used in a Q35 machine. Implying that there are use cases for which ~256 PCIe devices aren't enough. I remain unconvinced until proved wrong :) Anyway, a significant source of waste comes from the restriction that we can only put 1 device (with up to 8 functions) on each non-root-complex PCI Express bus (such as root ports and downstream ports). This forfeits a huge portion of the ECAM area (about 31/32th) that we already have. Rather than spending more MMIO guest-phys address range on new discontiguous ECAM ranges, I'd prefer if we could look into ARI. I seem to recall from your earlier presentation that ARI could recover that lost address space (meaning both ECAM ranges and PCIe B/D/F address space). There are signs that the edk2 core supports ARI if the underlying platform supports it. (Which is not the case with multiple PCIe domains / multiple ECAM ranges.) ARI support could also help aarch64/virt. Eric (CC'd) has been working on raising the max PCIe bus count from *16* to 256 for aarch64/virt, and AFAIR one of the challenges there has been finding a good spot for the larger ECAM in guest-phys address space. Fiddling with such address maps is always a pain. Back to x86, the guest-phys address space is quite crowded too. The 32-bit PCI MMIO aperture (with the neighboring ECAM and platform MMIO areas such as LAPIC, IO-APIC, HPET, TPM, pflash, ...) is always a scarce resource. Plus, reaching agreement between OVMF and QEMU on the exact location of the 32-bit PCI MMIO aperture has always been a huge pain; so you'd likely shoot for 64-bit. 
But 64-bit is ill-partitioned and/or crowded too: first you have the cold-plugged >4GB DRAM (whose size the firmware can interrogate), then the hotplug DIMM area up to "etc/reserved-memory-end" (ditto), then the 64-bit PCI MMIO aperture (whose size the firmware decides itself, because QEMU cannot be asked about it presently). Placing the additional MMCFGs somewhere would need extending the total order between these things at the design level, more fw_cfg files, more sanity checks in platform code in the firmware, and, quite importantly, adding support to multiple discontiguous MMCFGs to core edk2 code. I don't know much about ARI but it already looks a whole lot more attractive to me. > We can use "PCI functions" to increase > the number, but then we loose the hot-plugging granularity. > > Currently pxb-pcie host bridges share the same PCI domain (0) > bus numbers, so adding more of them will not result in more usable > devices (each PCIe device resides on its own bus), > but putting each of them on a different domain removes > that limitation. > > Another possible use (there is a good chance I am wrong, adding Alex to > correct me), > may be the "modeling" of a multi-function assigned device. > Currently all the functions of an assigneddevice will be placed on > different PCIe > Root Ports "loosing" the connection between them. > > However, if we put all the functions of the same assigned device in the > same > PCI domain we may be able to "re-connect" them. OK -- why is that useful? What's the current practical problem with the splitting you describe? >> Second, supporting such would take very intrusive patches / large >> feature development for OVMF (and that's not just for OvmfPkg and >> ArmVirtPkg platform code, but also core MdeModulePkg code). >> >> Obviously I'm not saying this feature should not be implemented for QEMU >> + SeaBIOS, just that don't expect edk2 support for it anytime soon. >> Without a convincing use case, even the review impact seems hard to >> justify. > > I understand, thank you for the feedback, the point was to be sure > we don't decide something that *can't be done* in OVMF. Semantics :) Obviously, everything "can be done" in software; that's why it's *soft*ware. Who is going to write it in practice? It had taken years until the edk2 core gained a universal PciHostBridgeDxe driver with a well-defined platform customization interface, and that interface doesn't support multiple domains / segments. It had also taken years until the same edk2 core driver gained support for nonzero MMIO address translation (i.e. where the CPU view of system RAM addresses differs from the PCI device view of the same, for DMA purposes) -- and that's just a linear translation, not even about IOMMUs. The PCI core maintainers in edk2 are always very busy, and upstreaming such changes tends to take forever. Again, I'm not opposing this feature per se. I just see any potential support for it in edk2 very difficult. I remain unconvinced that ~256 PCIe devices aren't enough. Even assuming I'm proved plain wrong there, I'd still much prefer ARI (although I don't know much about it at this point), because (a) the edk2 core already shows signs that it intends to support ARI, (b) ARI would help aarch64/virt with nearly the same effort, (c) it seems that ARI would address the problem (namely, wasting MMCFG) head-on, rather than throwing yet more MMCFG at it. 
My apologies if I'm completely wrong about ARI (although that wouldn't change
anything about the difficulties that are foreseeable in edk2 for supporting
multiple host bridges).

Thanks,
Laszlo
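The "about 31/32" figure quoted above follows directly from the ECAM layout: a root port's secondary bus owns a full 1 MiB slice of the window, but a single device with at most 8 functions only ever uses 32 KiB of it. A quick check of the arithmetic:

#include <stdio.h>

int main(void)
{
    unsigned slice = 32 * 8 * 4096;  /* 1 MiB of ECAM per bus number     */
    unsigned used  =  1 * 8 * 4096;  /* one device, eight functions max  */

    /* prints: used 32768 of 1048576 bytes, 31/32 wasted */
    printf("used %u of %u bytes, %u/32 wasted\n",
           used, slice, (slice - used) / used);
    return 0;
}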
On Tue, May 22, 2018 at 09:51:47PM +0200, Laszlo Ersek wrote: > On 05/22/18 21:01, Marcel Apfelbaum wrote: > > Hi Laszlo, > > > > On 05/22/2018 12:52 PM, Laszlo Ersek wrote: > >> On 05/21/18 13:53, Marcel Apfelbaum wrote: > >>> > >>> On 05/20/2018 10:28 AM, Zihan Yang wrote: > >>>> Currently only q35 host bridge us allocated space in MCFG table. To > >>>> put pxb host > >>>> into sepratate pci domain, each of them should have its own > >>>> configuration space > >>>> int MCFG table > >>>> > >>>> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com> > >>>> --- > >>>> hw/i386/acpi-build.c | 83 > >>>> +++++++++++++++++++++++++++++++++++++++------------- > >>>> 1 file changed, 62 insertions(+), 21 deletions(-) > >>>> > >>>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c > >>>> index 9bc6d97..808d815 100644 > >>>> --- a/hw/i386/acpi-build.c > >>>> +++ b/hw/i386/acpi-build.c > >>>> @@ -89,6 +89,7 @@ > >>>> typedef struct AcpiMcfgInfo { > >>>> uint64_t mcfg_base; > >>>> uint32_t mcfg_size; > >>>> + struct AcpiMcfgInfo *next; > >>>> } AcpiMcfgInfo; > >>>> typedef struct AcpiPmInfo { > >>>> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker > >>>> *linker, AcpiMcfgInfo *info) > >>>> { > >>>> AcpiTableMcfg *mcfg; > >>>> const char *sig; > >>>> - int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]); > >>>> + int len, count = 0; > >>>> + AcpiMcfgInfo *cfg = info; > >>>> + while (cfg) { > >>>> + ++count; > >>>> + cfg = cfg->next; > >>>> + } > >>>> + len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]); > >>>> mcfg = acpi_data_push(table_data, len); > >>>> - mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base); > >>>> - /* Only a single allocation so no need to play with segments */ > >>>> - mcfg->allocation[0].pci_segment = cpu_to_le16(0); > >>>> - mcfg->allocation[0].start_bus_number = 0; > >>>> - mcfg->allocation[0].end_bus_number = > >>>> PCIE_MMCFG_BUS(info->mcfg_size - 1); > >>>> /* MCFG is used for ECAM which can be enabled or disabled by > >>>> guest. > >>>> * To avoid table size changes (which create migration issues), > >>>> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker > >>>> *linker, AcpiMcfgInfo *info) > >>>> } else { > >>>> sig = "MCFG"; > >>>> } > >>>> + > >>>> + count = 0; > >>>> + while (info) { > >>>> + mcfg[count].allocation[0].address = > >>>> cpu_to_le64(info->mcfg_base); > >>>> + /* Only a single allocation so no need to play with > >>>> segments */ > >>>> + mcfg[count].allocation[0].pci_segment = cpu_to_le16(count); > >>>> + mcfg[count].allocation[0].start_bus_number = 0; > >>>> + mcfg[count++].allocation[0].end_bus_number = > >>>> PCIE_MMCFG_BUS(info->mcfg_size - 1); > >>> An interesting point is if we want to limit the MMFCG size for PXBs, as > >>> we may not be > >>> interested to use all the buses in a specific domain. For each bus we > >>> require some > >>> address space that remains unused. 
> >>> > >>>> + info = info->next; > >>>> + } > >>>> + > >>>> build_header(linker, table_data, (void *)mcfg, sig, len, 1, > >>>> NULL, NULL); > >>>> } > >>>> @@ -2602,26 +2615,52 @@ struct AcpiBuildState { > >>>> MemoryRegion *linker_mr; > >>>> } AcpiBuildState; > >>>> -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg) > >>>> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg) > >>>> +{ > >>>> + AcpiMcfgInfo *tmp; > >>>> + while (mcfg) { > >>>> + tmp = mcfg->next; > >>>> + g_free(mcfg); > >>>> + mcfg = tmp; > >>>> + } > >>>> +} > >>>> + > >>>> +static AcpiMcfgInfo *acpi_get_mcfg(void) > >>>> { > >>>> Object *pci_host; > >>>> QObject *o; > >>>> + AcpiMcfgInfo *head = NULL, *tail, *mcfg; > >>>> pci_host = acpi_get_i386_pci_host(); > >>>> g_assert(pci_host); > >>>> - o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, > >>>> NULL); > >>>> - if (!o) { > >>>> - return false; > >>>> + while (pci_host) { > >>>> + mcfg = g_new0(AcpiMcfgInfo, 1); > >>>> + mcfg->next = NULL; > >>>> + if (!head) { > >>>> + tail = head = mcfg; > >>>> + } else { > >>>> + tail->next = mcfg; > >>>> + tail = mcfg; > >>>> + } > >>>> + > >>>> + o = object_property_get_qobject(pci_host, > >>>> PCIE_HOST_MCFG_BASE, NULL); > >>>> + if (!o) { > >>>> + cleanup_mcfg(head); > >>>> + g_free(mcfg); > >>>> + return NULL; > >>>> + } > >>>> + mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o)); > >>>> + qobject_unref(o); > >>>> + > >>>> + o = object_property_get_qobject(pci_host, > >>>> PCIE_HOST_MCFG_SIZE, NULL); > >>> I'll let Igor to comment on the APCI bits, but the idea of querying each > >>> PCI host > >>> for the MMFCG details and put it into a different table sounds good > >>> to me. > >>> > >>> [Adding Laszlo for his insights] > >> Thanks for the CC -- I don't have many (positive) insights here to > >> offer, I'm afraid. First, the patch set (including the blurb) doesn't > >> seem to explain *why* multiple host bridges / ECAM areas are a good > >> idea. > > > > The issue we want to solve is the hard limitation of 256 PCIe devices > > that can be used in a Q35 machine. > > Implying that there are use cases for which ~256 PCIe devices aren't > enough. I remain unconvinced until proved wrong :) > > Anyway, a significant source of waste comes from the restriction that we > can only put 1 device (with up to 8 functions) on each non-root-complex > PCI Express bus (such as root ports and downstream ports). This forfeits > a huge portion of the ECAM area (about 31/32th) that we already have. > Rather than spending more MMIO guest-phys address range on new > discontiguous ECAM ranges, I'd prefer if we could look into ARI. I seem > to recall from your earlier presentation that ARI could recover that > lost address space (meaning both ECAM ranges and PCIe B/D/F address space). Well ARI just adds more functions. I do think it's a useful feature but individual functions remain non-hotpluggable with the standard PCIe controller. > There are signs that the edk2 core supports ARI if the underlying > platform supports it. (Which is not the case with multiple PCIe domains > / multiple ECAM ranges.) > > ARI support could also help aarch64/virt. Eric (CC'd) has been working > on raising the max PCIe bus count from *16* to 256 for aarch64/virt, and > AFAIR one of the challenges there has been finding a good spot for the > larger ECAM in guest-phys address space. Fiddling with such address maps > is always a pain. > > Back to x86, the guest-phys address space is quite crowded too. 
The > 32-bit PCI MMIO aperture (with the neighboring ECAM and platform MMIO > areas such as LAPIC, IO-APIC, HPET, TPM, pflash, ...) is always a scarce > resource. Plus, reaching agreement between OVMF and QEMU on the exact > location of the 32-bit PCI MMIO aperture has always been a huge pain; so > you'd likely shoot for 64-bit. > > But 64-bit is ill-partitioned and/or crowded too: first you have the > cold-plugged >4GB DRAM (whose size the firmware can interrogate), then > the hotplug DIMM area up to "etc/reserved-memory-end" (ditto), then the > 64-bit PCI MMIO aperture (whose size the firmware decides itself, > because QEMU cannot be asked about it presently). Placing the additional > MMCFGs somewhere would need extending the total order between these > things at the design level, more fw_cfg files, more sanity checks in > platform code in the firmware, and, quite importantly, adding support to > multiple discontiguous MMCFGs to core edk2 code. In fact, last I looked the proposed patches left the layout completely to the firmware. So firmware can just select where it wants these things to be, and program hardware accordingly - there should not be a need for new fw cfg files. > I don't know much about ARI but it already looks a whole lot more > attractive to me. > > > We can use "PCI functions" to increase > > the number, but then we loose the hot-plugging granularity. > > > > Currently pxb-pcie host bridges share the same PCI domain (0) > > bus numbers, so adding more of them will not result in more usable > > devices (each PCIe device resides on its own bus), > > but putting each of them on a different domain removes > > that limitation. > > > > Another possible use (there is a good chance I am wrong, adding Alex to > > correct me), > > may be the "modeling" of a multi-function assigned device. > > Currently all the functions of an assigneddevice will be placed on > > different PCIe > > Root Ports "loosing" the connection between them. > > > > However, if we put all the functions of the same assigned device in the > > same > > PCI domain we may be able to "re-connect" them. > > OK -- why is that useful? What's the current practical problem with the > splitting you describe? > > >> Second, supporting such would take very intrusive patches / large > >> feature development for OVMF (and that's not just for OvmfPkg and > >> ArmVirtPkg platform code, but also core MdeModulePkg code). So there certainly exist hardware platforms with multiple pci root complexes. It's not a PV technology. So it's probably worth it to support in OVMF. The pxb hack I'm less convinced about. It was apparently designed to make pci support very easy for seabios, if it's not doing that for pcie then we should do something else. And certainly, the fact that it doesn't allow >256 devices is not a plus. > >> Obviously I'm not saying this feature should not be implemented for QEMU > >> + SeaBIOS, just that don't expect edk2 support for it anytime soon. > >> Without a convincing use case, even the review impact seems hard to > >> justify. > > > > I understand, thank you for the feedback, the point was to be sure > > we don't decide something that *can't be done* in OVMF. > > Semantics :) Obviously, everything "can be done" in software; that's why > it's *soft*ware. Who is going to write it in practice? It had taken > years until the edk2 core gained a universal PciHostBridgeDxe driver > with a well-defined platform customization interface, and that interface > doesn't support multiple domains / segments. 
It had also taken years > until the same edk2 core driver gained support for nonzero MMIO address > translation (i.e. where the CPU view of system RAM addresses differs > from the PCI device view of the same, for DMA purposes) -- and that's > just a linear translation, not even about IOMMUs. The PCI core > maintainers in edk2 are always very busy, and upstreaming such changes > tends to take forever. > > Again, I'm not opposing this feature per se. I just see any potential > support for it in edk2 very difficult. I remain unconvinced that ~256 > PCIe devices aren't enough. Even assuming I'm proved plain wrong there, > I'd still much prefer ARI (although I don't know much about it at this > point), because (a) the edk2 core already shows signs that it intends to > support ARI, (b) ARI would help aarch64/virt with nearly the same > effort, (c) it seems that ARI would address the problem (namely, wasting > MMCFG) head-on, rather than throwing yet more MMCFG at it. > > My apologies if I'm completely wrong about ARI (although that wouldn't > change anything about the difficulties that are foreseeable in edk2 for > supporting multiple host bridges). > > Thanks, > Laszlo It's not hard to think of a use-case where >256 devices are helpful, for example a nested virt scenario where each device is passed on to a different nested guest. But I think the main feature this is needed for is numa modeling. Guests seem to assume a numa node per PCI root, ergo we need more PCI roots.
On Tue, 22 May 2018 21:51:47 +0200 Laszlo Ersek <lersek@redhat.com> wrote: > On 05/22/18 21:01, Marcel Apfelbaum wrote: > > Hi Laszlo, > > > > On 05/22/2018 12:52 PM, Laszlo Ersek wrote: > >> On 05/21/18 13:53, Marcel Apfelbaum wrote: > >>> > >>> On 05/20/2018 10:28 AM, Zihan Yang wrote: > >>>> Currently only q35 host bridge us allocated space in MCFG table. To > >>>> put pxb host > >>>> into sepratate pci domain, each of them should have its own > >>>> configuration space > >>>> int MCFG table > >>>> > >>>> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com> > >>>> --- > >>>> hw/i386/acpi-build.c | 83 > >>>> +++++++++++++++++++++++++++++++++++++++------------- > >>>> 1 file changed, 62 insertions(+), 21 deletions(-) > >>>> > >>>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c > >>>> index 9bc6d97..808d815 100644 > >>>> --- a/hw/i386/acpi-build.c > >>>> +++ b/hw/i386/acpi-build.c > >>>> @@ -89,6 +89,7 @@ > >>>> typedef struct AcpiMcfgInfo { > >>>> uint64_t mcfg_base; > >>>> uint32_t mcfg_size; > >>>> + struct AcpiMcfgInfo *next; > >>>> } AcpiMcfgInfo; > >>>> typedef struct AcpiPmInfo { > >>>> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker > >>>> *linker, AcpiMcfgInfo *info) > >>>> { > >>>> AcpiTableMcfg *mcfg; > >>>> const char *sig; > >>>> - int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]); > >>>> + int len, count = 0; > >>>> + AcpiMcfgInfo *cfg = info; > >>>> + while (cfg) { > >>>> + ++count; > >>>> + cfg = cfg->next; > >>>> + } > >>>> + len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]); > >>>> mcfg = acpi_data_push(table_data, len); > >>>> - mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base); > >>>> - /* Only a single allocation so no need to play with segments */ > >>>> - mcfg->allocation[0].pci_segment = cpu_to_le16(0); > >>>> - mcfg->allocation[0].start_bus_number = 0; > >>>> - mcfg->allocation[0].end_bus_number = > >>>> PCIE_MMCFG_BUS(info->mcfg_size - 1); > >>>> /* MCFG is used for ECAM which can be enabled or disabled by > >>>> guest. > >>>> * To avoid table size changes (which create migration issues), > >>>> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker > >>>> *linker, AcpiMcfgInfo *info) > >>>> } else { > >>>> sig = "MCFG"; > >>>> } > >>>> + > >>>> + count = 0; > >>>> + while (info) { > >>>> + mcfg[count].allocation[0].address = > >>>> cpu_to_le64(info->mcfg_base); > >>>> + /* Only a single allocation so no need to play with > >>>> segments */ > >>>> + mcfg[count].allocation[0].pci_segment = cpu_to_le16(count); > >>>> + mcfg[count].allocation[0].start_bus_number = 0; > >>>> + mcfg[count++].allocation[0].end_bus_number = > >>>> PCIE_MMCFG_BUS(info->mcfg_size - 1); > >>> An interesting point is if we want to limit the MMFCG size for PXBs, as > >>> we may not be > >>> interested to use all the buses in a specific domain. For each bus we > >>> require some > >>> address space that remains unused. 
> >>> > >>>> + info = info->next; > >>>> + } > >>>> + > >>>> build_header(linker, table_data, (void *)mcfg, sig, len, 1, > >>>> NULL, NULL); > >>>> } > >>>> @@ -2602,26 +2615,52 @@ struct AcpiBuildState { > >>>> MemoryRegion *linker_mr; > >>>> } AcpiBuildState; > >>>> -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg) > >>>> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg) > >>>> +{ > >>>> + AcpiMcfgInfo *tmp; > >>>> + while (mcfg) { > >>>> + tmp = mcfg->next; > >>>> + g_free(mcfg); > >>>> + mcfg = tmp; > >>>> + } > >>>> +} > >>>> + > >>>> +static AcpiMcfgInfo *acpi_get_mcfg(void) > >>>> { > >>>> Object *pci_host; > >>>> QObject *o; > >>>> + AcpiMcfgInfo *head = NULL, *tail, *mcfg; > >>>> pci_host = acpi_get_i386_pci_host(); > >>>> g_assert(pci_host); > >>>> - o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, > >>>> NULL); > >>>> - if (!o) { > >>>> - return false; > >>>> + while (pci_host) { > >>>> + mcfg = g_new0(AcpiMcfgInfo, 1); > >>>> + mcfg->next = NULL; > >>>> + if (!head) { > >>>> + tail = head = mcfg; > >>>> + } else { > >>>> + tail->next = mcfg; > >>>> + tail = mcfg; > >>>> + } > >>>> + > >>>> + o = object_property_get_qobject(pci_host, > >>>> PCIE_HOST_MCFG_BASE, NULL); > >>>> + if (!o) { > >>>> + cleanup_mcfg(head); > >>>> + g_free(mcfg); > >>>> + return NULL; > >>>> + } > >>>> + mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o)); > >>>> + qobject_unref(o); > >>>> + > >>>> + o = object_property_get_qobject(pci_host, > >>>> PCIE_HOST_MCFG_SIZE, NULL); > >>> I'll let Igor to comment on the APCI bits, but the idea of querying each > >>> PCI host > >>> for the MMFCG details and put it into a different table sounds good > >>> to me. > >>> > >>> [Adding Laszlo for his insights] > >> Thanks for the CC -- I don't have many (positive) insights here to > >> offer, I'm afraid. First, the patch set (including the blurb) doesn't > >> seem to explain *why* multiple host bridges / ECAM areas are a good > >> idea. > > > > The issue we want to solve is the hard limitation of 256 PCIe devices > > that can be used in a Q35 machine. Isn't it interesting that conventional PCI can easily support so many more devices? Sorta makes you wonder why we care that virtual devices are express rather than conventional for a high density configuration... > Implying that there are use cases for which ~256 PCIe devices aren't > enough. I remain unconvinced until proved wrong :) > > Anyway, a significant source of waste comes from the restriction that we > can only put 1 device (with up to 8 functions) on each non-root-complex > PCI Express bus (such as root ports and downstream ports). This forfeits > a huge portion of the ECAM area (about 31/32th) that we already have. > Rather than spending more MMIO guest-phys address range on new > discontiguous ECAM ranges, I'd prefer if we could look into ARI. I seem > to recall from your earlier presentation that ARI could recover that > lost address space (meaning both ECAM ranges and PCIe B/D/F address space). How does ARI solve the hotplug problem? ARI is effectively multifunction on steroids, the ARI capability in each function points to the next function number so that we don't need to scan the entire devfn address space per bus (an inefficiency we don't care about when there are only 8 function). So yeah, we can fill an entire bus with devices with ARI, but they're all rooted at 00.0. > There are signs that the edk2 core supports ARI if the underlying > platform supports it. (Which is not the case with multiple PCIe domains > / multiple ECAM ranges.) 
It's pretty surprising to me that edk2 wouldn't already have support for multiple PCIe domains, they're really not *that* uncommon. Some architectures make almost gratuitous use of PCIe domains. I certainly know there were UEFI ia64 platforms with multiple domains. > ARI support could also help aarch64/virt. Eric (CC'd) has been working > on raising the max PCIe bus count from *16* to 256 for aarch64/virt, and > AFAIR one of the challenges there has been finding a good spot for the > larger ECAM in guest-phys address space. Fiddling with such address maps > is always a pain. > > Back to x86, the guest-phys address space is quite crowded too. The > 32-bit PCI MMIO aperture (with the neighboring ECAM and platform MMIO > areas such as LAPIC, IO-APIC, HPET, TPM, pflash, ...) is always a scarce > resource. Plus, reaching agreement between OVMF and QEMU on the exact > location of the 32-bit PCI MMIO aperture has always been a huge pain; so > you'd likely shoot for 64-bit. Why do we need to equally split 32-bit MMIO space between domains? Put it on domain 0 and require devices installed into non-zero domains have no 32-bit dependencies. > But 64-bit is ill-partitioned and/or crowded too: first you have the > cold-plugged >4GB DRAM (whose size the firmware can interrogate), then > the hotplug DIMM area up to "etc/reserved-memory-end" (ditto), then the > 64-bit PCI MMIO aperture (whose size the firmware decides itself, > because QEMU cannot be asked about it presently). Placing the additional > MMCFGs somewhere would need extending the total order between these > things at the design level, more fw_cfg files, more sanity checks in > platform code in the firmware, and, quite importantly, adding support to > multiple discontiguous MMCFGs to core edk2 code. Hmm, we're doing something wrong if we can't figure out some standards within QEMU for placing per domain 64-bit MMIO and MMCFG ranges. > I don't know much about ARI but it already looks a whole lot more > attractive to me. > > > We can use "PCI functions" to increase > > the number, but then we loose the hot-plugging granularity. > > > > Currently pxb-pcie host bridges share the same PCI domain (0) > > bus numbers, so adding more of them will not result in more usable > > devices (each PCIe device resides on its own bus), > > but putting each of them on a different domain removes > > that limitation. > > > > Another possible use (there is a good chance I am wrong, adding Alex to > > correct me), > > may be the "modeling" of a multi-function assigned device. > > Currently all the functions of an assigneddevice will be placed on > > different PCIe > > Root Ports "loosing" the connection between them. > > > > However, if we put all the functions of the same assigned device in the > > same > > PCI domain we may be able to "re-connect" them. > > OK -- why is that useful? What's the current practical problem with the > splitting you describe? There are very few cases where this is useful, things like associating USB companion devices or translating a guest bus reset to host bus reset when functions are split to separate virtual buses. That said, I have no idea how multiple domains plays a factor here. Regardless of how many PCIe domains a VM supports, we can easily use existing multifunction within a single domain to expose multifunction assigned devices in a way that better resembles the physical hardware. Multifunction PCIe endpoints cannot span PCIe domains. 
In fact, IOMMUs generally cannot span domains either, so we better also be thinking about at least a VT-d DHRD or vIOMMU per PCIe domain. > >> Second, supporting such would take very intrusive patches / large > >> feature development for OVMF (and that's not just for OvmfPkg and > >> ArmVirtPkg platform code, but also core MdeModulePkg code). > >> > >> Obviously I'm not saying this feature should not be implemented for QEMU > >> + SeaBIOS, just that don't expect edk2 support for it anytime soon. > >> Without a convincing use case, even the review impact seems hard to > >> justify. > > > > I understand, thank you for the feedback, the point was to be sure > > we don't decide something that *can't be done* in OVMF. > > Semantics :) Obviously, everything "can be done" in software; that's why > it's *soft*ware. Who is going to write it in practice? It had taken > years until the edk2 core gained a universal PciHostBridgeDxe driver > with a well-defined platform customization interface, and that interface > doesn't support multiple domains / segments. It had also taken years > until the same edk2 core driver gained support for nonzero MMIO address > translation (i.e. where the CPU view of system RAM addresses differs > from the PCI device view of the same, for DMA purposes) -- and that's > just a linear translation, not even about IOMMUs. The PCI core > maintainers in edk2 are always very busy, and upstreaming such changes > tends to take forever. Wow, that's surprising, ia64 was doing all of that on UEFI platforms a decade ago. > Again, I'm not opposing this feature per se. I just see any potential > support for it in edk2 very difficult. I remain unconvinced that ~256 > PCIe devices aren't enough. Even assuming I'm proved plain wrong there, > I'd still much prefer ARI (although I don't know much about it at this > point), because (a) the edk2 core already shows signs that it intends to > support ARI, (b) ARI would help aarch64/virt with nearly the same > effort, (c) it seems that ARI would address the problem (namely, wasting > MMCFG) head-on, rather than throwing yet more MMCFG at it. -ENOHOTPLUG From an assigned device perspective, this also implies that we're potentially adding a PCIe extended capability that may not exist on the physical device to the virtual view of the device. There's no guarantee that we can place that capability in the devices extended config space without stepping on registers that the hardware has hidden between the capabilities (for real, GPUs have registers in extended config space that aren't covered by capabilities) or the unlikely event that there's no more room in extended config space. Thanks, Alex
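For readers unfamiliar with ARI: per the PCIe spec, the ARI Extended Capability (ID 0x000E) carries a Next Function Number field in bits 15:8 of its Capability register, so enumeration follows a chain starting at function 0 instead of probing all 256 possible function numbers. A rough sketch of that walk; the two config-space accessors are placeholder stubs, not a real API.

#include <stdint.h>

#define PCI_EXT_CAP_ID_ARI  0x0e
#define ARI_CAP_REG         0x04                   /* 16-bit Capability register */
#define ARI_NEXT_FN(cap)    (((cap) >> 8) & 0xff)  /* Next Function Number, 15:8 */

/* Placeholder stubs standing in for real config-space access
 * (ECAM reads, ioctls, ...); they return 0 here just to compile.  */
static uint16_t cfg_read16(uint8_t bus, uint8_t fn, uint16_t off)
{
    (void)bus; (void)fn; (void)off;
    return 0;
}
static uint16_t find_ext_cap(uint8_t bus, uint8_t fn, uint16_t cap_id)
{
    (void)bus; (void)fn; (void)cap_id;
    return 0;                      /* offset of the capability, 0 if absent */
}

/* Enumerate an ARI device: start at function 0 and follow the
 * Next Function Number chain; a value of 0 terminates the list.   */
static unsigned count_ari_functions(uint8_t bus)
{
    unsigned count = 0;
    uint8_t fn = 0;

    do {
        uint16_t ari = find_ext_cap(bus, fn, PCI_EXT_CAP_ID_ARI);
        count++;
        if (!ari) {
            break;                 /* no ARI capability: chain ends */
        }
        fn = ARI_NEXT_FN(cfg_read16(bus, fn, ari + ARI_CAP_REG));
    } while (fn != 0);

    return count;
}

int main(void)
{
    return (int)count_ari_functions(1);   /* with the stubs above: 1 */
}

The chain is what makes dense population of a bus practical, but it also illustrates Alex's point: every ARI function hangs off function 0 of the same device, so individual functions are not hot-pluggable the way devices behind separate root ports are.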
On Tue, May 22, 2018 at 03:17:32PM -0600, Alex Williamson wrote: > On Tue, 22 May 2018 21:51:47 +0200 > Laszlo Ersek <lersek@redhat.com> wrote: > > > On 05/22/18 21:01, Marcel Apfelbaum wrote: > > > Hi Laszlo, > > > > > > On 05/22/2018 12:52 PM, Laszlo Ersek wrote: > > >> On 05/21/18 13:53, Marcel Apfelbaum wrote: > > >>> > > >>> On 05/20/2018 10:28 AM, Zihan Yang wrote: > > >>>> Currently only q35 host bridge us allocated space in MCFG table. To > > >>>> put pxb host > > >>>> into sepratate pci domain, each of them should have its own > > >>>> configuration space > > >>>> int MCFG table > > >>>> > > >>>> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com> > > >>>> --- > > >>>> hw/i386/acpi-build.c | 83 > > >>>> +++++++++++++++++++++++++++++++++++++++------------- > > >>>> 1 file changed, 62 insertions(+), 21 deletions(-) > > >>>> > > >>>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c > > >>>> index 9bc6d97..808d815 100644 > > >>>> --- a/hw/i386/acpi-build.c > > >>>> +++ b/hw/i386/acpi-build.c > > >>>> @@ -89,6 +89,7 @@ > > >>>> typedef struct AcpiMcfgInfo { > > >>>> uint64_t mcfg_base; > > >>>> uint32_t mcfg_size; > > >>>> + struct AcpiMcfgInfo *next; > > >>>> } AcpiMcfgInfo; > > >>>> typedef struct AcpiPmInfo { > > >>>> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker > > >>>> *linker, AcpiMcfgInfo *info) > > >>>> { > > >>>> AcpiTableMcfg *mcfg; > > >>>> const char *sig; > > >>>> - int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]); > > >>>> + int len, count = 0; > > >>>> + AcpiMcfgInfo *cfg = info; > > >>>> + while (cfg) { > > >>>> + ++count; > > >>>> + cfg = cfg->next; > > >>>> + } > > >>>> + len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]); > > >>>> mcfg = acpi_data_push(table_data, len); > > >>>> - mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base); > > >>>> - /* Only a single allocation so no need to play with segments */ > > >>>> - mcfg->allocation[0].pci_segment = cpu_to_le16(0); > > >>>> - mcfg->allocation[0].start_bus_number = 0; > > >>>> - mcfg->allocation[0].end_bus_number = > > >>>> PCIE_MMCFG_BUS(info->mcfg_size - 1); > > >>>> /* MCFG is used for ECAM which can be enabled or disabled by > > >>>> guest. > > >>>> * To avoid table size changes (which create migration issues), > > >>>> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker > > >>>> *linker, AcpiMcfgInfo *info) > > >>>> } else { > > >>>> sig = "MCFG"; > > >>>> } > > >>>> + > > >>>> + count = 0; > > >>>> + while (info) { > > >>>> + mcfg[count].allocation[0].address = > > >>>> cpu_to_le64(info->mcfg_base); > > >>>> + /* Only a single allocation so no need to play with > > >>>> segments */ > > >>>> + mcfg[count].allocation[0].pci_segment = cpu_to_le16(count); > > >>>> + mcfg[count].allocation[0].start_bus_number = 0; > > >>>> + mcfg[count++].allocation[0].end_bus_number = > > >>>> PCIE_MMCFG_BUS(info->mcfg_size - 1); > > >>> An interesting point is if we want to limit the MMFCG size for PXBs, as > > >>> we may not be > > >>> interested to use all the buses in a specific domain. For each bus we > > >>> require some > > >>> address space that remains unused. 
> > >>> > > >>>> + info = info->next; > > >>>> + } > > >>>> + > > >>>> build_header(linker, table_data, (void *)mcfg, sig, len, 1, > > >>>> NULL, NULL); > > >>>> } > > >>>> @@ -2602,26 +2615,52 @@ struct AcpiBuildState { > > >>>> MemoryRegion *linker_mr; > > >>>> } AcpiBuildState; > > >>>> -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg) > > >>>> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg) > > >>>> +{ > > >>>> + AcpiMcfgInfo *tmp; > > >>>> + while (mcfg) { > > >>>> + tmp = mcfg->next; > > >>>> + g_free(mcfg); > > >>>> + mcfg = tmp; > > >>>> + } > > >>>> +} > > >>>> + > > >>>> +static AcpiMcfgInfo *acpi_get_mcfg(void) > > >>>> { > > >>>> Object *pci_host; > > >>>> QObject *o; > > >>>> + AcpiMcfgInfo *head = NULL, *tail, *mcfg; > > >>>> pci_host = acpi_get_i386_pci_host(); > > >>>> g_assert(pci_host); > > >>>> - o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, > > >>>> NULL); > > >>>> - if (!o) { > > >>>> - return false; > > >>>> + while (pci_host) { > > >>>> + mcfg = g_new0(AcpiMcfgInfo, 1); > > >>>> + mcfg->next = NULL; > > >>>> + if (!head) { > > >>>> + tail = head = mcfg; > > >>>> + } else { > > >>>> + tail->next = mcfg; > > >>>> + tail = mcfg; > > >>>> + } > > >>>> + > > >>>> + o = object_property_get_qobject(pci_host, > > >>>> PCIE_HOST_MCFG_BASE, NULL); > > >>>> + if (!o) { > > >>>> + cleanup_mcfg(head); > > >>>> + g_free(mcfg); > > >>>> + return NULL; > > >>>> + } > > >>>> + mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o)); > > >>>> + qobject_unref(o); > > >>>> + > > >>>> + o = object_property_get_qobject(pci_host, > > >>>> PCIE_HOST_MCFG_SIZE, NULL); > > >>> I'll let Igor to comment on the APCI bits, but the idea of querying each > > >>> PCI host > > >>> for the MMFCG details and put it into a different table sounds good > > >>> to me. > > >>> > > >>> [Adding Laszlo for his insights] > > >> Thanks for the CC -- I don't have many (positive) insights here to > > >> offer, I'm afraid. First, the patch set (including the blurb) doesn't > > >> seem to explain *why* multiple host bridges / ECAM areas are a good > > >> idea. > > > > > > The issue we want to solve is the hard limitation of 256 PCIe devices > > > that can be used in a Q35 machine. > > Isn't it interesting that conventional PCI can easily support so many > more devices? Sorta makes you wonder why we care that virtual devices > are express rather than conventional for a high density configuration... Because they are express in real life? > > Implying that there are use cases for which ~256 PCIe devices aren't > > enough. I remain unconvinced until proved wrong :) > > > > Anyway, a significant source of waste comes from the restriction that we > > can only put 1 device (with up to 8 functions) on each non-root-complex > > PCI Express bus (such as root ports and downstream ports). This forfeits > > a huge portion of the ECAM area (about 31/32th) that we already have. > > Rather than spending more MMIO guest-phys address range on new > > discontiguous ECAM ranges, I'd prefer if we could look into ARI. I seem > > to recall from your earlier presentation that ARI could recover that > > lost address space (meaning both ECAM ranges and PCIe B/D/F address space). > > How does ARI solve the hotplug problem? ARI is effectively > multifunction on steroids, the ARI capability in each function points > to the next function number so that we don't need to scan the entire > devfn address space per bus (an inefficiency we don't care about when > there are only 8 function). 
So yeah, we can fill an entire bus with > devices with ARI, but they're all rooted at 00.0. > > > There are signs that the edk2 core supports ARI if the underlying > > platform supports it. (Which is not the case with multiple PCIe domains > > / multiple ECAM ranges.) > > It's pretty surprising to me that edk2 wouldn't already have support > for multiple PCIe domains, they're really not *that* uncommon. Some > architectures make almost gratuitous use of PCIe domains. I certainly > know there were UEFI ia64 platforms with multiple domains. > > > ARI support could also help aarch64/virt. Eric (CC'd) has been working > > on raising the max PCIe bus count from *16* to 256 for aarch64/virt, and > > AFAIR one of the challenges there has been finding a good spot for the > > larger ECAM in guest-phys address space. Fiddling with such address maps > > is always a pain. > > > > Back to x86, the guest-phys address space is quite crowded too. The > > 32-bit PCI MMIO aperture (with the neighboring ECAM and platform MMIO > > areas such as LAPIC, IO-APIC, HPET, TPM, pflash, ...) is always a scarce > > resource. Plus, reaching agreement between OVMF and QEMU on the exact > > location of the 32-bit PCI MMIO aperture has always been a huge pain; so > > you'd likely shoot for 64-bit. > > Why do we need to equally split 32-bit MMIO space between domains? Put > it on domain 0 and require devices installed into non-zero domains have > no 32-bit dependencies. +1 > > But 64-bit is ill-partitioned and/or crowded too: first you have the > > cold-plugged >4GB DRAM (whose size the firmware can interrogate), then > > the hotplug DIMM area up to "etc/reserved-memory-end" (ditto), then the > > 64-bit PCI MMIO aperture (whose size the firmware decides itself, > > because QEMU cannot be asked about it presently). Placing the additional > > MMCFGs somewhere would need extending the total order between these > > things at the design level, more fw_cfg files, more sanity checks in > > platform code in the firmware, and, quite importantly, adding support to > > multiple discontiguous MMCFGs to core edk2 code. > > Hmm, we're doing something wrong if we can't figure out some standards > within QEMU for placing per domain 64-bit MMIO and MMCFG ranges. I thought in this patch it is done by firmware. > > I don't know much about ARI but it already looks a whole lot more > > attractive to me. > > > > > We can use "PCI functions" to increase > > > the number, but then we loose the hot-plugging granularity. > > > > > > Currently pxb-pcie host bridges share the same PCI domain (0) > > > bus numbers, so adding more of them will not result in more usable > > > devices (each PCIe device resides on its own bus), > > > but putting each of them on a different domain removes > > > that limitation. > > > > > > Another possible use (there is a good chance I am wrong, adding Alex to > > > correct me), > > > may be the "modeling" of a multi-function assigned device. > > > Currently all the functions of an assigneddevice will be placed on > > > different PCIe > > > Root Ports "loosing" the connection between them. > > > > > > However, if we put all the functions of the same assigned device in the > > > same > > > PCI domain we may be able to "re-connect" them. > > > > OK -- why is that useful? What's the current practical problem with the > > splitting you describe? 
> > There are very few cases where this is useful, things like associating > USB companion devices or translating a guest bus reset to host bus > reset when functions are split to separate virtual buses. That said, I > have no idea how multiple domains plays a factor here. Regardless of > how many PCIe domains a VM supports, we can easily use existing > multifunction within a single domain to expose multifunction assigned > devices in a way that better resembles the physical hardware. > Multifunction PCIe endpoints cannot span PCIe domains. > > In fact, IOMMUs generally cannot span domains either, so we better also > be thinking about at least a VT-d DHRD or vIOMMU per PCIe domain. > > > >> Second, supporting such would take very intrusive patches / large > > >> feature development for OVMF (and that's not just for OvmfPkg and > > >> ArmVirtPkg platform code, but also core MdeModulePkg code). > > >> > > >> Obviously I'm not saying this feature should not be implemented for QEMU > > >> + SeaBIOS, just that don't expect edk2 support for it anytime soon. > > >> Without a convincing use case, even the review impact seems hard to > > >> justify. > > > > > > I understand, thank you for the feedback, the point was to be sure > > > we don't decide something that *can't be done* in OVMF. > > > > Semantics :) Obviously, everything "can be done" in software; that's why > > it's *soft*ware. Who is going to write it in practice? It had taken > > years until the edk2 core gained a universal PciHostBridgeDxe driver > > with a well-defined platform customization interface, and that interface > > doesn't support multiple domains / segments. It had also taken years > > until the same edk2 core driver gained support for nonzero MMIO address > > translation (i.e. where the CPU view of system RAM addresses differs > > from the PCI device view of the same, for DMA purposes) -- and that's > > just a linear translation, not even about IOMMUs. The PCI core > > maintainers in edk2 are always very busy, and upstreaming such changes > > tends to take forever. > > Wow, that's surprising, ia64 was doing all of that on UEFI platforms a > decade ago. > > > Again, I'm not opposing this feature per se. I just see any potential > > support for it in edk2 very difficult. I remain unconvinced that ~256 > > PCIe devices aren't enough. Even assuming I'm proved plain wrong there, > > I'd still much prefer ARI (although I don't know much about it at this > > point), because (a) the edk2 core already shows signs that it intends to > > support ARI, (b) ARI would help aarch64/virt with nearly the same > > effort, (c) it seems that ARI would address the problem (namely, wasting > > MMCFG) head-on, rather than throwing yet more MMCFG at it. > > -ENOHOTPLUG From an assigned device perspective, this also implies > that we're potentially adding a PCIe extended capability that may not > exist on the physical device to the virtual view of the device. > There's no guarantee that we can place that capability in the devices > extended config space without stepping on registers that the hardware > has hidden between the capabilities (for real, GPUs have registers in > extended config space that aren't covered by capabilities) or the > unlikely event that there's no more room in extended config space. > Thanks, > > Alex
On Tue, 22 May 2018 23:58:30 +0300 "Michael S. Tsirkin" <mst@redhat.com> wrote: > > It's not hard to think of a use-case where >256 devices > are helpful, for example a nested virt scenario where > each device is passed on to a different nested guest. > > But I think the main feature this is needed for is numa modeling. > Guests seem to assume a numa node per PCI root, ergo we need more PCI > roots. But even if we have NUMA affinity per PCI host bridge, a PCI host bridge does not necessarily imply a new PCIe domain. Nearly any Intel multi-socket system proves this. Don't get me wrong, I'd like to see PCIe domain support and I'm surprised edk2 is so far from supporting it, but other than >256 hotpluggable slots, I'm having trouble coming up with use cases. Maybe hotpluggable PCI root hierarchies are easier with separate domains? Thanks, Alex
On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote: > On Tue, 22 May 2018 23:58:30 +0300 > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > It's not hard to think of a use-case where >256 devices > > are helpful, for example a nested virt scenario where > > each device is passed on to a different nested guest. > > > > But I think the main feature this is needed for is numa modeling. > > Guests seem to assume a numa node per PCI root, ergo we need more PCI > > roots. > > But even if we have NUMA affinity per PCI host bridge, a PCI host > bridge does not necessarily imply a new PCIe domain. What are you calling a PCIe domain? > Nearly any Intel > multi-socket system proves this. Don't get me wrong, I'd like to see > PCIe domain support and I'm surprised edk2 is so far from supporting > it, but other than >256 hotpluggable slots, I'm having trouble coming > up with use cases. Maybe hotpluggable PCI root hierarchies are easier > with separate domains? Thanks, > > Alex
On Wed, 23 May 2018 00:44:22 +0300 "Michael S. Tsirkin" <mst@redhat.com> wrote: > On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote: > > On Tue, 22 May 2018 23:58:30 +0300 > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > > It's not hard to think of a use-case where >256 devices > > > are helpful, for example a nested virt scenario where > > > each device is passed on to a different nested guest. > > > > > > But I think the main feature this is needed for is numa modeling. > > > Guests seem to assume a numa node per PCI root, ergo we need more PCI > > > roots. > > > > But even if we have NUMA affinity per PCI host bridge, a PCI host > > bridge does not necessarily imply a new PCIe domain. > > What are you calling a PCIe domain? Domain/segment 0000:00:00.0 ^^^^ This Isn't that the only reason we'd need a new MCFG section and the reason we're limited to 256 buses? Thanks, Alex
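To make the limit concrete, this is the standard ECAM decoding from the PCIe spec; a minimal sketch only, with ecam_base[] standing in for whatever per-segment base addresses the MCFG table advertises:

    #include <stdint.h>

    /* Standard ECAM decoding: 4 KiB of config space per function, 8 functions
     * per device, 32 devices per bus -> 1 MiB per bus and at most 256 buses
     * (an 8-bit field) per ECAM region.  The segment number never appears in
     * the offset; it only selects which ECAM region (i.e. which MCFG entry)
     * is used, which is why each additional segment needs its own ECAM range. */
    static inline uint64_t ecam_offset(uint8_t bus, uint8_t dev, uint8_t fn,
                                       uint16_t reg)
    {
        return ((uint64_t)bus << 20) | ((uint64_t)(dev & 0x1f) << 15) |
               ((uint64_t)(fn & 0x7) << 12) | (reg & 0xfff);
    }

    /* ecam_base[] stands in for one base address per segment, e.g. from MCFG */
    static inline uint64_t ecam_addr(const uint64_t *ecam_base, uint16_t segment,
                                     uint8_t bus, uint8_t dev, uint8_t fn,
                                     uint16_t reg)
    {
        return ecam_base[segment] + ecam_offset(bus, dev, fn, reg);
    }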
On 05/22/18 23:17, Alex Williamson wrote: > On Tue, 22 May 2018 21:51:47 +0200 > Laszlo Ersek <lersek@redhat.com> wrote: Thanks Michael and Alex for the education on ARI. I'd just like to comment on one sub-topic: >> There are signs that the edk2 core supports ARI if the underlying >> platform supports it. (Which is not the case with multiple PCIe domains >> / multiple ECAM ranges.) > > It's pretty surprising to me that edk2 wouldn't already have support > for multiple PCIe domains, they're really not *that* uncommon. Some > architectures make almost gratuitous use of PCIe domains. I certainly > know there were UEFI ia64 platforms with multiple domains. >> Semantics :) Obviously, everything "can be done" in software; that's why >> it's *soft*ware. Who is going to write it in practice? It had taken >> years until the edk2 core gained a universal PciHostBridgeDxe driver >> with a well-defined platform customization interface, and that interface >> doesn't support multiple domains / segments. It had also taken years >> until the same edk2 core driver gained support for nonzero MMIO address >> translation (i.e. where the CPU view of system RAM addresses differs >> from the PCI device view of the same, for DMA purposes) -- and that's >> just a linear translation, not even about IOMMUs. The PCI core >> maintainers in edk2 are always very busy, and upstreaming such changes >> tends to take forever. > > Wow, that's surprising, ia64 was doing all of that on UEFI platforms a > decade ago. Plenty of physical PI/UEFI platforms support multiple PCIe domains (similarly to how plenty of physical PI/UEFI platforms support CPU hotplug with SMM). None of that is open source, to my knowledge, so it might as well not exist. EDK2 purposely does not accept any contributions under the GPL. Convincing proprietary downstreams to open up and upstream their code is nigh impossible, and even when it succeeds, it takes absolutely forever. Technically it also happens that a proprietary downstream contributes an independent (likely more limited) open source variant of a feature that their physical platforms support -- assuming they see (or are shown) value in allocating resources to such a contribution. This is very rare, and I'm including it for technical correctness. For open source edk2 this would be a development from zero; it would even conflict with some assumptions in edk2. (On May 4th I submitted a small library to core edk2 that parses the capabilities lists of a device and puts the capabilities into an associative array (basically a dictionary). So that we wouldn't have to open-code yet more caplist parsing loops when looking for specific capabilities. In 18 days I've pinged once and haven't gotten any technical feedback. I'm not even annoyed because I know they simply don't have time for reviewing larger than halfway trivial patches.) Thanks, Laszlo
On 05/22/18 23:22, Michael S. Tsirkin wrote: > On Tue, May 22, 2018 at 03:17:32PM -0600, Alex Williamson wrote: >> On Tue, 22 May 2018 21:51:47 +0200 >> Laszlo Ersek <lersek@redhat.com> wrote: >>> But 64-bit is ill-partitioned and/or crowded too: first you have the >>> cold-plugged >4GB DRAM (whose size the firmware can interrogate), then >>> the hotplug DIMM area up to "etc/reserved-memory-end" (ditto), then the >>> 64-bit PCI MMIO aperture (whose size the firmware decides itself, >>> because QEMU cannot be asked about it presently). Placing the additional >>> MMCFGs somewhere would need extending the total order between these >>> things at the design level, more fw_cfg files, more sanity checks in >>> platform code in the firmware, and, quite importantly, adding support to >>> multiple discontiguous MMCFGs to core edk2 code. >> >> Hmm, we're doing something wrong if we can't figure out some standards >> within QEMU for placing per domain 64-bit MMIO and MMCFG ranges. > > I thought in this patch it is done by firmware. Sure, the firmware programs the exbar registers already, but it needs to rely on some information to figure out the values it should program. (Also, the information should not arrive in the form of *just* an ACPI table; that's too hard and/or too late to parse for the firmware.) Right now the "information" is a hard-coded constant that's known not to conflict with other parts. Furthermore, I didn't highlight it earlier, but we can't go arbitrarily high -- if EPT is enabled, then the physical address width of the host CPU limits the guest-physical memory address space (incl. memory mapped devices). Thanks Laszlo
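A minimal sketch of that physical-address-width constraint, using the architectural CPUID leaf 0x80000008 (EAX[7:0] reports the physical address bits); fits_phys_addr_width() is an illustrative helper, not an existing QEMU function:

    #include <stdint.h>
    #include <stdbool.h>
    #include <cpuid.h>          /* GCC/Clang __get_cpuid() */

    /* With EPT, guest-physical placements (RAM, ECAM, 64-bit MMIO) above the
     * host CPU's physical address width are unusable, so a candidate ECAM
     * base should be checked against it up front. */
    static bool fits_phys_addr_width(uint64_t base, uint64_t size)
    {
        unsigned int eax, ebx, ecx, edx;
        unsigned int phys_bits = 36;    /* conservative default */

        if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx)) {
            phys_bits = eax & 0xff;
        }
        return base + size <= (1ULL << phys_bits);
    }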
On 05/22/18 23:47, Alex Williamson wrote: > On Wed, 23 May 2018 00:44:22 +0300 > "Michael S. Tsirkin" <mst@redhat.com> wrote: > >> On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote: >>> On Tue, 22 May 2018 23:58:30 +0300 >>> "Michael S. Tsirkin" <mst@redhat.com> wrote: >>>> >>>> It's not hard to think of a use-case where >256 devices >>>> are helpful, for example a nested virt scenario where >>>> each device is passed on to a different nested guest. >>>> >>>> But I think the main feature this is needed for is numa modeling. >>>> Guests seem to assume a numa node per PCI root, ergo we need more PCI >>>> roots. >>> >>> But even if we have NUMA affinity per PCI host bridge, a PCI host >>> bridge does not necessarily imply a new PCIe domain. >> >> What are you calling a PCIe domain? > > Domain/segment > > 0000:00:00.0 > ^^^^ This > > Isn't that the only reason we'd need a new MCFG section and the reason > we're limited to 256 buses? Thanks, (Just to confirm: this matches my understanding of the thread as well.) Laszlo
Hold on, On 05/22/18 21:51, Laszlo Ersek wrote: > It had taken years until the edk2 core gained a universal > PciHostBridgeDxe driver with a well-defined platform customization > interface, and that interface doesn't support multiple domains / > segments. after doing a bit more research: I was wrong about this. What I remembered was not the current state. Edk2 does seem to support multiple domains, with different segment numbers, ECAM base addresses, and bus number ranges. If we figure out a placement strategy or an easy to consume representation of these data for the firmware, it might be possible for OVMF to hook them into the edk2 core (although not in the earliest firmware phases, such as SEC and PEI). In retrospect, I'm honestly surprised that so much multi-segment support has been upstreamed to the edk2 core. Sorry about the FUD. (My general points remain valid, for the record... But perhaps they no longer matter for this discussion.) (I meant to send this message soon after <http://mid.mail-archive.com/fc603491-7c41-e862-a583-2bae6f165b5a@redhat.com>, but my internet connection had to die right then.) Thanks Laszlo
On Tue, May 22, 2018 at 03:47:41PM -0600, Alex Williamson wrote: > On Wed, 23 May 2018 00:44:22 +0300 > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote: > > > On Tue, 22 May 2018 23:58:30 +0300 > > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > > > > It's not hard to think of a use-case where >256 devices > > > > are helpful, for example a nested virt scenario where > > > > each device is passed on to a different nested guest. > > > > > > > > But I think the main feature this is needed for is numa modeling. > > > > Guests seem to assume a numa node per PCI root, ergo we need more PCI > > > > roots. > > > > > > But even if we have NUMA affinity per PCI host bridge, a PCI host > > > bridge does not necessarily imply a new PCIe domain. > > > > What are you calling a PCIe domain? > > Domain/segment > > 0000:00:00.0 > ^^^^ This Right. So we can thinkably have PCIe root complexes share an ACPI segment. I don't see what this buys us by itself. > Isn't that the only reason we'd need a new MCFG section and the reason > we're limited to 256 buses? Thanks, > > Alex I don't know whether a single MCFG section can describe multiple roots. I think it would be certainly unusual.
On Wed, May 23, 2018 at 12:42:09AM +0200, Laszlo Ersek wrote: > Hold on, > > On 05/22/18 21:51, Laszlo Ersek wrote: > > > It had taken years until the edk2 core gained a universal > > PciHostBridgeDxe driver with a well-defined platform customization > > interface, and that interface doesn't support multiple domains / > > segments. > > after doing a bit more research: I was wrong about this. What I > remembered was not the current state. Edk2 does seem to support multiple > domains, with different segment numbers, ECAM base addresses, and bus > number ranges. If we figure out a placement strategy or an easy to > consume representation of these data for the firmware, it might be > possible for OVMF to hook them into the edk2 core (although not in the > earliest firmware phases, such as SEC and PEI). > > In retrospect, I'm honestly surprised that so much multi-segment support > has been upstreamed to the edk2 core. Sorry about the FUD. (My general > points remain valid, for the record... But perhaps they no longer matter > for this discussion.) > > (I meant to send this message soon after > <http://mid.mail-archive.com/fc603491-7c41-e862-a583-2bae6f165b5a@redhat.com>, > but my internet connection had to die right then.) > > Thanks > Laszlo Is there support for any hardware which we could emulate?
On Wed, 23 May 2018 02:38:52 +0300 "Michael S. Tsirkin" <mst@redhat.com> wrote: > On Tue, May 22, 2018 at 03:47:41PM -0600, Alex Williamson wrote: > > On Wed, 23 May 2018 00:44:22 +0300 > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote: > > > > On Tue, 22 May 2018 23:58:30 +0300 > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > > > > > > It's not hard to think of a use-case where >256 devices > > > > > are helpful, for example a nested virt scenario where > > > > > each device is passed on to a different nested guest. > > > > > > > > > > But I think the main feature this is needed for is numa modeling. > > > > > Guests seem to assume a numa node per PCI root, ergo we need more PCI > > > > > roots. > > > > > > > > But even if we have NUMA affinity per PCI host bridge, a PCI host > > > > bridge does not necessarily imply a new PCIe domain. > > > > > > What are you calling a PCIe domain? > > > > Domain/segment > > > > 0000:00:00.0 > > ^^^^ This > > Right. So we can thinkably have PCIe root complexes share an ACPI segment. > I don't see what this buys us by itself. The ability to define NUMA locality for a PCI sub-hierarchy while maintaining compatibility with non-segment aware OSes (and firmware). > > Isn't that the only reason we'd need a new MCFG section and the reason > > we're limited to 256 buses? Thanks, > > > > Alex > > I don't know whether a single MCFG section can describe multiple roots. > I think it would be certainly unusual. I'm not sure here if you're referring to the actual MCFG ACPI table or the MMCONFIG range, aka the ECAM. Neither of these describe PCI host bridges. The MCFG table can describe one or more ECAM ranges, which provides the ECAM base address, the PCI segment associated with that ECAM and the start and end bus numbers to know the offset and extent of the ECAM range. PCI host bridges would then theoretically be separate ACPI objects with _SEG and _BBN methods to associate them to the correct ECAM range by segment number and base bus number. So it seems that tooling exists that an ECAM/MMCONFIG range could be provided per PCI host bridge, even if they exist within the same domain, but in practice what I see on systems I have access to is a single MMCONFIG range supporting all of the host bridges. It also seems there are numerous ways to describe the MMCONFIG range and I haven't actually found an example that seems to use the MCFG table. Two have MCFG tables (that don't seem terribly complete) and the kernel claims to find the MMCONFIG via e820, another doesn't even have an MCFG table and the kernel claims to find MMCONFIG via an ACPI motherboard resource. I'm not sure if I can enable PCI segments on anything to see how the firmware changes. Thanks, Alex
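For reference, the per-range entry Alex describes is the MCFG "Configuration Space Base Address Allocation Structure" from the ACPI spec; the patch quoted at the top of the thread fills exactly these fields.

    #include <stdint.h>

    /* Per-range entry of the ACPI MCFG table (16 bytes, little-endian).  One
     * entry per ECAM range; the host bridges themselves are separate ACPI
     * objects tied to an entry through _SEG (segment) and _BBN (base bus
     * number). */
    struct mcfg_allocation {
        uint64_t base_address;       /* ECAM base for this range            */
        uint16_t pci_segment;        /* segment group served by this range  */
        uint8_t  start_bus_number;   /* first bus decoded within the range  */
        uint8_t  end_bus_number;     /* last bus decoded within the range   */
        uint32_t reserved;
    } __attribute__((packed));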
On 05/23/18 01:40, Michael S. Tsirkin wrote: > On Wed, May 23, 2018 at 12:42:09AM +0200, Laszlo Ersek wrote: >> Hold on, >> >> On 05/22/18 21:51, Laszlo Ersek wrote: >> >>> It had taken years until the edk2 core gained a universal >>> PciHostBridgeDxe driver with a well-defined platform customization >>> interface, and that interface doesn't support multiple domains / >>> segments. >> >> after doing a bit more research: I was wrong about this. What I >> remembered was not the current state. Edk2 does seem to support multiple >> domains, with different segment numbers, ECAM base addresses, and bus >> number ranges. If we figure out a placement strategy or an easy to >> consume representation of these data for the firmware, it might be >> possible for OVMF to hook them into the edk2 core (although not in the >> earliest firmware phases, such as SEC and PEI). >> >> In retrospect, I'm honestly surprised that so much multi-segment support >> has been upstreamed to the edk2 core. Sorry about the FUD. (My general >> points remain valid, for the record... But perhaps they no longer matter >> for this discussion.) >> >> (I meant to send this message soon after >> <http://mid.mail-archive.com/fc603491-7c41-e862-a583-2bae6f165b5a@redhat.com>, >> but my internet connection had to die right then.) >> >> Thanks >> Laszlo > > Is there support for any hardware which we could emulate? I don't see any actual hw support in the edk2 project, but I'll ask. Thanks Laszlo
Hi all, Thanks for all your comments and suggestions, I wasn't expecting so many professional reviewers. Some of the things you mentioned are beyond my knowledge right now. Please correct me if I'm wrong below. The original purpose was just to support multiple segments in the Intel Q35 architecture for PCIe topology, which makes bus numbers a less scarce resource. The patches are very primitive and many things are left for firmware to finish (the initial plan was to implement it in SeaBIOS), the AML part in QEMU is not finished either. I'm not familiar with OVMF or edk2, so there is no plan to touch it yet, but it seems not necessary since it already supports multi-segment in the end. Also, in this patch the assumption is one domain per host bridge, described by '_SEG()' in AML, which means an ECAM range per host bridge, but that should be configurable if the user prefers staying in the same domain, I guess? I'd like to list a few things you've discussed to confirm I don't get them wrong: * ARI enlarges the number of functions, but it does not solve the hot-plug issue. Multifunction PCIe endpoints cannot span PCIe domains. * IOMMUs cannot span domains either, so bringing in new domains introduces the need to add a VT-d DRHD or vIOMMU per PCIe domain. * 64-bit space is crowded and there are no standards within QEMU for placing per-domain 64-bit MMIO and MMCFG ranges. * NUMA modeling seems to be a stronger motivation than the limitation of 256 bus numbers, in that each NUMA node holds its own PCI(e) sub-hierarchy. * We cannot put ECAM arbitrarily high because the guest's PA width is limited by the host's when EPT is enabled. * Compatibility issues in platforms that do not have an MCFG table at all (perhaps we limit this to q35 only at present, where MCFG is present). Based on your discussions, I guess this proposal is still worth doing overall, but it seems many restrictions must be imposed to stay compatible with the more complicated situations. Please correct me if I get anything wrong or am missing something. Thanks Zihan
On 05/23/18 13:11, Zihan Yang wrote: > Hi all, > The original purpose was just to support multiple segments in Intel > Q35 archtecure for PCIe topology, which makes bus number a less scarce > resource. The patches are very primitive and many things are left for > firmware to finish(the initial plan was to implement it in SeaBIOS), > the AML part in QEMU is not finished either. I'm not familiar with > OVMF or edk2, so there is no plan to touch it yet, but it seems not > necessary since it already supports multi-segment in the end. That's incorrect. EDK2 stands for "EFI Development Kit II", and it is a collection of "universal" (= platform- and ISA-independent) modules (drivers and libraries), and platfor- and/or ISA-dependent modules (drivers and libraries). The OVMF firmware is built from a subset of these modules; the final firmware image includes modules from both categories -- universal modules, and modules specific to the i440fx and Q35 QEMU boards. The first category generally lives under MdePkg/, MdeModulePkg/, UefiCpuPkg/, NetworkPkg/, PcAtChipsetPkg, etc; while the second category lives under OvmfPkg/. (The exact same applies to the ArmVirtQemu firmware, with the second category consisting of ArmVirtPkg/ and OvmfPkg/ modules.) When we discuss anything PCI-related in edk2, it usually affects both categories: (a) the universal/core modules, such as - the PCI host bridge / root bridge driver at "MdeModulePkg/Bus/Pci/PciHostBridgeDxe", - the PCI bus driver at "MdeModulePkg/Bus/Pci/PciBusDxe", (b) and the platform-specific modules, such as - "OvmfPkg/IncompatiblePciDeviceSupportDxe" which causes PciBusDxe to allocate 64-bit MMIO BARs above 4 GB, regardless of option ROM availability (as long as a CSM is not present), conserving 32-bit MMIO aperture for 32-bit BARs, - "OvmfPkg/PciHotPlugInitDxe", which implements support for QEMU's resource reservation hints, so that we can avoid IO space exhaustion with many PCIe root ports, and so that we can reserve MMIO aperture for hot-plugging devices with large MMIO BARs, - "OvmfPkg/Library/DxePciLibI440FxQ35", which is a low-level PCI config space access library, usable in the DXE and later phases, that plugs into several drivers, and uses 0xCF8/0xCFC on i440x, and ECAM on Q35, - "OvmfPkg/Library/PciHostBridgeLib", which plugs into "PciHostBridgeDxe" above, exposing the various resource apertures to said host bridge / root bridge driver, and implementing support for the PXB / PXBe devices, - "OvmfPkg/PlatformPei", which is an early (PEI phase) module with a grab-bag of platform support code; e.g. it informs "DxePciLibI440FxQ35" above about the QEMU board being Q35 vs. i440fx, it configures the ECAM (exbar) registers on Q35, it determines where the 32-bit and 64-bit PCI MMIO apertures should be; - "ArmVirtPkg/Library/BaseCachingPciExpressLib", which is the aarch64/virt counterpart of "DxePciLibI440FxQ35" above, - "ArmVirtPkg/Library/FdtPciHostBridgeLib", which is the aarch64/virt counterpart of "PciHostBridgeLib", consuming the DTB exposed by qemu-system-aarch64, - "ArmVirtPkg/Library/FdtPciPcdProducerLib", which is an internal library that turns parts of the DTB that is exposed by qemu-system-aarch64 into various PCI-related, firmware-wide, scalar variables (called "PCDs"), upon which both "BaseCachingPciExpressLib" and "FdtPciHostBridgeLib" rely. 
The point is that any PCI feature in any edk2 platform firmware comes together from (a) core module support for the feature, and (b) platform integration between the core code and the QEMU board in question. If (a) is missing, that implies a very painful uphill battle, which is why I'd been loudly whining, initially, in this thread, until I realized that the core support was there in edk2, for PCIe segments. However, (b) is required as well -- i.e., platform integration under OvmfPkg/ and perhaps ArmVirtPkg/, between the QEMU boards and the core edk2 code --, and that definitely doesn't exist for the PCIe segments feature. If (a) exists and is flexible enough, then we at least have a chance at writing the platform support code (b) for it. So that's why I've stopped whining. Writing (b) is never easy -- in this case, a great many of the platform modules that I've listed above, under OvmfPkg/ pathnames, could be affected, or even be eligible for replacement -- but (b) is at least imaginable practice. Modules in category (a) are shipped *in* -- not "on" -- every single physical UEFI platform that you can buy today, which is one reason why it's hugely difficult to implement nontrivial changes for them. In brief: your statement is incorrect because category (b) is missing. And that requires dedicated QEMU support, similarly to how "OvmfPkg/PciHotPlugInitDxe" requires the vendor-specific resource reservation capability, and how "OvmfPkg/Library/PciHostBridgeLib" consumes the "etc/extra-pci-roots" fw_cfg file, and how most everything that ArmVirtQemu does for PCI(e) originates from QEMU's DTB. > * 64-bit space is crowded and there are no standards within QEMU for > placing per domain 64-bit MMIO and MMCFG ranges > * We cannot put ECAM arbitrarily high because guest's PA width is > limited by host's when EPT is enabled. That's right. One argument is that firmware can lay out these apertures and ECAM ranges internally. But that argument breaks down when you hit the PCPU physical address width, and would like the management stack, such as libvirtd, to warn you in advance. For that, either libvirtd or QEMU has to know, or direct, the layout. > * NUMA modeling seems to be a stronger motivation than the limitation > of 256 but nubmers, that each NUMA node holds its own PCI(e) > sub-hierarchy I'd also like to get more information about this -- I thought pxb-pci(e) was already motivated by supporting NUMA locality. And, to my knowledge, pxb-pci(e) actually *solved* this problem. Am I wrong? Let's say you have 16 NUMA nodes (which seems pretty large to me); is it really insufficient to assign ~16 devices to each node? Thanks Laszlo
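Purely to illustrate what an "easy to consume representation" for category (b) might look like, here is a sketch of a per-segment record; this is not an existing QEMU or fw_cfg interface, just an assumption-laden example in the spirit of "etc/extra-pci-roots":

    #include <stdint.h>

    /* HYPOTHETICAL layout -- nothing with this name exists in QEMU or OVMF
     * today.  One record per extra PCIe segment, exposed by QEMU (e.g. via a
     * new fw_cfg file) so that platform code could program and describe each
     * extra ECAM range. */
    struct pcie_segment_info {
        uint16_t segment;       /* _SEG value of the extra host bridge     */
        uint8_t  start_bus;     /* _BBN / first bus number                 */
        uint8_t  end_bus;       /* last bus number covered by the ECAM     */
        uint32_t reserved;
        uint64_t ecam_base;     /* guest-physical base of the ECAM range   */
        uint64_t ecam_size;
    } __attribute__((packed));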
On Tue, May 22, 2018 at 10:28:56PM -0600, Alex Williamson wrote: > On Wed, 23 May 2018 02:38:52 +0300 > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > On Tue, May 22, 2018 at 03:47:41PM -0600, Alex Williamson wrote: > > > On Wed, 23 May 2018 00:44:22 +0300 > > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > > > On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote: > > > > > On Tue, 22 May 2018 23:58:30 +0300 > > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > > > > > > > > It's not hard to think of a use-case where >256 devices > > > > > > are helpful, for example a nested virt scenario where > > > > > > each device is passed on to a different nested guest. > > > > > > > > > > > > But I think the main feature this is needed for is numa modeling. > > > > > > Guests seem to assume a numa node per PCI root, ergo we need more PCI > > > > > > roots. > > > > > > > > > > But even if we have NUMA affinity per PCI host bridge, a PCI host > > > > > bridge does not necessarily imply a new PCIe domain. > > > > > > > > What are you calling a PCIe domain? > > > > > > Domain/segment > > > > > > 0000:00:00.0 > > > ^^^^ This > > > > Right. So we can thinkably have PCIe root complexes share an ACPI segment. > > I don't see what this buys us by itself. > > The ability to define NUMA locality for a PCI sub-hierarchy while > maintaining compatibility with non-segment aware OSes (and firmware). Fur sure, but NUMA is a kind of advanced topic, MCFG has been around for longer than various NUMA tables. Are there really non-segment aware guests that also know how to make use of NUMA? > > > Isn't that the only reason we'd need a new MCFG section and the reason > > > we're limited to 256 buses? Thanks, > > > > > > Alex > > > > I don't know whether a single MCFG section can describe multiple roots. > > I think it would be certainly unusual. > > I'm not sure here if you're referring to the actual MCFG ACPI table or > the MMCONFIG range, aka the ECAM. Neither of these describe PCI host > bridges. The MCFG table can describe one or more ECAM ranges, which > provides the ECAM base address, the PCI segment associated with that > ECAM and the start and end bus numbers to know the offset and extent of > the ECAM range. PCI host bridges would then theoretically be separate > ACPI objects with _SEG and _BBN methods to associate them to the > correct ECAM range by segment number and base bus number. So it seems > that tooling exists that an ECAM/MMCONFIG range could be provided per > PCI host bridge, even if they exist within the same domain, but in > practice what I see on systems I have access to is a single MMCONFIG > range supporting all of the host bridges. It also seems there are > numerous ways to describe the MMCONFIG range and I haven't actually > found an example that seems to use the MCFG table. Two have MCFG > tables (that don't seem terribly complete) and the kernel claims to > find the MMCONFIG via e820, another doesn't even have an MCFG table and > the kernel claims to find MMCONFIG via an ACPI motherboard resource. > I'm not sure if I can enable PCI segments on anything to see how the > firmware changes. Thanks, > > Alex Let me clarify. So MCFG have base address allocation structures. Each maps a segment and a range of bus numbers into memory. This structure is what I meant. IIUC you are saying on your systems everything is within a single segment, right? Multiple pci hosts map into a single segment? If you do this you can do NUMA, but do not gain > 256 devices. 
Are we on the same page then?
On Wed, 23 May 2018 17:25:32 +0300 "Michael S. Tsirkin" <mst@redhat.com> wrote: > On Tue, May 22, 2018 at 10:28:56PM -0600, Alex Williamson wrote: > > On Wed, 23 May 2018 02:38:52 +0300 > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > On Tue, May 22, 2018 at 03:47:41PM -0600, Alex Williamson wrote: > > > > On Wed, 23 May 2018 00:44:22 +0300 > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > > > > > On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote: > > > > > > On Tue, 22 May 2018 23:58:30 +0300 > > > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > > > > > > > > > > It's not hard to think of a use-case where >256 devices > > > > > > > are helpful, for example a nested virt scenario where > > > > > > > each device is passed on to a different nested guest. > > > > > > > > > > > > > > But I think the main feature this is needed for is numa modeling. > > > > > > > Guests seem to assume a numa node per PCI root, ergo we need more PCI > > > > > > > roots. > > > > > > > > > > > > But even if we have NUMA affinity per PCI host bridge, a PCI host > > > > > > bridge does not necessarily imply a new PCIe domain. > > > > > > > > > > What are you calling a PCIe domain? > > > > > > > > Domain/segment > > > > > > > > 0000:00:00.0 > > > > ^^^^ This > > > > > > Right. So we can thinkably have PCIe root complexes share an ACPI segment. > > > I don't see what this buys us by itself. > > > > The ability to define NUMA locality for a PCI sub-hierarchy while > > maintaining compatibility with non-segment aware OSes (and firmware). > > Fur sure, but NUMA is a kind of advanced topic, MCFG has been around for > longer than various NUMA tables. Are there really non-segment aware > guests that also know how to make use of NUMA? I can't answer that question, but I assume that multi-segment PCI support is perhaps not as pervasive as we may think considering hardware OEMs tend to avoid it for their default configurations with multiple host bridges. > > > > Isn't that the only reason we'd need a new MCFG section and the reason > > > > we're limited to 256 buses? Thanks, > > > > > > > > Alex > > > > > > I don't know whether a single MCFG section can describe multiple roots. > > > I think it would be certainly unusual. > > > > I'm not sure here if you're referring to the actual MCFG ACPI table or > > the MMCONFIG range, aka the ECAM. Neither of these describe PCI host > > bridges. The MCFG table can describe one or more ECAM ranges, which > > provides the ECAM base address, the PCI segment associated with that > > ECAM and the start and end bus numbers to know the offset and extent of > > the ECAM range. PCI host bridges would then theoretically be separate > > ACPI objects with _SEG and _BBN methods to associate them to the > > correct ECAM range by segment number and base bus number. So it seems > > that tooling exists that an ECAM/MMCONFIG range could be provided per > > PCI host bridge, even if they exist within the same domain, but in > > practice what I see on systems I have access to is a single MMCONFIG > > range supporting all of the host bridges. It also seems there are > > numerous ways to describe the MMCONFIG range and I haven't actually > > found an example that seems to use the MCFG table. Two have MCFG > > tables (that don't seem terribly complete) and the kernel claims to > > find the MMCONFIG via e820, another doesn't even have an MCFG table and > > the kernel claims to find MMCONFIG via an ACPI motherboard resource. 
> > I'm not sure if I can enable PCI segments on anything to see how the > > firmware changes. Thanks, > > > > Alex > > Let me clarify. So MCFG have base address allocation structures. > Each maps a segment and a range of bus numbers into memory. > This structure is what I meant. Ok, so this is the ECAM/MMCONFIG range through which we do config accesses, which is described by MCFG, among other options. > IIUC you are saying on your systems everything is within a single > segment, right? Multiple pci hosts map into a single segment? Yes, for instance a single MMCONFIG range handles bus number ranges 0x00-0x7f within segment 0x0 and the system has host bridges with base bus numbers of 0x00 and 0x40, each with different NUMA locality. > If you do this you can do NUMA, but do not gain > 256 devices. Correct, but let's also clarify that we're not limited to 256 devices, a segment is limited to 256 buses and each PCIe slot is a bus, so the limitation is number of hotpluggable slots. "Devices" implies that it includes multi-function, ARI, and SR-IOV devices as well, but we can have 256 of those per bus, we just don't have the desired hotplug granularity for those. > Are we are the same page then? Seems so. Thanks, Alex
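Putting rough numbers on the granularity point (these follow from the ECAM layout itself, not from any QEMU code):

    /* Back-of-the-envelope numbers behind the "wasted ECAM" point: each bus
     * costs 1 MiB of ECAM (32 devices x 8 functions x 4 KiB), but a root port
     * or downstream port hosts a single device, so at most 32 KiB of that
     * 1 MiB is ever addressable behind it -- the ~31/32 waste mentioned
     * earlier in the thread. */
    #define ECAM_PER_FN         (4 * 1024)
    #define ECAM_PER_BUS        (32 * 8 * ECAM_PER_FN)      /* 1 MiB   */
    #define ECAM_PER_SEGMENT    (256ULL * ECAM_PER_BUS)     /* 256 MiB */
    #define ECAM_USED_PER_SLOT  (8 * ECAM_PER_FN)           /* 32 KiB  */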
On Wed, May 23, 2018 at 08:57:51AM -0600, Alex Williamson wrote: > On Wed, 23 May 2018 17:25:32 +0300 > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > On Tue, May 22, 2018 at 10:28:56PM -0600, Alex Williamson wrote: > > > On Wed, 23 May 2018 02:38:52 +0300 > > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > > > On Tue, May 22, 2018 at 03:47:41PM -0600, Alex Williamson wrote: > > > > > On Wed, 23 May 2018 00:44:22 +0300 > > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > > > > > > > On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote: > > > > > > > On Tue, 22 May 2018 23:58:30 +0300 > > > > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > It's not hard to think of a use-case where >256 devices > > > > > > > > are helpful, for example a nested virt scenario where > > > > > > > > each device is passed on to a different nested guest. > > > > > > > > > > > > > > > > But I think the main feature this is needed for is numa modeling. > > > > > > > > Guests seem to assume a numa node per PCI root, ergo we need more PCI > > > > > > > > roots. > > > > > > > > > > > > > > But even if we have NUMA affinity per PCI host bridge, a PCI host > > > > > > > bridge does not necessarily imply a new PCIe domain. > > > > > > > > > > > > What are you calling a PCIe domain? > > > > > > > > > > Domain/segment > > > > > > > > > > 0000:00:00.0 > > > > > ^^^^ This > > > > > > > > Right. So we can thinkably have PCIe root complexes share an ACPI segment. > > > > I don't see what this buys us by itself. > > > > > > The ability to define NUMA locality for a PCI sub-hierarchy while > > > maintaining compatibility with non-segment aware OSes (and firmware). > > > > Fur sure, but NUMA is a kind of advanced topic, MCFG has been around for > > longer than various NUMA tables. Are there really non-segment aware > > guests that also know how to make use of NUMA? > > I can't answer that question, but I assume that multi-segment PCI > support is perhaps not as pervasive as we may think considering hardware > OEMs tend to avoid it for their default configurations with multiple > host bridges. > > > > > > Isn't that the only reason we'd need a new MCFG section and the reason > > > > > we're limited to 256 buses? Thanks, > > > > > > > > > > Alex > > > > > > > > I don't know whether a single MCFG section can describe multiple roots. > > > > I think it would be certainly unusual. > > > > > > I'm not sure here if you're referring to the actual MCFG ACPI table or > > > the MMCONFIG range, aka the ECAM. Neither of these describe PCI host > > > bridges. The MCFG table can describe one or more ECAM ranges, which > > > provides the ECAM base address, the PCI segment associated with that > > > ECAM and the start and end bus numbers to know the offset and extent of > > > the ECAM range. PCI host bridges would then theoretically be separate > > > ACPI objects with _SEG and _BBN methods to associate them to the > > > correct ECAM range by segment number and base bus number. So it seems > > > that tooling exists that an ECAM/MMCONFIG range could be provided per > > > PCI host bridge, even if they exist within the same domain, but in > > > practice what I see on systems I have access to is a single MMCONFIG > > > range supporting all of the host bridges. It also seems there are > > > numerous ways to describe the MMCONFIG range and I haven't actually > > > found an example that seems to use the MCFG table. 
Two have MCFG > > > tables (that don't seem terribly complete) and the kernel claims to > > > find the MMCONFIG via e820, another doesn't even have an MCFG table and > > > the kernel claims to find MMCONFIG via an ACPI motherboard resource. > > > I'm not sure if I can enable PCI segments on anything to see how the > > > firmware changes. Thanks, > > > > > > Alex > > > > Let me clarify. So MCFG have base address allocation structures. > > Each maps a segment and a range of bus numbers into memory. > > This structure is what I meant. > > Ok, so this is the ECAM/MMCONFIG range through which we do config > accesses, which is described by MCFG, among other options. > > > IIUC you are saying on your systems everything is within a single > > segment, right? Multiple pci hosts map into a single segment? > > Yes, for instance a single MMCONFIG range handles bus number ranges > 0x00-0x7f within segment 0x0 and the system has host bridges with base > bus numbers of 0x00 and 0x40, each with different NUMA locality. > > > If you do this you can do NUMA, but do not gain > 256 devices. > > Correct, but let's also clarify that we're not limited to 256 devices, > a segment is limited to 256 buses and each PCIe slot is a bus, so the > limitation is number of hotpluggable slots. "Devices" implies that it > includes multi-function, ARI, and SR-IOV devices as well, but we can > have 256 of those per bus, we just don't have the desired hotplug > granularity for those. Right, I consider a group of PF and all its VFs a device, and all functions in a multi-function device a single device for this purpose. > > Are we are the same page then? > > Seems so. Thanks, > > Alex
On 05/23/2018 05:25 PM, Michael S. Tsirkin wrote: > On Tue, May 22, 2018 at 10:28:56PM -0600, Alex Williamson wrote: >> On Wed, 23 May 2018 02:38:52 +0300 >> "Michael S. Tsirkin" <mst@redhat.com> wrote: >> >>> On Tue, May 22, 2018 at 03:47:41PM -0600, Alex Williamson wrote: >>>> On Wed, 23 May 2018 00:44:22 +0300 >>>> "Michael S. Tsirkin" <mst@redhat.com> wrote: >>>> >>>>> On Tue, May 22, 2018 at 03:36:59PM -0600, Alex Williamson wrote: >>>>>> On Tue, 22 May 2018 23:58:30 +0300 >>>>>> "Michael S. Tsirkin" <mst@redhat.com> wrote: >>>>>>> It's not hard to think of a use-case where >256 devices >>>>>>> are helpful, for example a nested virt scenario where >>>>>>> each device is passed on to a different nested guest. >>>>>>> >>>>>>> But I think the main feature this is needed for is numa modeling. >>>>>>> Guests seem to assume a numa node per PCI root, ergo we need more PCI >>>>>>> roots. >>>>>> But even if we have NUMA affinity per PCI host bridge, a PCI host >>>>>> bridge does not necessarily imply a new PCIe domain. >>>>> What are you calling a PCIe domain? >>>> Domain/segment >>>> >>>> 0000:00:00.0 >>>> ^^^^ This >>> Right. So we can thinkably have PCIe root complexes share an ACPI segment. >>> I don't see what this buys us by itself. >> The ability to define NUMA locality for a PCI sub-hierarchy while >> maintaining compatibility with non-segment aware OSes (and firmware). > Fur sure, but NUMA is a kind of advanced topic, MCFG has been around for > longer than various NUMA tables. Are there really non-segment aware > guests that also know how to make use of NUMA? > Yes, the current pxb devices accomplish exactly that. Multiple NUMA nodes while sharing PCI domain 0. Thanks, Marcel >>>> Isn't that the only reason we'd need a new MCFG section and the reason >>>> we're limited to 256 buses? Thanks, >>>> >>>> Alex >>> I don't know whether a single MCFG section can describe multiple roots. >>> I think it would be certainly unusual. >> I'm not sure here if you're referring to the actual MCFG ACPI table or >> the MMCONFIG range, aka the ECAM. Neither of these describe PCI host >> bridges. The MCFG table can describe one or more ECAM ranges, which >> provides the ECAM base address, the PCI segment associated with that >> ECAM and the start and end bus numbers to know the offset and extent of >> the ECAM range. PCI host bridges would then theoretically be separate >> ACPI objects with _SEG and _BBN methods to associate them to the >> correct ECAM range by segment number and base bus number. So it seems >> that tooling exists that an ECAM/MMCONFIG range could be provided per >> PCI host bridge, even if they exist within the same domain, but in >> practice what I see on systems I have access to is a single MMCONFIG >> range supporting all of the host bridges. It also seems there are >> numerous ways to describe the MMCONFIG range and I haven't actually >> found an example that seems to use the MCFG table. Two have MCFG >> tables (that don't seem terribly complete) and the kernel claims to >> find the MMCONFIG via e820, another doesn't even have an MCFG table and >> the kernel claims to find MMCONFIG via an ACPI motherboard resource. >> I'm not sure if I can enable PCI segments on anything to see how the >> firmware changes. Thanks, >> >> Alex > Let me clarify. So MCFG have base address allocation structures. > Each maps a segment and a range of bus numbers into memory. > This structure is what I meant. > > IIUC you are saying on your systems everything is within a single > segment, right? 
Multiple pci hosts map into a single segment? > > If you do this you can do NUMA, but do not gain > 256 devices. > > Are we are the same page then? >
Hi Alex, On 05/23/2018 12:17 AM, Alex Williamson wrote: > On Tue, 22 May 2018 21:51:47 +0200 > Laszlo Ersek <lersek@redhat.com> wrote: > >> On 05/22/18 21:01, Marcel Apfelbaum wrote: >>> Hi Laszlo, >>> >>> On 05/22/2018 12:52 PM, Laszlo Ersek wrote: >>>> On 05/21/18 13:53, Marcel Apfelbaum wrote: >>>>> On 05/20/2018 10:28 AM, Zihan Yang wrote: >>>>>> Currently only q35 host bridge us allocated space in MCFG table. To >>>>>> put pxb host >>>>>> into sepratate pci domain, each of them should have its own >>>>>> configuration space >>>>>> int MCFG table >>>>>> >>>>>> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com> >>>>>> --- >>>>>> hw/i386/acpi-build.c | 83 >>>>>> +++++++++++++++++++++++++++++++++++++++------------- >>>>>> 1 file changed, 62 insertions(+), 21 deletions(-) >>>>>> >>>>>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c >>>>>> index 9bc6d97..808d815 100644 >>>>>> --- a/hw/i386/acpi-build.c >>>>>> +++ b/hw/i386/acpi-build.c >>>>>> @@ -89,6 +89,7 @@ >>>>>> typedef struct AcpiMcfgInfo { >>>>>> uint64_t mcfg_base; >>>>>> uint32_t mcfg_size; >>>>>> + struct AcpiMcfgInfo *next; >>>>>> } AcpiMcfgInfo; >>>>>> typedef struct AcpiPmInfo { >>>>>> @@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker >>>>>> *linker, AcpiMcfgInfo *info) >>>>>> { >>>>>> AcpiTableMcfg *mcfg; >>>>>> const char *sig; >>>>>> - int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]); >>>>>> + int len, count = 0; >>>>>> + AcpiMcfgInfo *cfg = info; >>>>>> + while (cfg) { >>>>>> + ++count; >>>>>> + cfg = cfg->next; >>>>>> + } >>>>>> + len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]); >>>>>> mcfg = acpi_data_push(table_data, len); >>>>>> - mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base); >>>>>> - /* Only a single allocation so no need to play with segments */ >>>>>> - mcfg->allocation[0].pci_segment = cpu_to_le16(0); >>>>>> - mcfg->allocation[0].start_bus_number = 0; >>>>>> - mcfg->allocation[0].end_bus_number = >>>>>> PCIE_MMCFG_BUS(info->mcfg_size - 1); >>>>>> /* MCFG is used for ECAM which can be enabled or disabled by >>>>>> guest. >>>>>> * To avoid table size changes (which create migration issues), >>>>>> @@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker >>>>>> *linker, AcpiMcfgInfo *info) >>>>>> } else { >>>>>> sig = "MCFG"; >>>>>> } >>>>>> + >>>>>> + count = 0; >>>>>> + while (info) { >>>>>> + mcfg[count].allocation[0].address = >>>>>> cpu_to_le64(info->mcfg_base); >>>>>> + /* Only a single allocation so no need to play with >>>>>> segments */ >>>>>> + mcfg[count].allocation[0].pci_segment = cpu_to_le16(count); >>>>>> + mcfg[count].allocation[0].start_bus_number = 0; >>>>>> + mcfg[count++].allocation[0].end_bus_number = >>>>>> PCIE_MMCFG_BUS(info->mcfg_size - 1); >>>>> An interesting point is if we want to limit the MMFCG size for PXBs, as >>>>> we may not be >>>>> interested to use all the buses in a specific domain. For each bus we >>>>> require some >>>>> address space that remains unused. 
>>>>> >>>>>> + info = info->next; >>>>>> + } >>>>>> + >>>>>> build_header(linker, table_data, (void *)mcfg, sig, len, 1, >>>>>> NULL, NULL); >>>>>> } >>>>>> @@ -2602,26 +2615,52 @@ struct AcpiBuildState { >>>>>> MemoryRegion *linker_mr; >>>>>> } AcpiBuildState; >>>>>> -static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg) >>>>>> +static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg) >>>>>> +{ >>>>>> + AcpiMcfgInfo *tmp; >>>>>> + while (mcfg) { >>>>>> + tmp = mcfg->next; >>>>>> + g_free(mcfg); >>>>>> + mcfg = tmp; >>>>>> + } >>>>>> +} >>>>>> + >>>>>> +static AcpiMcfgInfo *acpi_get_mcfg(void) >>>>>> { >>>>>> Object *pci_host; >>>>>> QObject *o; >>>>>> + AcpiMcfgInfo *head = NULL, *tail, *mcfg; >>>>>> pci_host = acpi_get_i386_pci_host(); >>>>>> g_assert(pci_host); >>>>>> - o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, >>>>>> NULL); >>>>>> - if (!o) { >>>>>> - return false; >>>>>> + while (pci_host) { >>>>>> + mcfg = g_new0(AcpiMcfgInfo, 1); >>>>>> + mcfg->next = NULL; >>>>>> + if (!head) { >>>>>> + tail = head = mcfg; >>>>>> + } else { >>>>>> + tail->next = mcfg; >>>>>> + tail = mcfg; >>>>>> + } >>>>>> + >>>>>> + o = object_property_get_qobject(pci_host, >>>>>> PCIE_HOST_MCFG_BASE, NULL); >>>>>> + if (!o) { >>>>>> + cleanup_mcfg(head); >>>>>> + g_free(mcfg); >>>>>> + return NULL; >>>>>> + } >>>>>> + mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o)); >>>>>> + qobject_unref(o); >>>>>> + >>>>>> + o = object_property_get_qobject(pci_host, >>>>>> PCIE_HOST_MCFG_SIZE, NULL); >>>>> I'll let Igor to comment on the APCI bits, but the idea of querying each >>>>> PCI host >>>>> for the MMFCG details and put it into a different table sounds good >>>>> to me. >>>>> >>>>> [Adding Laszlo for his insights] >>>> Thanks for the CC -- I don't have many (positive) insights here to >>>> offer, I'm afraid. First, the patch set (including the blurb) doesn't >>>> seem to explain *why* multiple host bridges / ECAM areas are a good >>>> idea. >>> The issue we want to solve is the hard limitation of 256 PCIe devices >>> that can be used in a Q35 machine. > Isn't it interesting that conventional PCI can easily support so many > more devices? Sorta makes you wonder why we care that virtual devices > are express rather than conventional for a high density configuration... > >> Implying that there are use cases for which ~256 PCIe devices aren't >> enough. I remain unconvinced until proved wrong :) >> >> Anyway, a significant source of waste comes from the restriction that we >> can only put 1 device (with up to 8 functions) on each non-root-complex >> PCI Express bus (such as root ports and downstream ports). This forfeits >> a huge portion of the ECAM area (about 31/32th) that we already have. >> Rather than spending more MMIO guest-phys address range on new >> discontiguous ECAM ranges, I'd prefer if we could look into ARI. I seem >> to recall from your earlier presentation that ARI could recover that >> lost address space (meaning both ECAM ranges and PCIe B/D/F address space). > How does ARI solve the hotplug problem? ARI is effectively > multifunction on steroids, +1 > the ARI capability in each function points > to the next function number so that we don't need to scan the entire > devfn address space per bus (an inefficiency we don't care about when > there are only 8 function). So yeah, we can fill an entire bus with > devices with ARI, but they're all rooted at 00.0. > >> There are signs that the edk2 core supports ARI if the underlying >> platform supports it. 
(Which is not the case with multiple PCIe domains >> / multiple ECAM ranges.) > It's pretty surprising to me that edk2 wouldn't already have support > for multiple PCIe domains, they're really not *that* uncommon. Some > architectures make almost gratuitous use of PCIe domains. I certainly > know there were UEFI ia64 platforms with multiple domains. > >> ARI support could also help aarch64/virt. Eric (CC'd) has been working >> on raising the max PCIe bus count from *16* to 256 for aarch64/virt, and >> AFAIR one of the challenges there has been finding a good spot for the >> larger ECAM in guest-phys address space. Fiddling with such address maps >> is always a pain. >> >> Back to x86, the guest-phys address space is quite crowded too. The >> 32-bit PCI MMIO aperture (with the neighboring ECAM and platform MMIO >> areas such as LAPIC, IO-APIC, HPET, TPM, pflash, ...) is always a scarce >> resource. Plus, reaching agreement between OVMF and QEMU on the exact >> location of the 32-bit PCI MMIO aperture has always been a huge pain; so >> you'd likely shoot for 64-bit. > Why do we need to equally split 32-bit MMIO space between domains? Put > it on domain 0 and require devices installed into non-zero domains have > no 32-bit dependencies. > >> But 64-bit is ill-partitioned and/or crowded too: first you have the >> cold-plugged >4GB DRAM (whose size the firmware can interrogate), then >> the hotplug DIMM area up to "etc/reserved-memory-end" (ditto), then the >> 64-bit PCI MMIO aperture (whose size the firmware decides itself, >> because QEMU cannot be asked about it presently). Placing the additional >> MMCFGs somewhere would need extending the total order between these >> things at the design level, more fw_cfg files, more sanity checks in >> platform code in the firmware, and, quite importantly, adding support to >> multiple discontiguous MMCFGs to core edk2 code. > Hmm, we're doing something wrong if we can't figure out some standards > within QEMU for placing per domain 64-bit MMIO and MMCFG ranges. > >> I don't know much about ARI but it already looks a whole lot more >> attractive to me. >> >>> We can use "PCI functions" to increase >>> the number, but then we loose the hot-plugging granularity. >>> >>> Currently pxb-pcie host bridges share the same PCI domain (0) >>> bus numbers, so adding more of them will not result in more usable >>> devices (each PCIe device resides on its own bus), >>> but putting each of them on a different domain removes >>> that limitation. >>> >>> Another possible use (there is a good chance I am wrong, adding Alex to >>> correct me), >>> may be the "modeling" of a multi-function assigned device. >>> Currently all the functions of an assigneddevice will be placed on >>> different PCIe >>> Root Ports "loosing" the connection between them. >>> >>> However, if we put all the functions of the same assigned device in the >>> same >>> PCI domain we may be able to "re-connect" them. >> OK -- why is that useful? What's the current practical problem with the >> splitting you describe? > There are very few cases where this is useful, things like associating > USB companion devices or translating a guest bus reset to host bus > reset when functions are split to separate virtual buses. That said, I > have no idea how multiple domains plays a factor here. Regardless of > how many PCIe domains a VM supports, we can easily use existing > multifunction within a single domain to expose multifunction assigned > devices in a way that better resembles the physical hardware. 
> Multifunction PCIe endpoints cannot span PCIe domains.

Sorry for the misunderstanding. I was referring to the complete opposite.
We want to assign two functions of the same physical device to the same
VM. They will land on different PCIe Root Ports and we may have issues
with the data flow between them. I was wondering if assigning them both
to the same PCI domain (different from 0) would help us avoid
implementing ACS for the PCIe Root Ports. But again, I may be way off
here.

> In fact, IOMMUs generally cannot span domains either, so we better also
> be thinking about at least a VT-d DRHD or vIOMMU per PCIe domain.

Right

Thanks,
Marcel

[...]
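To make the ARI exchange above concrete: a minimal sketch of how an
enumerator could follow the ARI "next function" links instead of probing
all 256 devfn slots on a bus. The config-space accessors below are
illustrative placeholders (not QEMU, SeaBIOS, or kernel API); only the
ARI extended capability ID and the layout of its capability register come
from the PCIe spec.

    #include <stdint.h>

    #define PCI_EXT_CAP_ID_ARI     0x0e   /* ARI extended capability ID */
    #define ARI_CAP_REG            0x04   /* ARI Capability Register offset */
    #define ARI_CAP_NEXT_FN_SHIFT  8      /* bits 15:8: Next Function Number */

    /* Hypothetical helpers, assumed to exist for the sake of the sketch. */
    uint32_t cfg_read32(uint8_t bus, uint8_t devfn, uint16_t offset);
    uint16_t find_ext_cap(uint8_t bus, uint8_t devfn, uint16_t cap_id);

    /*
     * With ARI the 5-bit device / 3-bit function split disappears: the
     * whole 8-bit devfn is a function number rooted at 00.0, and each
     * function's ARI capability names the next implemented function, so
     * the walk only touches functions that actually exist.
     */
    static void ari_enumerate_bus(uint8_t bus)
    {
        uint8_t fn = 0;

        do {
            uint16_t cap = find_ext_cap(bus, fn, PCI_EXT_CAP_ID_ARI);

            if (!cap) {
                break;                  /* function absent or not ARI-aware */
            }
            /* ... record/configure function 'fn' here ... */

            fn = (cfg_read32(bus, fn, cap + ARI_CAP_REG)
                  >> ARI_CAP_NEXT_FN_SHIFT) & 0xff;
        } while (fn != 0);              /* Next Function Number 0 ends the chain */
    }

As Alex notes, this fills a bus with up to 256 functions, but they all
hang off device 00.0, which is what limits per-function hotplug.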
On 05/23/2018 10:32 AM, Laszlo Ersek wrote:
> On 05/23/18 01:40, Michael S. Tsirkin wrote:
>> On Wed, May 23, 2018 at 12:42:09AM +0200, Laszlo Ersek wrote:
>>> Hold on,
>>>
>>> On 05/22/18 21:51, Laszlo Ersek wrote:
>>>
>>>> It had taken years until the edk2 core gained a universal
>>>> PciHostBridgeDxe driver with a well-defined platform customization
>>>> interface, and that interface doesn't support multiple domains /
>>>> segments.
>>>
>>> after doing a bit more research: I was wrong about this. What I
>>> remembered was not the current state. Edk2 does seem to support multiple
>>> domains, with different segment numbers, ECAM base addresses, and bus
>>> number ranges.

Good news!

>>> If we figure out a placement strategy or an easy to
>>> consume representation of these data for the firmware, it might be
>>> possible for OVMF to hook them into the edk2 core (although not in the
>>> earliest firmware phases, such as SEC and PEI).

Can you please remind me how OVMF places the 64-bit PCI hotplug window?
We may do something similar.

Let me emphasize: I am not implying you/anybody else should work on
that :), I just want to be on the same page if/when the time comes. For
the moment we are looking for a POC, nothing more.

>>> In retrospect, I'm honestly surprised that so much multi-segment support
>>> has been upstreamed to the edk2 core. Sorry about the FUD. (My general
>>> points remain valid, for the record... But perhaps they no longer matter
>>> for this discussion.)
>>>
>>> (I meant to send this message soon after
>>> <http://mid.mail-archive.com/fc603491-7c41-e862-a583-2bae6f165b5a@redhat.com>,
>>> but my internet connection had to die right then.)
>>>
>>> Thanks
>>> Laszlo
>>
>> Is there support for any hardware which we could emulate?
>
> I don't see any actual hw support in the edk2 project, but I'll ask.

I think we may be able to succeed with "standard" ACPI declarations of
the PCI segments + placing the extra MMCONFIG ranges before the 64-bit
PCI hotplug area.

Thanks,
Marcel

> Thanks
> Laszlo
On 05/23/2018 03:28 PM, Laszlo Ersek wrote:
> On 05/23/18 13:11, Zihan Yang wrote:
>> Hi all,
>> The original purpose was just to support multiple segments in Intel
>> Q35 architecture for PCIe topology, which makes bus number a less scarce
>> resource. The patches are very primitive and many things are left for
>> firmware to finish (the initial plan was to implement it in SeaBIOS),
>> the AML part in QEMU is not finished either. I'm not familiar with
>> OVMF or edk2, so there is no plan to touch it yet, but it seems not
>> necessary since it already supports multi-segment in the end.
>
> That's incorrect. EDK2 stands for "EFI Development Kit II", and it is a
> collection of "universal" (= platform- and ISA-independent) modules
> (drivers and libraries), and platform- and/or ISA-dependent modules
> (drivers and libraries). The OVMF firmware is built from a subset of
> these modules; the final firmware image includes modules from both
> categories -- universal modules, and modules specific to the i440fx and
> Q35 QEMU boards. The first category generally lives under MdePkg/,
> MdeModulePkg/, UefiCpuPkg/, NetworkPkg/, PcAtChipsetPkg, etc; while the
> second category lives under OvmfPkg/.
>
> (The exact same applies to the ArmVirtQemu firmware, with the second
> category consisting of ArmVirtPkg/ and OvmfPkg/ modules.)
>
> When we discuss anything PCI-related in edk2, it usually affects both
> categories:
>
> (a) the universal/core modules, such as
>
> - the PCI host bridge / root bridge driver at
>   "MdeModulePkg/Bus/Pci/PciHostBridgeDxe",
>
> - the PCI bus driver at "MdeModulePkg/Bus/Pci/PciBusDxe",
>
> (b) and the platform-specific modules, such as
>
> - "OvmfPkg/IncompatiblePciDeviceSupportDxe" which causes PciBusDxe to
>   allocate 64-bit MMIO BARs above 4 GB, regardless of option ROM
>   availability (as long as a CSM is not present), conserving 32-bit
>   MMIO aperture for 32-bit BARs,
>
> - "OvmfPkg/PciHotPlugInitDxe", which implements support for QEMU's
>   resource reservation hints, so that we can avoid IO space exhaustion
>   with many PCIe root ports, and so that we can reserve MMIO aperture
>   for hot-plugging devices with large MMIO BARs,
>
> - "OvmfPkg/Library/DxePciLibI440FxQ35", which is a low-level PCI
>   config space access library, usable in the DXE and later phases,
>   that plugs into several drivers, and uses 0xCF8/0xCFC on i440fx, and
>   ECAM on Q35,
>
> - "OvmfPkg/Library/PciHostBridgeLib", which plugs into
>   "PciHostBridgeDxe" above, exposing the various resource apertures to
>   said host bridge / root bridge driver, and implementing support for
>   the PXB / PXBe devices,
>
> - "OvmfPkg/PlatformPei", which is an early (PEI phase) module with a
>   grab-bag of platform support code; e.g. it informs
>   "DxePciLibI440FxQ35" above about the QEMU board being Q35 vs.
>   i440fx, it configures the ECAM (exbar) registers on Q35, it
>   determines where the 32-bit and 64-bit PCI MMIO apertures should be;
>
> - "ArmVirtPkg/Library/BaseCachingPciExpressLib", which is the
>   aarch64/virt counterpart of "DxePciLibI440FxQ35" above,
>
> - "ArmVirtPkg/Library/FdtPciHostBridgeLib", which is the aarch64/virt
>   counterpart of "PciHostBridgeLib", consuming the DTB exposed by
>   qemu-system-aarch64,
>
> - "ArmVirtPkg/Library/FdtPciPcdProducerLib", which is an internal
>   library that turns parts of the DTB that is exposed by
>   qemu-system-aarch64 into various PCI-related, firmware-wide, scalar
>   variables (called "PCDs"), upon which both
>   "BaseCachingPciExpressLib" and "FdtPciHostBridgeLib" rely.
>
> The point is that any PCI feature in any edk2 platform firmware comes
> together from (a) core module support for the feature, and (b) platform
> integration between the core code and the QEMU board in question.
>
> If (a) is missing, that implies a very painful uphill battle, which is
> why I'd been loudly whining, initially, in this thread, until I realized
> that the core support was there in edk2, for PCIe segments.
>
> However, (b) is required as well -- i.e., platform integration under
> OvmfPkg/ and perhaps ArmVirtPkg/, between the QEMU boards and the core
> edk2 code --, and that definitely doesn't exist for the PCIe segments
> feature.
>
> If (a) exists and is flexible enough, then we at least have a chance at
> writing the platform support code (b) for it. So that's why I've stopped
> whining. Writing (b) is never easy -- in this case, a great many of the
> platform modules that I've listed above, under OvmfPkg/ pathnames, could
> be affected, or even be eligible for replacement -- but (b) is at least
> imaginable practice. Modules in category (a) are shipped *in* -- not
> "on" -- every single physical UEFI platform that you can buy today,
> which is one reason why it's hugely difficult to implement nontrivial
> changes for them.
>
> In brief: your statement is incorrect because category (b) is missing.
> And that requires dedicated QEMU support, similarly to how
> "OvmfPkg/PciHotPlugInitDxe" requires the vendor-specific resource
> reservation capability, and how "OvmfPkg/Library/PciHostBridgeLib"
> consumes the "etc/extra-pci-roots" fw_cfg file, and how most everything
> that ArmVirtQemu does for PCI(e) originates from QEMU's DTB.
>
>> * 64-bit space is crowded and there are no standards within QEMU for
>> placing per domain 64-bit MMIO and MMCFG ranges
>> * We cannot put ECAM arbitrarily high because guest's PA width is
>> limited by host's when EPT is enabled.
>
> That's right. One argument is that firmware can lay out these apertures
> and ECAM ranges internally. But that argument breaks down when you hit
> the PCPU physical address width, and would like the management stack,
> such as libvirtd, to warn you in advance. For that, either libvirtd or
> QEMU has to know, or direct, the layout.
>
>> * NUMA modeling seems to be a stronger motivation than the limitation
>> of 256 bus numbers, that each NUMA node holds its own PCI(e)
>> sub-hierarchy

NUMA modeling is not the motivation; the motivation is that each PCI
domain can have up to 256 buses and the PCI Express architecture dictates
one PCI device per bus. The limitation we have with NUMA is that a PCI
Host-Bridge can belong to a single NUMA node.

> I'd also like to get more information about this -- I thought pxb-pci(e)
> was already motivated by supporting NUMA locality.

Right

> And, to my knowledge,
> pxb-pci(e) actually *solved* this problem. Am I wrong?

You are right.

> Let's say you
> have 16 NUMA nodes (which seems pretty large to me); is it really
> insufficient to assign ~16 devices to each node?

It is not about a "per node" limitation, it is about several scenarios:
- We have Ray from Intel trying to use 1000 virtio-net devices
  (God knows why :) ).
- We may have a VM managing some backups (tapes), we may have a lot of
  these.
- We may indeed want to create a nested solution as Michael mentioned.

The "main/hidden" issue: at some point we will switch to Q35 as the
default X86 machine (QEMU 3.0 :) ?) and then we don't want people to be
disappointed by such a "regression".
Thanks for your time Laszlo, and sorry for putting you in the spotlight.

Marcel

>
> Thanks
> Laszlo
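As a back-of-the-envelope illustration of the numbers argued about above
(the one-device-per-bus waste Laszlo estimates at about 31/32, and the
guest-phys cost of every extra domain), the sketch below only encodes the
fixed ECAM geometry -- 4 KiB of config space per function, 8 functions
per device, 32 devices per bus, 256 buses per segment. Nothing in it is
specific to QEMU or to this patch set.

    #include <stdio.h>

    int main(void)
    {
        const unsigned long cfg_per_fn    = 4096;   /* 4 KiB ECAM window per function */
        const unsigned long fns_per_dev   = 8;
        const unsigned long devs_per_bus  = 32;
        const unsigned long buses_per_seg = 256;

        unsigned long per_bus = cfg_per_fn * fns_per_dev * devs_per_bus;  /* 1 MiB   */
        unsigned long per_seg = per_bus * buses_per_seg;                  /* 256 MiB */

        printf("ECAM per bus:     %lu KiB\n", per_bus / 1024);
        printf("ECAM per segment: %lu MiB\n", per_seg / (1024 * 1024));

        /*
         * With a single endpoint (at most 8 functions) behind each root
         * port or downstream port, only 32 KiB of each 1 MiB bus slice is
         * usable, i.e. 31/32 of the ECAM range is wasted.  Conversely,
         * every additional PCI domain buys another 256 buses at the cost
         * of another 256 MiB ECAM window in guest-phys address space.
         */
        return 0;
    }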
On 05/23/18 19:11, Marcel Apfelbaum wrote: > On 05/23/2018 10:32 AM, Laszlo Ersek wrote: >> On 05/23/18 01:40, Michael S. Tsirkin wrote: >>> On Wed, May 23, 2018 at 12:42:09AM +0200, Laszlo Ersek wrote: >>>> If we figure out a placement strategy or an easy to consume >>>> representation of these data for the firmware, it might be possible >>>> for OVMF to hook them into the edk2 core (although not in the >>>> earliest firmware phases, such as SEC and PEI). > > Can you please remind me how OVMF places the 64-bit PCI hotplug > window? If you mean the 64-bit PCI MMIO aperture, I described it here in detail: https://bugzilla.redhat.com/show_bug.cgi?id=1353591#c8 I'll also quote it inline, before returning to your email: On 03/26/18 16:10, bugzilla@redhat.com wrote: > https://bugzilla.redhat.com/show_bug.cgi?id=1353591 > > Laszlo Ersek <lersek@redhat.com> changed: > > What |Removed |Added > ---------------------------------------------------------------------------- > Flags|needinfo?(lersek@redhat.com | > |) | > > > > --- Comment #8 from Laszlo Ersek <lersek@redhat.com> --- > Sure, I can attempt :) The function to look at is GetFirstNonAddress() > in "OvmfPkg/PlatformPei/MemDetect.c". I'll try to write it up here in > natural language (although I commented the function heavily as well). > > As an introduction, the "number of address bits" is a quantity that > the firmware itself needs to know, so that in the DXE phase page > tables exist that actually map that address space. The > GetFirstNonAddress() function (in the PEI phase) calculates the > highest *exclusive* address that the firmware might want or need to > use (in the DXE phase). > > (1) First we get the highest exclusive cold-plugged RAM address. > (There are two methods for this, the more robust one is to read QEMU's > E820 map, the older / less robust one is to calculate it from the > CMOS.) If the result would be <4GB, then we take exactly 4GB from this > step, because the firmware always needs to be able to address up to > 4GB. Note that this is already somewhat non-intuitive; for example, if > you have 4GB of RAM (as in, *amount*), it will go up to 6GB in the > guest-phys address space (because [0x8000_0000..0xFFFF_FFFF] is not > RAM but MMIO on q35). > > (2) If the DXE phase is 32-bit, then we're done. (No addresses >=4GB > can be accessed, either for RAM or MMIO.) For RHEL this is never the > case. > > (3) Grab the size of the 64-bit PCI MMIO aperture. This defaults to > 32GB, but a custom (OVMF specific) fw_cfg file (from the QEMU command > line) can resize it or even disable it. This aperture is relevant > because it's going to be the top of the address space that the > firmware is interested in. If the aperture is disabled (on the QEMU > cmdline), then we're done, and only the value from point (1) matters > -- that determines the address width we need. > > (4) OK, so we have a 64-bit PCI MMIO aperture (for allocating BARs out > of, later); we have to place it somewhere. The base cannot match the > value from (1) directly, because that would not leave room for the > DIMM hotplug area. So the end of that area is read from the fw_cfg > file "etc/reserved-memory-end". DIMM hotplug is enabled iff > "etc/reserved-memory-end" exists. If "etc/reserved-memory-end" exists, > then it is guaranteed to be larger than the value from (1) -- i.e., > top of cold-plugged RAM. > > (5) We round up the size of the 64-bit PCI aperture to 1GB. We also > round up the base of the same -- i.e., from (4) or (1), as appropriate > -- to 1GB. 
This is inspired by SeaBIOS, because this lets the host map > the aperture with 1GB hugepages. > > (6) The base address of the aperture is then rounded up so that it > ends up aligned "naturally". "Natural" alignment means that we take > the largest whole power of two (i.e., BAR size) that can fit *within* > the aperture (whose size comes from (3) and (5)) and use that BAR size > as alignment requirement. This is because the PciBusDxe driver sorts > the BARs in decreasing size order (and equivalently, decreasing > alignment order), for allocation in increasing address order, so if > our aperture base is aligned sufficiently for the largest BAR that can > theoretically fit into the aperture, then the base will be aligned > correctly for *any* other BAR that fits. > > For example, if you have a 32GB aperture size, then the largest BAR > that can fit is 32GB, so the alignment requirement in step (6) will be > 32GB. Whereas, if the user configures a 48GB aperture size in (3), > then your alignment will remain 32GB in step (6), because a 64GB BAR > would not fit, and a 32GB BAR (which fits) dictates a 32GB alignment. > > Thus we have the following "ladder" of ranges: > > (a) cold-plugged RAM (low, <2GB) > (b) 32-bit PCI MMIO aperture, ECAM/MMCONFIG, APIC, pflash, etc (<4GB) > (c) cold-plugged RAM (high, >=4GB) > (d) DIMM hot-plug area > (e) padding up to 1GB alignment (for hugepages) > (f) padding up to the natural alignment of the 64-bit PCI MMIO > aperture size (32GB by default) > (g) 64-bit PCI MMIO aperture > > To my understanding, "maxmem" determines the end of (d). And, the > address width is dictated by the end of (g). > > Two more examples. > > - If you have 36 phys address bits, that doesn't let you use > maxmem=32G. This is because maxmem=32G puts the end of the DIMM > hotplug area (d) strictly *above* 32GB (due to the "RAM gap" (b)), > and then the padding (f) places the 64-bit PCI MMIO aperture at > 64GB. So 36 phys address bits don't suffice. > > - On the other hand, if you have 37 phys address bits, that *should* > let you use maxmem=64G. While the DIMM hot-plug area will end > strictly above 64GB, the 64-bit PCI MMIO aperture (of size 32GB) can > be placed at 96GB, so it will all fit into 128GB (i.e. 37 address > bits). > > Sorry if this is confusing, I got very little sleep last night. > Back to your email: On 05/23/18 19:11, Marcel Apfelbaum wrote: > I think we may be able to succeed with "standard" APCI declarations of > the PCI segments + placing the extra MMCONFIG ranges before the 64-bit > PCI hotplug area. That idea could work, but firmware will need hints about it. Thanks! Laszlo
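A compressed sketch of the placement ladder described above; the helper
names and parameters are made up for illustration (this is not the actual
GetFirstNonAddress() code in OvmfPkg/PlatformPei), but the steps mirror
(1)-(6) and the (a)-(g) ladder, with the 32 GB default aperture mentioned
in the text.

    #include <stdint.h>

    #define GiB (1ULL << 30)

    static uint64_t align_up(uint64_t v, uint64_t a)
    {
        return (v + a - 1) & ~(a - 1);
    }

    /* Step (6): largest power of two that still fits inside the aperture. */
    static uint64_t natural_alignment(uint64_t aperture_size)
    {
        uint64_t a = 1;

        while (a * 2 <= aperture_size) {
            a *= 2;
        }
        return a;
    }

    /*
     * first_non_ram:       end of cold-plugged RAM, at least 4 GiB (step 1)
     * reserved_memory_end: end of the DIMM hotplug area, 0 if absent (step 4)
     * aperture_size:       64-bit PCI MMIO aperture size, 0 if disabled,
     *                      32 GiB by default (step 3)
     *
     * Returns the first address the firmware no longer has to map, i.e. the
     * exclusive top of range (g) that dictates the required address width.
     */
    static uint64_t first_non_address(uint64_t first_non_ram,
                                      uint64_t reserved_memory_end,
                                      uint64_t aperture_size)
    {
        uint64_t base = reserved_memory_end ? reserved_memory_end
                                            : first_non_ram;

        if (aperture_size == 0) {
            return first_non_ram;                             /* steps (1)-(3) */
        }
        aperture_size = align_up(aperture_size, GiB);         /* step (5) */
        base = align_up(base, GiB);                           /* step (5), range (e) */
        base = align_up(base, natural_alignment(aperture_size)); /* step (6), (f) */

        return base + aperture_size;                          /* end of range (g) */
    }

Plugging in the two examples from the text: with maxmem=32G the hotplug
area ends just above 32 GiB, the aperture lands at 64 GiB and the result
is 96 GiB (so 36 address bits are not enough); with maxmem=64G the
aperture lands at 96 GiB and the result is 128 GiB, which just fits into
37 bits.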
On 05/23/2018 02:11 PM, Zihan Yang wrote:
> Hi all,
>
> Thanks for all your comments and suggestions, I wasn't expecting so
> many professional reviewers. Some of the things you mentioned are
> beyond my knowledge right now. Please correct me if I'm wrong below.
>
> The original purpose was just to support multiple segments in Intel
> Q35 architecture for PCIe topology, which makes bus number a less
> scarce resource. The patches are very primitive and many things are
> left for firmware to finish (the initial plan was to implement it in
> SeaBIOS), the AML part in QEMU is not finished either. I'm not
> familiar with OVMF or edk2, so there is no plan to touch it yet, but
> it seems not necessary since it already supports multi-segment in the
> end.
>
> Also, in this patch the assumption is one domain per host bridge,
> described by '_SEG()' in AML, which means an ECAM range per host
> bridge, but that should be configurable if the user prefers staying in
> the same domain, I guess?

Yes.

> I'd like to list a few things you've discussed to confirm I don't get
> it wrong
>
> * ARI enlarges the number of functions, but it does not solve the
>   hot-plug issue. The multifunction PCIe endpoints cannot span PCIe
>   domains

Right

> * IOMMUs cannot span domains either, so bringing new domains
>   introduces the need to add a VT-d DRHD or vIOMMU per PCIe domain

Not really, you may have PCI domains not associated with a vIOMMU. As a
first step, you should not deal with it. The IOMMU can't span over
multiple domains, yes.

> * 64-bit space is crowded and there are no standards within QEMU for
>   placing per domain 64-bit MMIO and MMCFG ranges

Yes, but we do have some layout for the "over 4G" area and we can
continue building on it.

> * NUMA modeling seems to be a stronger motivation than the limitation
>   of 256 bus numbers, that each NUMA node holds its own PCI(e)
>   sub-hierarchy

No, the 256 devices limitation is the biggest issue we are trying to
solve.

> * We cannot put ECAM arbitrarily high because guest's PA width is
>   limited by host's when EPT is enabled.

Indeed, we should be careful that the size of the MMCFGs does not exceed
the CPU addressable bits.

> * Compatibility issues in platforms that do not have MCFG table at all
>   (perhaps we limit it to only q35 at present, in which MCFG is
>   present).

For sure.

> Based on your discussions, I guess this proposal is still worth doing
> overall, but it seems many restrictions should be imposed to be
> compatible with some complicated situations.

Correct.

> Please correct me if I get anything wrong or missing.

You are on the right path, this discussion is meant to help you
understand wider concerns and make you aware of different constraints we
didn't think about.

Good luck with the next version!

Thanks,
Marcel

> Thanks
> Zihan
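Since the summary above leans on "one domain per host bridge, described
by '_SEG()' in AML", here is a rough sketch of what the (still
unfinished) AML side could look like in QEMU's DSDT generation. The
aml_device()/aml_name_decl()/aml_int()/aml_eisaid() helpers are QEMU's
existing AML builders; the function itself, its parameters and its exact
placement are assumptions for illustration, not part of the posted
patches.

    /* Sketch only: declare one extra PCIe host bridge per PCI domain. */
    static void build_pxb_pcie_host(Aml *scope, int domain_nr, int numa_node)
    {
        Aml *dev = aml_device("PC%.02X", domain_nr);

        aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A08")));
        aml_append(dev, aml_name_decl("_CID", aml_eisaid("PNP0A03")));
        aml_append(dev, aml_name_decl("_UID", aml_int(domain_nr)));
        /* The segment number ties this bridge to its own MCFG entry. */
        aml_append(dev, aml_name_decl("_SEG", aml_int(domain_nr)));
        /* Bus numbers restart at 0 inside the new domain. */
        aml_append(dev, aml_name_decl("_BBN", aml_int(0)));
        if (numa_node != -1) {
            aml_append(dev, aml_name_decl("_PXM", aml_int(numa_node)));
        }
        /* _CRS (bus number range, IO and MMIO windows) would follow here. */

        aml_append(scope, dev);
    }

The guest OS would then match each host bridge's _SEG value against the
pci_segment field of the corresponding MCFG allocation entry, which is
what the build_mcfg_q35() changes in this series are meant to provide.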
> > The original purpose was just to support multiple segments in Intel > > Q35 archtecure for PCIe topology, which makes bus number a less scarce > > resource. The patches are very primitive and many things are left for > > firmware to finish(the initial plan was to implement it in SeaBIOS), > > the AML part in QEMU is not finished either. I'm not familiar with > > OVMF or edk2, so there is no plan to touch it yet, but it seems not > > necessary since it already supports multi-segment in the end. > > That's incorrect. EDK2 stands for "EFI Development Kit II", and it is a > collection of "universal" (= platform- and ISA-independent) modules > (drivers and libraries), and platfor- and/or ISA-dependent modules > (drivers and libraries). The OVMF firmware is built from a subset of > these modules; the final firmware image includes modules from both > categories -- universal modules, and modules specific to the i440fx and > Q35 QEMU boards. The first category generally lives under MdePkg/, > MdeModulePkg/, UefiCpuPkg/, NetworkPkg/, PcAtChipsetPkg, etc; while the > second category lives under OvmfPkg/. > > (The exact same applies to the ArmVirtQemu firmware, with the second > category consisting of ArmVirtPkg/ and OvmfPkg/ modules.) > > When we discuss anything PCI-related in edk2, it usually affects both > categories: > > (a) the universal/core modules, such as > > - the PCI host bridge / root bridge driver at > "MdeModulePkg/Bus/Pci/PciHostBridgeDxe", > > - the PCI bus driver at "MdeModulePkg/Bus/Pci/PciBusDxe", > > (b) and the platform-specific modules, such as > > - "OvmfPkg/IncompatiblePciDeviceSupportDxe" which causes PciBusDxe to > allocate 64-bit MMIO BARs above 4 GB, regardless of option ROM > availability (as long as a CSM is not present), conserving 32-bit > MMIO aperture for 32-bit BARs, > > - "OvmfPkg/PciHotPlugInitDxe", which implements support for QEMU's > resource reservation hints, so that we can avoid IO space exhaustion > with many PCIe root ports, and so that we can reserve MMIO aperture > for hot-plugging devices with large MMIO BARs, > > - "OvmfPkg/Library/DxePciLibI440FxQ35", which is a low-level PCI > config space access library, usable in the DXE and later phases, > that plugs into several drivers, and uses 0xCF8/0xCFC on i440x, and > ECAM on Q35, > > - "OvmfPkg/Library/PciHostBridgeLib", which plugs into > "PciHostBridgeDxe" above, exposing the various resource apertures to > said host bridge / root bridge driver, and implementing support for > the PXB / PXBe devices, > > - "OvmfPkg/PlatformPei", which is an early (PEI phase) module with a > grab-bag of platform support code; e.g. it informs > "DxePciLibI440FxQ35" above about the QEMU board being Q35 vs. > i440fx, it configures the ECAM (exbar) registers on Q35, it > determines where the 32-bit and 64-bit PCI MMIO apertures should be; > > - "ArmVirtPkg/Library/BaseCachingPciExpressLib", which is the > aarch64/virt counterpart of "DxePciLibI440FxQ35" above, > > - "ArmVirtPkg/Library/FdtPciHostBridgeLib", which is the aarch64/virt > counterpart of "PciHostBridgeLib", consuming the DTB exposed by > qemu-system-aarch64, > > - "ArmVirtPkg/Library/FdtPciPcdProducerLib", which is an internal > library that turns parts of the DTB that is exposed by > qemu-system-aarch64 into various PCI-related, firmware-wide, scalar > variables (called "PCDs"), upon which both > "BaseCachingPciExpressLib" and "FdtPciHostBridgeLib" rely. 
>
> The point is that any PCI feature in any edk2 platform firmware comes
> together from (a) core module support for the feature, and (b) platform
> integration between the core code and the QEMU board in question.
>
> If (a) is missing, that implies a very painful uphill battle, which is
> why I'd been loudly whining, initially, in this thread, until I realized
> that the core support was there in edk2, for PCIe segments.
>
> However, (b) is required as well -- i.e., platform integration under
> OvmfPkg/ and perhaps ArmVirtPkg/, between the QEMU boards and the core
> edk2 code --, and that definitely doesn't exist for the PCIe segments
> feature.
>
> If (a) exists and is flexible enough, then we at least have a chance at
> writing the platform support code (b) for it. So that's why I've stopped
> whining. Writing (b) is never easy -- in this case, a great many of the
> platform modules that I've listed above, under OvmfPkg/ pathnames, could
> be affected, or even be eligible for replacement -- but (b) is at least
> imaginable practice. Modules in category (a) are shipped *in* -- not
> "on" -- every single physical UEFI platform that you can buy today,
> which is one reason why it's hugely difficult to implement nontrivial
> changes for them.
>
> In brief: your statement is incorrect because category (b) is missing.
> And that requires dedicated QEMU support, similarly to how
> "OvmfPkg/PciHotPlugInitDxe" requires the vendor-specific resource
> reservation capability, and how "OvmfPkg/Library/PciHostBridgeLib"
> consumes the "etc/extra-pci-roots" fw_cfg file, and how most everything
> that ArmVirtQemu does for PCI(e) originates from QEMU's DTB.

Thanks for clarifying; I didn't realize the complexity in OVMF. I think
at least for now I'll only focus on SeaBIOS.

> > * NUMA modeling seems to be a stronger motivation than the limitation
> > of 256 bus numbers, that each NUMA node holds its own PCI(e)
> > sub-hierarchy
>
> I'd also like to get more information about this -- I thought pxb-pci(e)
> was already motivated by supporting NUMA locality. And, to my knowledge,
> pxb-pci(e) actually *solved* this problem. Am I wrong? Let's say you
> have 16 NUMA nodes (which seems pretty large to me); is it really
> insufficient to assign ~16 devices to each node?

It seems my previous statement about the motivation was incorrect; the
device number limitation is still the motivation, as Marcel points out in
a later comment. NUMA modeling just happens to fit into this situation.

Thanks
Zihan
2018-05-24 1:33 GMT+08:00 Marcel Apfelbaum <marcel.apfelbaum@gmail.com>:
>
>> * IOMMUs cannot span domains either, so bringing new domains introduces
>> the need to add a VT-d DRHD or vIOMMU per PCIe domain
>
> Not really, you may have PCI domains not associated with a vIOMMU. As a
> first step, you should not deal with it. The IOMMU can't span over
> multiple domains, yes.

OK, I'll leave the IOMMU part out for the present.

>> * 64-bit space is crowded and there are no standards within QEMU for
>> placing per domain 64-bit MMIO and MMCFG ranges
>
> Yes, but we do have some layout for the "over 4G" area and we can
> continue building on it.

That sounds good.

>> * NUMA modeling seems to be a stronger motivation than the limitation of
>> 256 bus numbers, that each NUMA node holds its own PCI(e) sub-hierarchy
>
> No, the 256 devices limitation is the biggest issue we are trying to
> solve.

Great, the initial purpose still holds.

> You are on the right path, this discussion is meant to help you
> understand wider concerns and make you aware of different constraints we
> didn't think about.
>
> Good luck with the next version!

It is indeed very helpful. I'll try to deal with (at least most of) the
issues in v2.

Thanks for your valuable reviews!
Zihan
On 05/23/18 09:32, Laszlo Ersek wrote: > On 05/23/18 01:40, Michael S. Tsirkin wrote: >> On Wed, May 23, 2018 at 12:42:09AM +0200, Laszlo Ersek wrote: >>> Hold on, >>> >>> On 05/22/18 21:51, Laszlo Ersek wrote: >>> >>>> It had taken years until the edk2 core gained a universal >>>> PciHostBridgeDxe driver with a well-defined platform customization >>>> interface, and that interface doesn't support multiple domains / >>>> segments. >>> >>> after doing a bit more research: I was wrong about this. What I >>> remembered was not the current state. Edk2 does seem to support multiple >>> domains, with different segment numbers, ECAM base addresses, and bus >>> number ranges. If we figure out a placement strategy or an easy to >>> consume representation of these data for the firmware, it might be >>> possible for OVMF to hook them into the edk2 core (although not in the >>> earliest firmware phases, such as SEC and PEI). >>> >>> In retrospect, I'm honestly surprised that so much multi-segment support >>> has been upstreamed to the edk2 core. Sorry about the FUD. (My general >>> points remain valid, for the record... But perhaps they no longer matter >>> for this discussion.) >>> >>> (I meant to send this message soon after >>> <http://mid.mail-archive.com/fc603491-7c41-e862-a583-2bae6f165b5a@redhat.com>, >>> but my internet connection had to die right then.) >>> >>> Thanks >>> Laszlo >> >> Is there support for any hardware which we could emulate? > > I don't see any actual hw support in the edk2 project, but I'll ask. I got an answer in the negative. Thanks Laszlo
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 9bc6d97..808d815 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -89,6 +89,7 @@
 typedef struct AcpiMcfgInfo {
     uint64_t mcfg_base;
     uint32_t mcfg_size;
+    struct AcpiMcfgInfo *next;
 } AcpiMcfgInfo;

 typedef struct AcpiPmInfo {
@@ -2427,14 +2428,15 @@ build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info)
 {
     AcpiTableMcfg *mcfg;
     const char *sig;
-    int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]);
+    int len, count = 0;
+    AcpiMcfgInfo *cfg = info;

+    while (cfg) {
+        ++count;
+        cfg = cfg->next;
+    }
+    len = sizeof(*mcfg) + count * sizeof(mcfg->allocation[0]);
     mcfg = acpi_data_push(table_data, len);
-    mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base);
-    /* Only a single allocation so no need to play with segments */
-    mcfg->allocation[0].pci_segment = cpu_to_le16(0);
-    mcfg->allocation[0].start_bus_number = 0;
-    mcfg->allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->mcfg_size - 1);

     /* MCFG is used for ECAM which can be enabled or disabled by guest.
      * To avoid table size changes (which create migration issues),
@@ -2448,6 +2450,17 @@ build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info)
     } else {
         sig = "MCFG";
     }
+
+    count = 0;
+    while (info) {
+        mcfg[count].allocation[0].address = cpu_to_le64(info->mcfg_base);
+        /* Only a single allocation so no need to play with segments */
+        mcfg[count].allocation[0].pci_segment = cpu_to_le16(count);
+        mcfg[count].allocation[0].start_bus_number = 0;
+        mcfg[count++].allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->mcfg_size - 1);
+        info = info->next;
+    }
+
     build_header(linker, table_data, (void *)mcfg, sig, len, 1, NULL, NULL);
 }

@@ -2602,26 +2615,52 @@ struct AcpiBuildState {
     MemoryRegion *linker_mr;
 } AcpiBuildState;

-static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg)
+static inline void cleanup_mcfg(AcpiMcfgInfo *mcfg)
+{
+    AcpiMcfgInfo *tmp;
+    while (mcfg) {
+        tmp = mcfg->next;
+        g_free(mcfg);
+        mcfg = tmp;
+    }
+}
+
+static AcpiMcfgInfo *acpi_get_mcfg(void)
 {
     Object *pci_host;
     QObject *o;
+    AcpiMcfgInfo *head = NULL, *tail, *mcfg;

     pci_host = acpi_get_i386_pci_host();
     g_assert(pci_host);

-    o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, NULL);
-    if (!o) {
-        return false;
+    while (pci_host) {
+        mcfg = g_new0(AcpiMcfgInfo, 1);
+        mcfg->next = NULL;
+        if (!head) {
+            tail = head = mcfg;
+        } else {
+            tail->next = mcfg;
+            tail = mcfg;
+        }
+
+        o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, NULL);
+        if (!o) {
+            cleanup_mcfg(head);
+            g_free(mcfg);
+            return NULL;
+        }
+        mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o));
+        qobject_unref(o);
+
+        o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_SIZE, NULL);
+        assert(o);
+        mcfg->mcfg_size = qnum_get_uint(qobject_to(QNum, o));
+        qobject_unref(o);
+
+        pci_host = OBJECT(QTAILQ_NEXT(PCI_HOST_BRIDGE(pci_host), next));
     }
-    mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o));
-    qobject_unref(o);
-
-    o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_SIZE, NULL);
-    assert(o);
-    mcfg->mcfg_size = qnum_get_uint(qobject_to(QNum, o));
-    qobject_unref(o);
-    return true;
+    return head;
 }

 static
@@ -2633,7 +2672,7 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
     unsigned facs, dsdt, rsdt, fadt;
     AcpiPmInfo pm;
     AcpiMiscInfo misc;
-    AcpiMcfgInfo mcfg;
+    AcpiMcfgInfo *mcfg;
     Range pci_hole, pci_hole64;
     uint8_t *u;
     size_t aml_len = 0;
@@ -2714,10 +2753,12 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
             build_slit(tables_blob, tables->linker);
         }
     }
-    if (acpi_get_mcfg(&mcfg)) {
+    if ((mcfg = acpi_get_mcfg()) != NULL) {
         acpi_add_table(table_offsets, tables_blob);
-        build_mcfg_q35(tables_blob, tables->linker, &mcfg);
+        build_mcfg_q35(tables_blob, tables->linker, mcfg);
     }
+    cleanup_mcfg(mcfg);
+
     if (x86_iommu_get_default()) {
         IommuType IOMMUType = x86_iommu_get_type();
         if (IOMMUType == TYPE_AMD) {
Currently only the q35 host bridge is allocated space in the MCFG table. To
put pxb hosts into separate PCI domains, each of them should have its own
configuration space in the MCFG table.

Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
---
 hw/i386/acpi-build.c | 83 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 62 insertions(+), 21 deletions(-)