diff mbox series

[RFC,4/6] i386/pc: Keep PCI 64-bit hole within usable IOVA space

Message ID 20210622154905.30858-5-joao.m.martins@oracle.com
State New
Headers show
Series i386/pc: Fix creation of >= 1Tb guests on AMD systems with IOMMU | expand

Commit Message

Joao Martins June 22, 2021, 3:49 p.m. UTC
pci_memory initialized by q35 and i440fx is set to a range
of 0 .. UINT64_MAX, and as a consequence when ACPI and pci-host
pick the hole64_start it does not account for allowed IOVA ranges.

Rather than blindly returning, round up the hole64_start
value to the allowable IOVA range, such that it accounts for
the 1Tb hole *on AMD*. On Intel it returns the input value
for hole64 start.

Suggested-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 hw/i386/pc.c         | 17 +++++++++++++++--
 hw/pci-host/i440fx.c |  4 +++-
 hw/pci-host/q35.c    |  4 +++-
 include/hw/i386/pc.h |  3 ++-
 4 files changed, 23 insertions(+), 5 deletions(-)

Comments

Igor Mammedov June 23, 2021, 12:30 p.m. UTC | #1
On Tue, 22 Jun 2021 16:49:03 +0100
Joao Martins <joao.m.martins@oracle.com> wrote:

> pci_memory initialized by q35 and i440fx is set to a range
> of 0 .. UINT64_MAX, and as a consequence when ACPI and pci-host
> pick the hole64_start it does not account for allowed IOVA ranges.
> 
> Rather than blindly returning, round up the hole64_start
> value to the allowable IOVA range, such that it accounts for
> the 1Tb hole *on AMD*. On Intel it returns the input value
> for hole64 start.

wouldn't that work only in case where guest firmware hadn't
mapped any PCI bars above 4Gb (possibly in not allowed region).

Looking at Seabios, it uses reserved_memory_end as the start
of PCI64 window. Not sure about OVMF,
 CCing Laszlo.

> Suggested-by: David Edmondson <david.edmondson@oracle.com>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>  hw/i386/pc.c         | 17 +++++++++++++++--
>  hw/pci-host/i440fx.c |  4 +++-
>  hw/pci-host/q35.c    |  4 +++-
>  include/hw/i386/pc.h |  3 ++-
>  4 files changed, 23 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 2e2ea82a4661..65885cc16037 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1141,7 +1141,7 @@ void pc_memory_init(PCMachineState *pcms,
>   * The 64bit pci hole starts after "above 4G RAM" and
>   * potentially the space reserved for memory hotplug.
>   */
> -uint64_t pc_pci_hole64_start(void)
> +uint64_t pc_pci_hole64_start(uint64_t size)
>  {
>      PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
>      PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
> @@ -1155,12 +1155,25 @@ uint64_t pc_pci_hole64_start(void)
>              hole64_start += memory_region_size(&ms->device_memory->mr);
>          }
>      } else {
> -        hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size;
> +        if (!x86ms->above_1t_mem_size) {
> +            hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size;
> +        } else {
> +            hole64_start = x86ms->above_1t_maxram_start;
> +        }
>      }

> +    hole64_start = allowed_round_up(hole64_start, size);

I'm not sure but, might it cause problems if there were BARs placed
by firmware in region below rounded up value?
(i.e. ACPI description which uses PCI_HOST_PROP_PCI_HOLE_START property
won't match whatever firmware programmed due to rounding pushing hole
start up)

>      return ROUND_UP(hole64_start, 1 * GiB);
>  }
>  
> +uint64_t pc_pci_hole64_start_aligned(uint64_t start, uint64_t size)
> +{
> +    if (nb_iova_ranges == DEFAULT_NR_USABLE_IOVAS) {
> +        return start;
> +    }
> +    return allowed_round_up(start, size);
> +}
> +
>  DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus)
>  {
>      DeviceState *dev = NULL;
> diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
> index 28c9bae89944..e8eaebfe1034 100644
> --- a/hw/pci-host/i440fx.c
> +++ b/hw/pci-host/i440fx.c
> @@ -163,8 +163,10 @@ static uint64_t i440fx_pcihost_get_pci_hole64_start_value(Object *obj)
>      pci_bus_get_w64_range(h->bus, &w64);
>      value = range_is_empty(&w64) ? 0 : range_lob(&w64);
>      if (!value && s->pci_hole64_fix) {
> -        value = pc_pci_hole64_start();
> +        value = pc_pci_hole64_start(s->pci_hole64_size);
>      }
> +    /* This returns @value when not on AMD */
> +    value = pc_pci_hole64_start_aligned(value, s->pci_hole64_size);
>      return value;
>  }
>  
> diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
> index 2eb729dff585..d556eb965ddb 100644
> --- a/hw/pci-host/q35.c
> +++ b/hw/pci-host/q35.c
> @@ -126,8 +126,10 @@ static uint64_t q35_host_get_pci_hole64_start_value(Object *obj)
>      pci_bus_get_w64_range(h->bus, &w64);
>      value = range_is_empty(&w64) ? 0 : range_lob(&w64);
>      if (!value && s->pci_hole64_fix) {
> -        value = pc_pci_hole64_start();
> +        value = pc_pci_hole64_start(s->mch.pci_hole64_size);
>      }
> +    /* This returns @value when not on AMD */
> +    value = pc_pci_hole64_start_aligned(value, s->mch.pci_hole64_size);
>      return value;
>  }
>  
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 73b8e2900c72..b924aef3a218 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -217,7 +217,8 @@ void pc_memory_init(PCMachineState *pcms,
>                      MemoryRegion *system_memory,
>                      MemoryRegion *rom_memory,
>                      MemoryRegion **ram_memory);
> -uint64_t pc_pci_hole64_start(void);
> +uint64_t pc_pci_hole64_start(uint64_t size);
> +uint64_t pc_pci_hole64_start_aligned(uint64_t value, uint64_t size);
>  DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus);
>  void pc_basic_device_init(struct PCMachineState *pcms,
>                            ISABus *isa_bus, qemu_irq *gsi,
Joao Martins June 23, 2021, 1:22 p.m. UTC | #2
On 6/23/21 1:30 PM, Igor Mammedov wrote:
> On Tue, 22 Jun 2021 16:49:03 +0100
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> pci_memory initialized by q35 and i440fx is set to a range
>> of 0 .. UINT64_MAX, and as a consequence when ACPI and pci-host
>> pick the hole64_start it does not account for allowed IOVA ranges.
>>
>> Rather than blindly returning, round up the hole64_start
>> value to the allowable IOVA range, such that it accounts for
>> the 1Tb hole *on AMD*. On Intel it returns the input value
>> for hole64 start.
> 
> wouldn't that work only in case where guest firmware hadn't
> mapped any PCI bars above 4Gb (possibly in not allowed region).
> 
> Looking at Seabios, it uses reserved_memory_end as the start
> of PCI64 window. Not sure about OVMF,
>  CCing Laszlo.
> 
Hmmm, perhaps only in the case that I don't advertise the reserved
region (i.e. mem size not being big enough to let the guest know in
the e820). But then that it begs the question, should we then always
advertise the said HT region as reserved regardless of memory size?

>> Suggested-by: David Edmondson <david.edmondson@oracle.com>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>>  hw/i386/pc.c         | 17 +++++++++++++++--
>>  hw/pci-host/i440fx.c |  4 +++-
>>  hw/pci-host/q35.c    |  4 +++-
>>  include/hw/i386/pc.h |  3 ++-
>>  4 files changed, 23 insertions(+), 5 deletions(-)
>>
>> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
>> index 2e2ea82a4661..65885cc16037 100644
>> --- a/hw/i386/pc.c
>> +++ b/hw/i386/pc.c
>> @@ -1141,7 +1141,7 @@ void pc_memory_init(PCMachineState *pcms,
>>   * The 64bit pci hole starts after "above 4G RAM" and
>>   * potentially the space reserved for memory hotplug.
>>   */
>> -uint64_t pc_pci_hole64_start(void)
>> +uint64_t pc_pci_hole64_start(uint64_t size)
>>  {
>>      PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
>>      PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
>> @@ -1155,12 +1155,25 @@ uint64_t pc_pci_hole64_start(void)
>>              hole64_start += memory_region_size(&ms->device_memory->mr);
>>          }
>>      } else {
>> -        hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size;
>> +        if (!x86ms->above_1t_mem_size) {
>> +            hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size;
>> +        } else {
>> +            hole64_start = x86ms->above_1t_maxram_start;
>> +        }
>>      }
> 
>> +    hole64_start = allowed_round_up(hole64_start, size);
> 
> I'm not sure but, might it cause problems if there were BARs placed
> by firmware in region below rounded up value?
> (i.e. ACPI description which uses PCI_HOST_PROP_PCI_HOLE_START property
> won't match whatever firmware programmed due to rounding pushing hole
> start up)
> 
But wouldn't then the problem be firmware ignoring E820 to avoid putting
firmware region where it shouldn't? Unless of course, it wasn't advertised
like I mentioned earlier.

>>      return ROUND_UP(hole64_start, 1 * GiB);
>>  }
>>  
>> +uint64_t pc_pci_hole64_start_aligned(uint64_t start, uint64_t size)
>> +{
>> +    if (nb_iova_ranges == DEFAULT_NR_USABLE_IOVAS) {
>> +        return start;
>> +    }
>> +    return allowed_round_up(start, size);
>> +}
>> +
>>  DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus)
>>  {
>>      DeviceState *dev = NULL;
>> diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
>> index 28c9bae89944..e8eaebfe1034 100644
>> --- a/hw/pci-host/i440fx.c
>> +++ b/hw/pci-host/i440fx.c
>> @@ -163,8 +163,10 @@ static uint64_t i440fx_pcihost_get_pci_hole64_start_value(Object *obj)
>>      pci_bus_get_w64_range(h->bus, &w64);
>>      value = range_is_empty(&w64) ? 0 : range_lob(&w64);
>>      if (!value && s->pci_hole64_fix) {
>> -        value = pc_pci_hole64_start();
>> +        value = pc_pci_hole64_start(s->pci_hole64_size);
>>      }
>> +    /* This returns @value when not on AMD */
>> +    value = pc_pci_hole64_start_aligned(value, s->pci_hole64_size);
>>      return value;
>>  }
>>  
>> diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
>> index 2eb729dff585..d556eb965ddb 100644
>> --- a/hw/pci-host/q35.c
>> +++ b/hw/pci-host/q35.c
>> @@ -126,8 +126,10 @@ static uint64_t q35_host_get_pci_hole64_start_value(Object *obj)
>>      pci_bus_get_w64_range(h->bus, &w64);
>>      value = range_is_empty(&w64) ? 0 : range_lob(&w64);
>>      if (!value && s->pci_hole64_fix) {
>> -        value = pc_pci_hole64_start();
>> +        value = pc_pci_hole64_start(s->mch.pci_hole64_size);
>>      }
>> +    /* This returns @value when not on AMD */
>> +    value = pc_pci_hole64_start_aligned(value, s->mch.pci_hole64_size);
>>      return value;
>>  }
>>  
>> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
>> index 73b8e2900c72..b924aef3a218 100644
>> --- a/include/hw/i386/pc.h
>> +++ b/include/hw/i386/pc.h
>> @@ -217,7 +217,8 @@ void pc_memory_init(PCMachineState *pcms,
>>                      MemoryRegion *system_memory,
>>                      MemoryRegion *rom_memory,
>>                      MemoryRegion **ram_memory);
>> -uint64_t pc_pci_hole64_start(void);
>> +uint64_t pc_pci_hole64_start(uint64_t size);
>> +uint64_t pc_pci_hole64_start_aligned(uint64_t value, uint64_t size);
>>  DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus);
>>  void pc_basic_device_init(struct PCMachineState *pcms,
>>                            ISABus *isa_bus, qemu_irq *gsi,
>
Laszlo Ersek June 23, 2021, 4:33 p.m. UTC | #3
Adding Marcel and Dave.

Adding Alex (seriously, a vfio- / iommu-related patch set without Alex on CC?)

comments below

On 06/23/21 14:30, Igor Mammedov wrote:
> On Tue, 22 Jun 2021 16:49:03 +0100
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> pci_memory initialized by q35 and i440fx is set to a range
>> of 0 .. UINT64_MAX, and as a consequence when ACPI and pci-host
>> pick the hole64_start it does not account for allowed IOVA ranges.
>>
>> Rather than blindly returning, round up the hole64_start
>> value to the allowable IOVA range, such that it accounts for
>> the 1Tb hole *on AMD*. On Intel it returns the input value
>> for hole64 start.
> 
> wouldn't that work only in case where guest firmware hadn't
> mapped any PCI bars above 4Gb (possibly in not allowed region).
> 
> Looking at Seabios, it uses reserved_memory_end as the start
> of PCI64 window. Not sure about OVMF,
>  CCing Laszlo.

(thanks for the CC)

Yes, see "OvmfPkg/PlatformPei/MemDetect.c":

  //
  // The "etc/reserved-memory-end" fw_cfg file, when present, contains an
  // absolute, exclusive end address for the memory hotplug area. This area
  // starts right at the end of the memory above 4GB. The 64-bit PCI host
  // aperture must be placed above it.
  //
  Status = QemuFwCfgFindFile ("etc/reserved-memory-end", &FwCfgItem,
             &FwCfgSize);


> 
>> Suggested-by: David Edmondson <david.edmondson@oracle.com>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>>  hw/i386/pc.c         | 17 +++++++++++++++--
>>  hw/pci-host/i440fx.c |  4 +++-
>>  hw/pci-host/q35.c    |  4 +++-
>>  include/hw/i386/pc.h |  3 ++-
>>  4 files changed, 23 insertions(+), 5 deletions(-)
>>
>> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
>> index 2e2ea82a4661..65885cc16037 100644
>> --- a/hw/i386/pc.c
>> +++ b/hw/i386/pc.c
>> @@ -1141,7 +1141,7 @@ void pc_memory_init(PCMachineState *pcms,
>>   * The 64bit pci hole starts after "above 4G RAM" and
>>   * potentially the space reserved for memory hotplug.
>>   */
>> -uint64_t pc_pci_hole64_start(void)
>> +uint64_t pc_pci_hole64_start(uint64_t size)
>>  {
>>      PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
>>      PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
>> @@ -1155,12 +1155,25 @@ uint64_t pc_pci_hole64_start(void)
>>              hole64_start += memory_region_size(&ms->device_memory->mr);
>>          }
>>      } else {
>> -        hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size;
>> +        if (!x86ms->above_1t_mem_size) {
>> +            hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size;
>> +        } else {
>> +            hole64_start = x86ms->above_1t_maxram_start;
>> +        }
>>      }
> 
>> +    hole64_start = allowed_round_up(hole64_start, size);
> 
> I'm not sure but, might it cause problems if there were BARs placed
> by firmware in region below rounded up value?
> (i.e. ACPI description which uses PCI_HOST_PROP_PCI_HOLE_START property
> won't match whatever firmware programmed due to rounding pushing hole
> start up)

This code is a hornets' nest. Marcel and MST too surely remember the troubles between it and OVMF from the past years.

The firmware assigns BARs, then distinctly later, the firmware temporarily reenables MMIO decoding for all PCI devices, and fetches the ACPI tables blob via fw_cfg. At that point, the ACPI content is re-generated by QEMU, which (due to all PCI devices having MMIO decoding enabled) involves computing "bounding boxes" over the programmed BARs (32-bit and 64-bit). Said "bounding boxes" are exposed in the _CRS. If the bounding boxes exposed in the _CRS are shrunk after the BAR assignment by the firmware, the guest OS will almost certainly reject at least some pre-existent BAR assignments by the firmware, by virtue of them falling outside of the _CRS windows.

If you want to keep the 64-bit PCI MMIO *aperture* restricted to a particular area, then you need to steer the guest firmware to allocate the BARs from that area in the first place.

In OVMF, we have an experimental fw_cfg knob for controlling the *size* of the aperture. Search the above-referenced OVMF source file for the string "opt/ovmf/X-PciMmio64Mb".

Regarding the *base* of the aperture, its calculation in OVMF is already non-trivial; please refer to the GetFirstNonAddress() function in the same file. I'm not sure what you're trying to do, but you might need another piece of guest enlightenment (fw_cfg?) for steering OVMF to place the base at a higher address (if that's what you want to do). If you can recognize the AMD IOMMU limitation in the guest without explicit paravirt stuff, then migrating part of the logic to OVMF could be easier (not that I look forward to reviewing related firmware patches, TBH).

The topic overlaps with the vCPU physical address width problem, for which QEMU guests cannot trust the otherwise appropriate CPUID leaf. Refer to the following upstream TianoCore BZ for details: <https://bugzilla.tianocore.org/show_bug.cgi?id=2796>.

Hmmmmm. How about this workaround. Might be worth a shot. The 64-bit PCI MMIO aperture in OVMF is "naturally aligned":

  //
  // The 64-bit PCI host aperture should also be "naturally" aligned. The
  // alignment is determined by rounding the size of the aperture down to the
  // next smaller or equal power of two. That is, align the aperture by the
  // largest BAR size that can fit into it.
  //
  Pci64Base = ALIGN_VALUE (Pci64Base, GetPowerOfTwo64 (Pci64Size));

This means that, if you specify a huge enough count of megabytes for the *size*, for example 1TB in total:

  -fw_cfg opt/ovmf/X-PciMmio64Mb,string=$((1024*1024))

then you'll also end up with a high-enough *base* (1 TB). From the blurb, I reckon that should work, without any patches at all for QEMU?

(OVMF currently allows a 16 TB aperture size via the above knob:

  //
  // See if the user specified the number of megabytes for the 64-bit PCI host
  // aperture. Accept an aperture size up to 16TB.
  //
)

Thanks
Laszlo


> 
>>      return ROUND_UP(hole64_start, 1 * GiB);
>>  }
>>  
>> +uint64_t pc_pci_hole64_start_aligned(uint64_t start, uint64_t size)
>> +{
>> +    if (nb_iova_ranges == DEFAULT_NR_USABLE_IOVAS) {
>> +        return start;
>> +    }
>> +    return allowed_round_up(start, size);
>> +}
>> +
>>  DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus)
>>  {
>>      DeviceState *dev = NULL;
>> diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
>> index 28c9bae89944..e8eaebfe1034 100644
>> --- a/hw/pci-host/i440fx.c
>> +++ b/hw/pci-host/i440fx.c
>> @@ -163,8 +163,10 @@ static uint64_t i440fx_pcihost_get_pci_hole64_start_value(Object *obj)
>>      pci_bus_get_w64_range(h->bus, &w64);
>>      value = range_is_empty(&w64) ? 0 : range_lob(&w64);
>>      if (!value && s->pci_hole64_fix) {
>> -        value = pc_pci_hole64_start();
>> +        value = pc_pci_hole64_start(s->pci_hole64_size);
>>      }
>> +    /* This returns @value when not on AMD */
>> +    value = pc_pci_hole64_start_aligned(value, s->pci_hole64_size);
>>      return value;
>>  }
>>  
>> diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
>> index 2eb729dff585..d556eb965ddb 100644
>> --- a/hw/pci-host/q35.c
>> +++ b/hw/pci-host/q35.c
>> @@ -126,8 +126,10 @@ static uint64_t q35_host_get_pci_hole64_start_value(Object *obj)
>>      pci_bus_get_w64_range(h->bus, &w64);
>>      value = range_is_empty(&w64) ? 0 : range_lob(&w64);
>>      if (!value && s->pci_hole64_fix) {
>> -        value = pc_pci_hole64_start();
>> +        value = pc_pci_hole64_start(s->mch.pci_hole64_size);
>>      }
>> +    /* This returns @value when not on AMD */
>> +    value = pc_pci_hole64_start_aligned(value, s->mch.pci_hole64_size);
>>      return value;
>>  }
>>  
>> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
>> index 73b8e2900c72..b924aef3a218 100644
>> --- a/include/hw/i386/pc.h
>> +++ b/include/hw/i386/pc.h
>> @@ -217,7 +217,8 @@ void pc_memory_init(PCMachineState *pcms,
>>                      MemoryRegion *system_memory,
>>                      MemoryRegion *rom_memory,
>>                      MemoryRegion **ram_memory);
>> -uint64_t pc_pci_hole64_start(void);
>> +uint64_t pc_pci_hole64_start(uint64_t size);
>> +uint64_t pc_pci_hole64_start_aligned(uint64_t value, uint64_t size);
>>  DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus);
>>  void pc_basic_device_init(struct PCMachineState *pcms,
>>                            ISABus *isa_bus, qemu_irq *gsi,
>
Joao Martins June 25, 2021, 5:19 p.m. UTC | #4
On 6/23/21 5:33 PM, Laszlo Ersek wrote:
> Adding Marcel and Dave.
> 
> Adding Alex (seriously, a vfio- / iommu-related patch set without Alex on CC?)
> 
> comments below
> 
> On 06/23/21 14:30, Igor Mammedov wrote:
>> On Tue, 22 Jun 2021 16:49:03 +0100
>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>
>>> pci_memory initialized by q35 and i440fx is set to a range
>>> of 0 .. UINT64_MAX, and as a consequence when ACPI and pci-host
>>> pick the hole64_start it does not account for allowed IOVA ranges.
>>>
>>> Rather than blindly returning, round up the hole64_start
>>> value to the allowable IOVA range, such that it accounts for
>>> the 1Tb hole *on AMD*. On Intel it returns the input value
>>> for hole64 start.
>>
>> wouldn't that work only in case where guest firmware hadn't
>> mapped any PCI bars above 4Gb (possibly in not allowed region).
>>
>> Looking at Seabios, it uses reserved_memory_end as the start
>> of PCI64 window. Not sure about OVMF,
>>  CCing Laszlo.
> 
> (thanks for the CC)
> 
> Yes, see "OvmfPkg/PlatformPei/MemDetect.c":
> 
>   //
>   // The "etc/reserved-memory-end" fw_cfg file, when present, contains an
>   // absolute, exclusive end address for the memory hotplug area. This area
>   // starts right at the end of the memory above 4GB. The 64-bit PCI host
>   // aperture must be placed above it.
>   //
>   Status = QemuFwCfgFindFile ("etc/reserved-memory-end", &FwCfgItem,
>              &FwCfgSize);
> 
> 
'etc/reserved-memory-end' is *I think* advertised correctly after this series (taking into
consideration the HT range).

Except when memory is sufficiently small that it doesn't need to deal with the HT hole.
Which could well lead firmware to not know the said hole that it should avoid (with big
enough aperture). Maybe I should advertise this nonetheless considering this sits below
the default phys-bits/maxphysaddr of 1Tb.

>>
>>> Suggested-by: David Edmondson <david.edmondson@oracle.com>
>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>> ---
>>>  hw/i386/pc.c         | 17 +++++++++++++++--
>>>  hw/pci-host/i440fx.c |  4 +++-
>>>  hw/pci-host/q35.c    |  4 +++-
>>>  include/hw/i386/pc.h |  3 ++-
>>>  4 files changed, 23 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
>>> index 2e2ea82a4661..65885cc16037 100644
>>> --- a/hw/i386/pc.c
>>> +++ b/hw/i386/pc.c
>>> @@ -1141,7 +1141,7 @@ void pc_memory_init(PCMachineState *pcms,
>>>   * The 64bit pci hole starts after "above 4G RAM" and
>>>   * potentially the space reserved for memory hotplug.
>>>   */
>>> -uint64_t pc_pci_hole64_start(void)
>>> +uint64_t pc_pci_hole64_start(uint64_t size)
>>>  {
>>>      PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
>>>      PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
>>> @@ -1155,12 +1155,25 @@ uint64_t pc_pci_hole64_start(void)
>>>              hole64_start += memory_region_size(&ms->device_memory->mr);
>>>          }
>>>      } else {
>>> -        hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size;
>>> +        if (!x86ms->above_1t_mem_size) {
>>> +            hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size;
>>> +        } else {
>>> +            hole64_start = x86ms->above_1t_maxram_start;
>>> +        }
>>>      }
>>
>>> +    hole64_start = allowed_round_up(hole64_start, size);
>>
>> I'm not sure but, might it cause problems if there were BARs placed
>> by firmware in region below rounded up value?
>> (i.e. ACPI description which uses PCI_HOST_PROP_PCI_HOLE_START property
>> won't match whatever firmware programmed due to rounding pushing hole
>> start up)
> 
> This code is a hornets' nest. Marcel and MST too surely remember the troubles between it and OVMF from the past years.
> 
Thanks for the explanation below, appreciate it.

> The firmware assigns BARs, then distinctly later, the firmware temporarily reenables MMIO decoding for all PCI devices, and fetches the ACPI tables blob via fw_cfg. At that point, the ACPI content is re-generated by QEMU, which (due to all PCI devices having MMIO decoding enabled) involves computing "bounding boxes" over the programmed BARs (32-bit and 64-bit). Said "bounding boxes" are exposed in the _CRS. If the bounding boxes exposed in the _CRS are shrunk after the BAR assignment by the firmware, the guest OS will almost certainly reject at least some pre-existent BAR assignments by the firmware, by virtue of them falling outside of the _CRS windows.
> 
I think I am doing that here? That is limiting the said bounding boxes.

At least with this OVMF doesn't seem to be doing the wrong thing. when looking at the
configured bars.

> If you want to keep the 64-bit PCI MMIO *aperture* restricted to a particular area, then you need to steer the guest firmware to allocate the BARs from that area in the first place.
> 
Hmmm, why isn't E820_RESERVED entries enough? That is exposed here.

You already look at E820 RAM entries, but so far we assume that the reserved always comes
at the end, or via that fw_cfg entry.

> In OVMF, we have an experimental fw_cfg knob for controlling the *size* of the aperture. Search the above-referenced OVMF source file for the string "opt/ovmf/X-PciMmio64Mb".
> 
> Regarding the *base* of the aperture, its calculation in OVMF is already non-trivial; please refer to the GetFirstNonAddress() function in the same file. I'm not sure what you're trying to do, but you might need another piece of guest enlightenment (fw_cfg?) for steering OVMF to place the base at a higher address (if that's what you want to do). If you can recognize the AMD IOMMU limitation in the guest without explicit paravirt stuff, then migrating part of the logic to OVMF could be easier (not that I look forward to reviewing related firmware patches, TBH).
> 
> The topic overlaps with the vCPU physical address width problem, for which QEMU guests cannot trust the otherwise appropriate CPUID leaf. Refer to the following upstream TianoCore BZ for details: <https://bugzilla.tianocore.org/show_bug.cgi?id=2796>.
> 
> Hmmmmm. How about this workaround. Might be worth a shot. The 64-bit PCI MMIO aperture in OVMF is "naturally aligned":
> 
>   //
>   // The 64-bit PCI host aperture should also be "naturally" aligned. The
>   // alignment is determined by rounding the size of the aperture down to the
>   // next smaller or equal power of two. That is, align the aperture by the
>   // largest BAR size that can fit into it.
>   //
>   Pci64Base = ALIGN_VALUE (Pci64Base, GetPowerOfTwo64 (Pci64Size));
> 
> This means that, if you specify a huge enough count of megabytes for the *size*, for example 1TB in total:
> 
>   -fw_cfg opt/ovmf/X-PciMmio64Mb,string=$((1024*1024))
> 
> then you'll also end up with a high-enough *base* (1 TB). From the blurb, I reckon that should work, without any patches at all for QEMU?
> 
Yeah, albeit like you said it feels like a workaround. Firmware should either parse
'properly' what's reserved, or machine model should provide a representative memory map
that lets guest/firmware know that it should avoid that said range (either
implicit/explictly).

> (OVMF currently allows a 16 TB aperture size via the above knob:
> 
>   //
>   // See if the user specified the number of megabytes for the 64-bit PCI host
>   // aperture. Accept an aperture size up to 16TB.
>   //
> )
> 
> Thanks
> Laszlo
> 
> 
Thanks!
>>
>>>      return ROUND_UP(hole64_start, 1 * GiB);
>>>  }
>>>  
>>> +uint64_t pc_pci_hole64_start_aligned(uint64_t start, uint64_t size)
>>> +{
>>> +    if (nb_iova_ranges == DEFAULT_NR_USABLE_IOVAS) {
>>> +        return start;
>>> +    }
>>> +    return allowed_round_up(start, size);
>>> +}
>>> +
>>>  DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus)
>>>  {
>>>      DeviceState *dev = NULL;
>>> diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
>>> index 28c9bae89944..e8eaebfe1034 100644
>>> --- a/hw/pci-host/i440fx.c
>>> +++ b/hw/pci-host/i440fx.c
>>> @@ -163,8 +163,10 @@ static uint64_t i440fx_pcihost_get_pci_hole64_start_value(Object *obj)
>>>      pci_bus_get_w64_range(h->bus, &w64);
>>>      value = range_is_empty(&w64) ? 0 : range_lob(&w64);
>>>      if (!value && s->pci_hole64_fix) {
>>> -        value = pc_pci_hole64_start();
>>> +        value = pc_pci_hole64_start(s->pci_hole64_size);
>>>      }
>>> +    /* This returns @value when not on AMD */
>>> +    value = pc_pci_hole64_start_aligned(value, s->pci_hole64_size);
>>>      return value;
>>>  }
>>>  
>>> diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
>>> index 2eb729dff585..d556eb965ddb 100644
>>> --- a/hw/pci-host/q35.c
>>> +++ b/hw/pci-host/q35.c
>>> @@ -126,8 +126,10 @@ static uint64_t q35_host_get_pci_hole64_start_value(Object *obj)
>>>      pci_bus_get_w64_range(h->bus, &w64);
>>>      value = range_is_empty(&w64) ? 0 : range_lob(&w64);
>>>      if (!value && s->pci_hole64_fix) {
>>> -        value = pc_pci_hole64_start();
>>> +        value = pc_pci_hole64_start(s->mch.pci_hole64_size);
>>>      }
>>> +    /* This returns @value when not on AMD */
>>> +    value = pc_pci_hole64_start_aligned(value, s->mch.pci_hole64_size);
>>>      return value;
>>>  }
>>>  
>>> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
>>> index 73b8e2900c72..b924aef3a218 100644
>>> --- a/include/hw/i386/pc.h
>>> +++ b/include/hw/i386/pc.h
>>> @@ -217,7 +217,8 @@ void pc_memory_init(PCMachineState *pcms,
>>>                      MemoryRegion *system_memory,
>>>                      MemoryRegion *rom_memory,
>>>                      MemoryRegion **ram_memory);
>>> -uint64_t pc_pci_hole64_start(void);
>>> +uint64_t pc_pci_hole64_start(uint64_t size);
>>> +uint64_t pc_pci_hole64_start_aligned(uint64_t value, uint64_t size);
>>>  DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus);
>>>  void pc_basic_device_init(struct PCMachineState *pcms,
>>>                            ISABus *isa_bus, qemu_irq *gsi,
>>
>
Igor Mammedov June 28, 2021, 3:37 p.m. UTC | #5
On Wed, 23 Jun 2021 14:22:29 +0100
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 6/23/21 1:30 PM, Igor Mammedov wrote:
> > On Tue, 22 Jun 2021 16:49:03 +0100
> > Joao Martins <joao.m.martins@oracle.com> wrote:
> >   
> >> pci_memory initialized by q35 and i440fx is set to a range
> >> of 0 .. UINT64_MAX, and as a consequence when ACPI and pci-host
> >> pick the hole64_start it does not account for allowed IOVA ranges.
> >>
> >> Rather than blindly returning, round up the hole64_start
> >> value to the allowable IOVA range, such that it accounts for
> >> the 1Tb hole *on AMD*. On Intel it returns the input value
> >> for hole64 start.  
> > 
> > wouldn't that work only in case where guest firmware hadn't
> > mapped any PCI bars above 4Gb (possibly in not allowed region).
> > 
> > Looking at Seabios, it uses reserved_memory_end as the start
> > of PCI64 window. Not sure about OVMF,
> >  CCing Laszlo.
> >   
> Hmmm, perhaps only in the case that I don't advertise the reserved
> region (i.e. mem size not being big enough to let the guest know in
> the e820). But then that it begs the question, should we then always
> advertise the said HT region as reserved regardless of memory size?

Where in firmware that e820 reserved entry will be accounted for?
All I see in Seabios is qemu_cfg_e820() which accounts for E820_RAM (sometimes)
and there the high RAM is hardwired to 4G. Then later on it might get used
to setup the start of 64-bit PCI hole if etc/reserved-memory-end
doesn't exist.

And for E820_RAM to work, the entry with highest address has to be
the last in the table, which is fragile and it's only a question of
time when someone breaks that.
(well if it's explicitly specified in ACPI spec, I won't object
if properly documented)
So firmware side will also need changes to account for this,
and make it independent if 4G starting point.

For Seabios it might be better to make use of fwcfg file Laszlo has
suggested in his reply than relying on e820 & reserved-memory-end,
either way Seabios would need related patches for this to work
properly. For OVMF you can get away with QEMU only patches,
if using that knob.

(PS:
I'm not PCI expert so there might be more issues hidden there
)

> >> Suggested-by: David Edmondson <david.edmondson@oracle.com>
> >> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> >> ---
> >>  hw/i386/pc.c         | 17 +++++++++++++++--
> >>  hw/pci-host/i440fx.c |  4 +++-
> >>  hw/pci-host/q35.c    |  4 +++-
> >>  include/hw/i386/pc.h |  3 ++-
> >>  4 files changed, 23 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> >> index 2e2ea82a4661..65885cc16037 100644
> >> --- a/hw/i386/pc.c
> >> +++ b/hw/i386/pc.c
> >> @@ -1141,7 +1141,7 @@ void pc_memory_init(PCMachineState *pcms,
> >>   * The 64bit pci hole starts after "above 4G RAM" and
> >>   * potentially the space reserved for memory hotplug.
> >>   */
> >> -uint64_t pc_pci_hole64_start(void)
> >> +uint64_t pc_pci_hole64_start(uint64_t size)
> >>  {
> >>      PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
> >>      PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
> >> @@ -1155,12 +1155,25 @@ uint64_t pc_pci_hole64_start(void)
> >>              hole64_start += memory_region_size(&ms->device_memory->mr);
> >>          }
> >>      } else {
> >> -        hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size;
> >> +        if (!x86ms->above_1t_mem_size) {
> >> +            hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size;
> >> +        } else {
> >> +            hole64_start = x86ms->above_1t_maxram_start;
> >> +        }
> >>      }  
> >   
> >> +    hole64_start = allowed_round_up(hole64_start, size);  
> > 
> > I'm not sure but, might it cause problems if there were BARs placed
> > by firmware in region below rounded up value?
> > (i.e. ACPI description which uses PCI_HOST_PROP_PCI_HOLE_START property
> > won't match whatever firmware programmed due to rounding pushing hole
> > start up)
> >   
> But wouldn't then the problem be firmware ignoring E820 to avoid putting
> firmware region where it shouldn't? Unless of course, it wasn't advertised
> like I mentioned earlier.
> 
> >>      return ROUND_UP(hole64_start, 1 * GiB);
> >>  }
> >>  
> >> +uint64_t pc_pci_hole64_start_aligned(uint64_t start, uint64_t size)
> >> +{
> >> +    if (nb_iova_ranges == DEFAULT_NR_USABLE_IOVAS) {
> >> +        return start;
> >> +    }
> >> +    return allowed_round_up(start, size);
> >> +}
> >> +
> >>  DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus)
> >>  {
> >>      DeviceState *dev = NULL;
> >> diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
> >> index 28c9bae89944..e8eaebfe1034 100644
> >> --- a/hw/pci-host/i440fx.c
> >> +++ b/hw/pci-host/i440fx.c
> >> @@ -163,8 +163,10 @@ static uint64_t i440fx_pcihost_get_pci_hole64_start_value(Object *obj)
> >>      pci_bus_get_w64_range(h->bus, &w64);
> >>      value = range_is_empty(&w64) ? 0 : range_lob(&w64);
> >>      if (!value && s->pci_hole64_fix) {
> >> -        value = pc_pci_hole64_start();
> >> +        value = pc_pci_hole64_start(s->pci_hole64_size);
> >>      }
> >> +    /* This returns @value when not on AMD */
> >> +    value = pc_pci_hole64_start_aligned(value, s->pci_hole64_size);
> >>      return value;
> >>  }
> >>  
> >> diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
> >> index 2eb729dff585..d556eb965ddb 100644
> >> --- a/hw/pci-host/q35.c
> >> +++ b/hw/pci-host/q35.c
> >> @@ -126,8 +126,10 @@ static uint64_t q35_host_get_pci_hole64_start_value(Object *obj)
> >>      pci_bus_get_w64_range(h->bus, &w64);
> >>      value = range_is_empty(&w64) ? 0 : range_lob(&w64);
> >>      if (!value && s->pci_hole64_fix) {
> >> -        value = pc_pci_hole64_start();
> >> +        value = pc_pci_hole64_start(s->mch.pci_hole64_size);
> >>      }
> >> +    /* This returns @value when not on AMD */
> >> +    value = pc_pci_hole64_start_aligned(value, s->mch.pci_hole64_size);
> >>      return value;
> >>  }
> >>  
> >> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> >> index 73b8e2900c72..b924aef3a218 100644
> >> --- a/include/hw/i386/pc.h
> >> +++ b/include/hw/i386/pc.h
> >> @@ -217,7 +217,8 @@ void pc_memory_init(PCMachineState *pcms,
> >>                      MemoryRegion *system_memory,
> >>                      MemoryRegion *rom_memory,
> >>                      MemoryRegion **ram_memory);
> >> -uint64_t pc_pci_hole64_start(void);
> >> +uint64_t pc_pci_hole64_start(uint64_t size);
> >> +uint64_t pc_pci_hole64_start_aligned(uint64_t value, uint64_t size);
> >>  DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus);
> >>  void pc_basic_device_init(struct PCMachineState *pcms,
> >>                            ISABus *isa_bus, qemu_irq *gsi,  
> >   
>
diff mbox series

Patch

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 2e2ea82a4661..65885cc16037 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1141,7 +1141,7 @@  void pc_memory_init(PCMachineState *pcms,
  * The 64bit pci hole starts after "above 4G RAM" and
  * potentially the space reserved for memory hotplug.
  */
-uint64_t pc_pci_hole64_start(void)
+uint64_t pc_pci_hole64_start(uint64_t size)
 {
     PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
     PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
@@ -1155,12 +1155,25 @@  uint64_t pc_pci_hole64_start(void)
             hole64_start += memory_region_size(&ms->device_memory->mr);
         }
     } else {
-        hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size;
+        if (!x86ms->above_1t_mem_size) {
+            hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size;
+        } else {
+            hole64_start = x86ms->above_1t_maxram_start;
+        }
     }
+    hole64_start = allowed_round_up(hole64_start, size);
 
     return ROUND_UP(hole64_start, 1 * GiB);
 }
 
+uint64_t pc_pci_hole64_start_aligned(uint64_t start, uint64_t size)
+{
+    if (nb_iova_ranges == DEFAULT_NR_USABLE_IOVAS) {
+        return start;
+    }
+    return allowed_round_up(start, size);
+}
+
 DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus)
 {
     DeviceState *dev = NULL;
diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
index 28c9bae89944..e8eaebfe1034 100644
--- a/hw/pci-host/i440fx.c
+++ b/hw/pci-host/i440fx.c
@@ -163,8 +163,10 @@  static uint64_t i440fx_pcihost_get_pci_hole64_start_value(Object *obj)
     pci_bus_get_w64_range(h->bus, &w64);
     value = range_is_empty(&w64) ? 0 : range_lob(&w64);
     if (!value && s->pci_hole64_fix) {
-        value = pc_pci_hole64_start();
+        value = pc_pci_hole64_start(s->pci_hole64_size);
     }
+    /* This returns @value when not on AMD */
+    value = pc_pci_hole64_start_aligned(value, s->pci_hole64_size);
     return value;
 }
 
diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index 2eb729dff585..d556eb965ddb 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -126,8 +126,10 @@  static uint64_t q35_host_get_pci_hole64_start_value(Object *obj)
     pci_bus_get_w64_range(h->bus, &w64);
     value = range_is_empty(&w64) ? 0 : range_lob(&w64);
     if (!value && s->pci_hole64_fix) {
-        value = pc_pci_hole64_start();
+        value = pc_pci_hole64_start(s->mch.pci_hole64_size);
     }
+    /* This returns @value when not on AMD */
+    value = pc_pci_hole64_start_aligned(value, s->mch.pci_hole64_size);
     return value;
 }
 
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 73b8e2900c72..b924aef3a218 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -217,7 +217,8 @@  void pc_memory_init(PCMachineState *pcms,
                     MemoryRegion *system_memory,
                     MemoryRegion *rom_memory,
                     MemoryRegion **ram_memory);
-uint64_t pc_pci_hole64_start(void);
+uint64_t pc_pci_hole64_start(uint64_t size);
+uint64_t pc_pci_hole64_start_aligned(uint64_t value, uint64_t size);
 DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus);
 void pc_basic_device_init(struct PCMachineState *pcms,
                           ISABus *isa_bus, qemu_irq *gsi,