
[v3] hw/arm/virt: Add high MMIO PCI region

Message ID 015a01d0c85c$b52b61d0$1f822570$@samsung.com
State New

Commit Message

Pavel Fedin July 27, 2015, 11:09 a.m. UTC
This large region is necessary for some devices like ivshmem and video cards

Signed-off-by: Pavel Fedin <p.fedin@samsung.com>
---
Changes since v2:
- Region size increased to 512G
- Added ACPI description
Changes since v1:
- Region address changed to 512G, leaving more space for RAM
---
 hw/arm/virt-acpi-build.c |  8 ++++++++
 hw/arm/virt.c            | 13 ++++++++++++-
 include/hw/arm/virt.h    |  1 +
 3 files changed, 21 insertions(+), 1 deletion(-)

Comments

Igor Mammedov July 27, 2015, 1:26 p.m. UTC | #1
On Mon, 27 Jul 2015 14:09:28 +0300
Pavel Fedin <p.fedin@samsung.com> wrote:

> This large region is necessary for some devices like ivshmem and video cards
> 
> Signed-off-by: Pavel Fedin <p.fedin@samsung.com>
> ---
> Changes since v2:
> - Region size increased to 512G
> - Added ACPI description
> Changes since v1:
> - Region address changed to 512G, leaving more space for RAM
> ---
>  hw/arm/virt-acpi-build.c |  8 ++++++++
>  hw/arm/virt.c            | 13 ++++++++++++-
>  include/hw/arm/virt.h    |  1 +
>  3 files changed, 21 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
> index f365140..020aad6 100644
> --- a/hw/arm/virt-acpi-build.c
> +++ b/hw/arm/virt-acpi-build.c
> @@ -169,6 +169,8 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, int irq)
>      hwaddr size_pio = memmap[VIRT_PCIE_PIO].size;
>      hwaddr base_ecam = memmap[VIRT_PCIE_ECAM].base;
>      hwaddr size_ecam = memmap[VIRT_PCIE_ECAM].size;
> +    hwaddr base_mmio_high = memmap[VIRT_PCIE_MMIO_HIGH].base;
> +    hwaddr size_mmio_high = memmap[VIRT_PCIE_MMIO_HIGH].size;
>      int nr_pcie_buses = size_ecam / PCIE_MMCFG_SIZE_MIN;
>  
>      Aml *dev = aml_device("%s", "PCI0");
> @@ -234,6 +236,12 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, int irq)
>                       AML_ENTIRE_RANGE, 0x0000, 0x0000, size_pio - 1, base_pio,
>                       size_pio));
>  
> +    aml_append(rbuf,
> +        aml_dword_memory(AML_POS_DECODE, AML_MIN_FIXED, AML_MAX_FIXED,
> +                         AML_NON_CACHEABLE, AML_READ_WRITE, 0x0000,
> +                         base_mmio_high, base_mmio_high + size_mmio_high - 1,
> +                         0x0000, size_mmio_high));
> +
>      aml_append(method, aml_name_decl("RBUF", rbuf));
>      aml_append(method, aml_return(rbuf));
>      aml_append(dev, method);
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index e53ef4c..c20b3b8 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -124,6 +124,7 @@ static const MemMapEntry a15memmap[] = {
>      [VIRT_PCIE_PIO] =           { 0x3eff0000, 0x00010000 },
>      [VIRT_PCIE_ECAM] =          { 0x3f000000, 0x01000000 },
>      [VIRT_MEM] =                { 0x40000000, 30ULL * 1024 * 1024 * 1024 },
> +    [VIRT_PCIE_MMIO_HIGH] =   { 0x8000000000, 0x8000000000 },
I'm not sure, but a fixed hole start/size might become a problem later when memory hotplug is added, since it wastes address space.

On x86 we do it a little differently; see the call chain:
 acpi_setup -> build_ssdt ->
   i440fx_pcihost_get_pci_hole64_start -> pci_bus_get_w64_range
   ...                          _end -> ...

where acpi_setup() is called from pc_guest_info_machine_done() right before
the guest starts, and the tables are updated again later, after the guest's BIOS (UEFI) has initialized the PCI devices.

Perhaps we should do the same for ARM as well, CCing Michael
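For reference, the x86 path described above boils down to querying the currently assigned 64-bit PCI range via pci_bus_get_w64_range() when the ACPI tables are (re)built. Below is a minimal sketch of what a comparable virt-side helper could look like; it assumes the 2015-era Range layout from qemu/range.h (public begin/end fields, end being one past the last byte), and the helper name itself is hypothetical, not code from this patch or from QEMU.

--- cut ---
#include "qemu/range.h"
#include "hw/pci/pci.h"

/* Return the 64-bit MMIO window currently assigned on the bus.
 * pci_bus_get_w64_range() walks the devices and accumulates the
 * lowest/highest addresses programmed into 64-bit BARs and bridge
 * windows, so it only yields useful values after firmware (or the
 * guest OS) has assigned PCI resources. */
static void virt_get_assigned_high_mmio(PCIBus *bus,
                                        uint64_t *base, uint64_t *size)
{
    Range w64;

    pci_bus_get_w64_range(bus, &w64);

    *base = w64.begin;
    *size = w64.end - w64.begin;   /* end is exclusive; both 0 if nothing assigned */
}
--- cut ---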

>  };
>  
>  static const int a15irqmap[] = {
> @@ -758,6 +759,8 @@ static void create_pcie(const VirtBoardInfo *vbi, qemu_irq *pic)
>      hwaddr size_pio = vbi->memmap[VIRT_PCIE_PIO].size;
>      hwaddr base_ecam = vbi->memmap[VIRT_PCIE_ECAM].base;
>      hwaddr size_ecam = vbi->memmap[VIRT_PCIE_ECAM].size;
> +    hwaddr base_mmio_high = vbi->memmap[VIRT_PCIE_MMIO_HIGH].base;
> +    hwaddr size_mmio_high = vbi->memmap[VIRT_PCIE_MMIO_HIGH].size;
>      hwaddr base = base_mmio;
>      int nr_pcie_buses = size_ecam / PCIE_MMCFG_SIZE_MIN;
>      int irq = vbi->irqmap[VIRT_PCIE];
> @@ -793,6 +796,12 @@ static void create_pcie(const VirtBoardInfo *vbi, qemu_irq *pic)
>      /* Map IO port space */
>      sysbus_mmio_map(SYS_BUS_DEVICE(dev), 2, base_pio);
>  
> +    /* High MMIO space */
> +    mmio_alias = g_new0(MemoryRegion, 1);
> +    memory_region_init_alias(mmio_alias, OBJECT(dev), "pcie-mmio-high",
> +                             mmio_reg, base_mmio_high, size_mmio_high);
> +    memory_region_add_subregion(get_system_memory(), base_mmio_high, mmio_alias);
Is there any specific reason to have two separate regions vs. using one, as in
pc_pci_as_mapping_init(),
using region priority instead of splitting?
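To illustrate the alternative being suggested, here is a rough sketch of the pc_pci_as_mapping_init() pattern applied to the virt PCIe MMIO space. It only illustrates the background-mapping approach; the function name and the idea of passing the GPEX MMIO region directly are assumptions, not code from this patch.

--- cut ---
#include "exec/memory.h"

/* Map the whole PCI MMIO space once, at negative priority, instead of
 * cutting out separate low/high aliases.  Any region added on top at the
 * default priority 0 (RAM, flash, GIC, ...) hides the PCI window under it,
 * so everything not claimed by another device "falls through" to PCI,
 * which is what the PC machines do. */
static void virt_map_pci_as_background(MemoryRegion *system_memory,
                                       MemoryRegion *pci_mmio)
{
    memory_region_add_subregion_overlap(system_memory, 0x0, pci_mmio, -1);
}
--- cut ---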

>      for (i = 0; i < GPEX_NUM_IRQS; i++) {
>          sysbus_connect_irq(SYS_BUS_DEVICE(dev), i, pic[irq + i]);
>      }
> @@ -818,7 +827,9 @@ static void create_pcie(const VirtBoardInfo *vbi, qemu_irq *pic)
>                                   1, FDT_PCI_RANGE_IOPORT, 2, 0,
>                                   2, base_pio, 2, size_pio,
>                                   1, FDT_PCI_RANGE_MMIO, 2, base_mmio,
> -                                 2, base_mmio, 2, size_mmio);
> +                                 2, base_mmio, 2, size_mmio,
> +                                 1, FDT_PCI_RANGE_MMIO, 2, base_mmio_high,
> +                                 2, base_mmio_high, 2, size_mmio_high);
>  
>      qemu_fdt_setprop_cell(vbi->fdt, nodename, "#interrupt-cells", 1);
>      create_pcie_irq_map(vbi, vbi->gic_phandle, irq, nodename);
> diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
> index 852efb9..1d43598 100644
> --- a/include/hw/arm/virt.h
> +++ b/include/hw/arm/virt.h
> @@ -60,6 +60,7 @@ enum {
>      VIRT_PCIE_PIO,
>      VIRT_PCIE_ECAM,
>      VIRT_PLATFORM_BUS,
> +    VIRT_PCIE_MMIO_HIGH,
>  };
>  
>  typedef struct MemMapEntry {
Pavel Fedin July 27, 2015, 2:36 p.m. UTC | #2
Hello!

> > +    /* High MMIO space */
> > +    mmio_alias = g_new0(MemoryRegion, 1);
> > +    memory_region_init_alias(mmio_alias, OBJECT(dev), "pcie-mmio-high",
> > +                             mmio_reg, base_mmio_high, size_mmio_high);
> > +    memory_region_add_subregion(get_system_memory(), base_mmio_high, mmio_alias);
> Is there any specific reason to have 2 separate regions vs using 1 like in
>  pc_pci_as_mapping_init()
> using region priority instead of splitting.

 Unfortunately I'm not very familiar with qemu's memory internals. I have seen the PC code, and I know that
it adds a PCI region the size of the whole address space and then adds everything else as overlapping regions.
But wouldn't that waste resources in this case? I understand that on a PC absolutely all "unused"
addresses fall through to PCI, so that any device can plug in there. On ARM this is different: the PCI
controller is not the core of the system, it is just one device among others. And in our case a huge
part of the PCI region between VIRT_PCIE_MMIO and VIRT_PCIE_MMIO_HIGH would never be used. Is it worth
that?

Kind regards,
Pavel Fedin
Expert Engineer
Samsung Electronics Research center Russia
Michael S. Tsirkin July 27, 2015, 3:18 p.m. UTC | #3
On Mon, Jul 27, 2015 at 05:36:07PM +0300, Pavel Fedin wrote:
>  Hello!
> 
> > > +    /* High MMIO space */
> > > +    mmio_alias = g_new0(MemoryRegion, 1);
> > > +    memory_region_init_alias(mmio_alias, OBJECT(dev), "pcie-mmio-high",
> > > +                             mmio_reg, base_mmio_high, size_mmio_high);
> > > +    memory_region_add_subregion(get_system_memory(), base_mmio_high, mmio_alias);
> > Is there any specific reason to have 2 separate regions vs using 1 like in
> >  pc_pci_as_mapping_init()
> > using region priority instead of splitting.
> 
>  Unfortunately i'm not familiar very well with qemu memory internals. I saw PC code and i know that
> it adds PCI region of the size of the whole memory, then adds other things as overlapped regions.
> But wouldn't it be some resource waste in this case? I understand that in PC absolutely all "unused"
> addresses fall through to PCI, so that any device can plug in there. On ARM this is different, PCI
> controller is not a core of the system, it's just one of devices instead. And on our case a huge
> part of PCI region between VIRT_PCIE_MMIO and VIRT_PCIE_MMIO_HIGH would never be used. Does it worth
> that ?
> 
> Kind regards,
> Pavel Fedin
> Expert Engineer
> Samsung Electronics Research center Russia

It's more a question of figuring out what real hardware does.
It's true, PIIX has this "everything that isn't memory is PCI"
assumption. We do this for Q35, but I'm not even sure it's the
right thing to do there.

If, as you say, real ARM hardware doesn't work like this, then QEMU
shouldn't do this for ARM.
Peter Maydell July 27, 2015, 3:51 p.m. UTC | #4
On 27 July 2015 at 16:18, Michael S. Tsirkin <mst@redhat.com> wrote:
> It's more a question of figuring out what does real hardware do.
> It's true, PIIX has this "everything that isn't memory is PCI"
> assumption. We do this for Q35 but I'm not even sure it's the
> right thing to do there.
>
> If as you say real ARM hardware doesn't work like this, then QEMU
> shouldn't do this for ARM.

Real ARM hardware might do a variety of things varying between
"we just have some windows" and "we do things exactly like
how an x86 PC does". The architecture doesn't care.

I don't know what the restrictions of the kernel's "generic
PCIe" handling are, whether it insists on specific windows
or can handle the "everything in the background is a window"
configuration as well. The dt binding doc for it implies not.

thanks
-- PMM
Pavel Fedin July 29, 2015, 8:58 a.m. UTC | #5
Hello!

> I'm not sure  but fixed hole start/size might be a problem later when adding memory hotplug
> wasting
> address space.

 But the 'virt' machine relies entirely on a fixed layout. And we can always change it if we need to.

> 
> On x86 we do it a little different, see call chain:
>  acpi_setup -> build_ssdt ->
>    i440fx_pcihost_get_pci_hole64_start -> pci_bus_get_w64_range
>    ...                          _end -> ...
> 
> where acpi_setup() is called from pc_guest_info_machine_done() right before
> guest starts and later after guest's BIOS(UEFI) initialized PCI devices.
> 
> Perhaps we should do the same for ARM as well, CCing Michael

 I took a look at the code. As far as I can tell, it iterates over the devices on the bus and finds
the start of the lowest assigned region and the end of the highest assigned region. Is that right?
 I'm not sure the ARM code has this machine_done callback. And, to tell the truth, I don't
use EFI in my setup, so I cannot test that path.
 So, can we leave the fixed layout for now? I am currently reworking the patch because I discovered
problems with 32-bit guests. They simply truncate the high word, end up trying to put PCI at
0x00000000 - 0xFFFFFFFF, fail, and of course do not work. So my next version will have the high MMIO
region only for 64-bit guests.
 By the way, what is the correct way to determine whether the selected CPU is 32-bit or 64-bit?

Kind regards,
Pavel Fedin
Expert Engineer
Samsung Electronics Research center Russia
Peter Maydell July 29, 2015, 9:03 a.m. UTC | #6
On 29 July 2015 at 09:58, Pavel Fedin <p.fedin@samsung.com> wrote:
>  So, can we leave fixed layout for now? I am currently reworking the patch
> because i discovered problems with 32-bit guests. They simply truncate
> high word and end up in attempt to put PCI at 0x00000000 - 0xFFFFFFFF, fail,
> and do not work of course. So, my next version will have high MMIO
> only for 64-bit guests.

That sounds like a guest bug to me. If the device tree says that
addresses are two-cells wide then the code reading it ought to
read both cells, not just ignore the top 32 bits...

thanks
-- PMM
Igor Mammedov July 29, 2015, 9:10 a.m. UTC | #7
On Mon, 27 Jul 2015 14:09:28 +0300
Pavel Fedin <p.fedin@samsung.com> wrote:

> @@ -234,6 +236,12 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, int irq)
>                       AML_ENTIRE_RANGE, 0x0000, 0x0000, size_pio - 1, base_pio,
>                       size_pio));
>  
> +    aml_append(rbuf,
> +        aml_dword_memory(AML_POS_DECODE, AML_MIN_FIXED, AML_MAX_FIXED,
This is wrong, since a dword is too small for the high-memory values;
use aml_qword_memory() instead.

> +                         AML_NON_CACHEABLE, AML_READ_WRITE, 0x0000,
> +                         base_mmio_high, base_mmio_high + size_mmio_high - 1,
Since the window is at a fixed position and it is not possible for the guest to
move the base address of the range, make AddressMaximum the same as AddressMinimum, i.e.

  s/base_mmio_high + size_mmio_high - 1/base_mmio_high/

> +                         0x0000, size_mmio_high));
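A sketch of how the rbuf hunk above could look with the dword-to-qword correction applied, shown in the context of acpi_dsdt_add_pci() from the patch. This is only an illustration, not the committed fix; whether AddressMaximum should additionally be made equal to AddressMinimum, as suggested here, is debated further down the thread.

--- cut ---
    /* 64-bit (high) MMIO window: needs the qword descriptor, since
     * base_mmio_high (0x8000000000) does not fit in 32 bits. */
    aml_append(rbuf,
        aml_qword_memory(AML_POS_DECODE, AML_MIN_FIXED, AML_MAX_FIXED,
                         AML_NON_CACHEABLE, AML_READ_WRITE, 0x0000,
                         base_mmio_high, base_mmio_high + size_mmio_high - 1,
                         0x0000, size_mmio_high));
--- cut ---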
Igor Mammedov July 29, 2015, 9:32 a.m. UTC | #8
On Wed, 29 Jul 2015 11:58:11 +0300
Pavel Fedin <p.fedin@samsung.com> wrote:

>  Hello!
> 
> > I'm not sure  but fixed hole start/size might be a problem later when adding memory hotplug
> > wasting
> > address space.
> 
>  But 'virt' machine entirely relies on fixed layout. And, we can always change it if we need to.
> 
> > 
> > On x86 we do it a little different, see call chain:
> >  acpi_setup -> build_ssdt ->
> >    i440fx_pcihost_get_pci_hole64_start -> pci_bus_get_w64_range
> >    ...                          _end -> ...
> > 
> > where acpi_setup() is called from pc_guest_info_machine_done() right before
> > guest starts and later after guest's BIOS(UEFI) initialized PCI devices.
> > 
> > Perhaps we should do the same for ARM as well, CCing Michael
> 
>  I took a look at the code. As far as i could understand, it iterates devices on the bus and finds
> out start of the lowest assigned region and end of the highest assigned region. Does it?
yep

>  I'm not sure that ARM architecture has this machine_done callback.
Maybe this would help you,
   git grep machine_done

> And, to tell the truth, i don't
> use EFI on my setup so i cannot test the thing.
>  So, can we leave fixed layout for now? 
I suppose we could; it just means that we will have to add versioned machines, like
it's done on x86, to keep the memory layout of old machine types the same so that
the hardware won't change under the guest's feet unexpectedly.

Also, in light of guests with huge memory sizes, a 512 GB gap for RAM seems too small;
what are the limitations of ARM64 regarding the maximum supported physical address bits?

Could we put this 64-bit PCI hole at the end of the address space, leaving the rest of
the address space for RAM or whatever?

> I am currently reworking the patch because i discovered
> problems with 32-bit guests. They simply truncate high word and end up in attempt to put PCI at
> 0x00000000 - 0xFFFFFFFF, fail, and do not work of course. So, my next version will have high MMIO
> only for 64-bit guests.
>  By the way, what is the most correct way to determine whether selected CPU is 32 or 64 bit?
> 
> Kind regards,
> Pavel Fedin
> Expert Engineer
> Samsung Electronics Research center Russia
> 
>
Pavel Fedin July 29, 2015, 9:45 a.m. UTC | #9
> > because i discovered problems with 32-bit guests. They simply truncate
> > high word and end up in attempt to put PCI at 0x00000000 - 0xFFFFFFFF, fail,
> > and do not work of course. So, my next version will have high MMIO
> > only for 64-bit guests.
> 
> That sounds like a guest bug to me. If the device tree says that
> addresses are two-cells wide then the code reading it ought to
> read both cells, not just ignore the top 32 bits...

 Maybe. It's an old kernel, v3.19, but still, it works the way it works. We could, of course, have a machine option, but personally I currently care only about 64 bits, so for simplicity let's leave 32 bits alone for now. Once the more important things are settled, adding an option will not be a problem.
 
Kind regards,
Pavel Fedin
Expert Engineer
Samsung Electronics Research center Russia
Pavel Fedin July 29, 2015, 9:48 a.m. UTC | #10
Hello!

> this is wrong since dword is too small for values of high memory
> use aml_qword_memory() instead

 Thanks, will fix it.

> since window is at fixed position and it's not possible for guest to
> move base address of the range, make AddressMaximum the same as AddressMinimum i.e.

 But that is not possible anyway. On ARM hardware, PCI range addresses are normally hardwired (at least
on the kind of simple machine we are emulating), and the OS has no control over them. And I'm not sure
whether hotplugged memory can be overlaid on top of the window. On a PC, PCI is the core
system bus, which represents the whole address space, and RAM is effectively plugged into it. On ARM, other
buses are used for the system core, and the PCI controller, from the point of view of that bus, is a single
device occupying some space. So AFAIK having something else on top of the PCI hole on ARM would be
abnormal.

Kind regards,
Pavel Fedin
Expert Engineer
Samsung Electronics Research center Russia
Peter Maydell July 29, 2015, 9:56 a.m. UTC | #11
On 29 July 2015 at 10:45, Pavel Fedin <p.fedin@samsung.com> wrote:
>> > because i discovered problems with 32-bit guests. They simply truncate
>> > high word and end up in attempt to put PCI at 0x00000000 - 0xFFFFFFFF, fail,
>> > and do not work of course. So, my next version will have high MMIO
>> > only for 64-bit guests.
>>
>> That sounds like a guest bug to me. If the device tree says that
>> addresses are two-cells wide then the code reading it ought to
>> read both cells, not just ignore the top 32 bits...
>
>  May be. It's old kernel, v3.19, but still it works as it works.

It matters because you can run 32-bit guests on 64-bit-capable
CPUs. I don't really want to have two different address layouts
just to work around a kernel bug if we could fix the kernel
bug instead...

-- PMM
Pavel Fedin July 29, 2015, 10:03 a.m. UTC | #12
Hello!

> >  I'm not sure that ARM architecture has this machine_done callback.
> Maybe this would help you,
>    git grep machine_done

 Heh, I was not patient enough. I have already discovered this myself, and yes, "virt" uses the
same mechanism. But I may still have problems testing it.

> >  So, can we leave fixed layout for now?
> I suppose we could, it just means that we will have to add version-ed machines like
> it's done on x86 to keep memory layout on old machine type the same so that
> hardware won't change under guest's feet unexpectedly.

 Why?
 First, "virt" is completely virtual hardware. It does not exist in real world.
 Second, ARM architecture is flexible by definition. Guest is never expected to use hardcoded
addresses, it is expected to read device tree. And, if base address of RAM changes, or base address
of PCI region changes, nothing bad should happen.

> Also in light of guests with huge memory size, 512Gb  gap for RAM seems too small,
> what are limitations of ARM64 regarding max supported physical address bits?

 40 bits IIRC.

> Could we put this 64 bit PCI hole at the end of address space, leaving the rest of
> address space for RAM or whatever?

 I've done exactly that: start = 512 GB and size = 512 GB, i.e. 0x8000000000...0xFFFFFFFFFF. I did this
after Paolo's comment that 2 GB is still too small, and I agree, because a single ivshmem region could be used by
lots of VMs.

Kind regards,
Pavel Fedin
Expert Engineer
Samsung Electronics Research center Russia
Peter Maydell July 29, 2015, 10:21 a.m. UTC | #13
On 29 July 2015 at 11:03, Pavel Fedin <p.fedin@samsung.com> wrote:

>> Also in light of guests with huge memory size, 512Gb  gap for RAM seems too small,
>> what are limitations of ARM64 regarding max supported physical address bits?
>
>  40 bits IIRC.

In hardware, implemented maximum physical address size may be
anything from 32 bits to 48 bits in 4 bit steps. At least
40 and 48 bits have been implemented.

A 32-bit VM has a maximum physical address size of 40 bits;
a 64-bit VM has a maximum physical address size equal to the
host's maximum physical address size (unless KVM chooses to
constrain it further).

-- PMM
Pavel Fedin July 29, 2015, 11:16 a.m. UTC | #14
Hello!

> It matters because you can run 32 bit guests in 64-bit-capable
> CPUs.

 Yes, I know.

> I don't really want to have two different address layouts
> just to work around a kernel bug

 They will not be really different: for 32-bit machines there will simply be no second MMIO window, while for 64-bit ones there will be. Nothing else will differ.

> if we could fix the kernel bug instead...

 We could. But what about existing installations? They would be forced to upgrade their kernels, and we would have no backwards-compatibility option. Users will complain that "my old VM stopped working with the new qemu".
 Well, this discussion seems to be stuck, so let's move on. What is your final word? The options are:
1. Include the second region unconditionally. Old 32-bit guests will stop working.
2. Add a machine option to specify the size of the second region. For 64 bits it will default to something; for 32 bits it will default to 0 (= off). The region can then be turned on explicitly for 32-bit machines.
3. Include the second region only for 64-bit guests. 32-bit ones will wait until somebody needs it.

 Your choice?

 P.S. My 3.19 guest has LPAE disabled in its configuration. Perhaps that's my fault, but it is still a potential problem.
 

Kind regards,
Pavel Fedin
Expert Engineer
Samsung Electronics Research center Russia
Peter Maydell July 29, 2015, 11:45 a.m. UTC | #15
On 29 July 2015 at 12:16, Pavel Fedin <p.fedin@samsung.com> wrote:
>> I don't really want to have two different address layouts
>> just to work around a kernel bug
>
>  They will not be really different. Just for 32-bit machines
> there will be no second MMIO window, and for 64-bit ones there
> will be. Nothing else will be different.

That's still a difference. (Also, even a 32-bit guest might
have a 40-bit physical address space and be potentially able
to use an upper MMIO window.)

>> if we could fix the kernel bug instead...
>
>  We could. But what about existing installations? They will be forced to upgrade kernels, we will not have backwards compatibility option. And users will complain that "my old VM stopped working with new qemu".
>  Well, this discussion seems to be stuck, so let's move on.
> What is your final word?

(1) We should confirm whether this really is a guest kernel
bug (as opposed to the device tree QEMU emits not being
in spec)
(2) If it is a kernel bug, submit a patch to fix it
(3) Consider a workaround for older guests anyway. The
scope of that workaround would depend on exactly which
guests are affected, which is presumably something we
figured out during step (1).

thanks
-- PMM
Igor Mammedov July 29, 2015, 11:59 a.m. UTC | #16
On Wed, 29 Jul 2015 12:48:18 +0300
Pavel Fedin <p.fedin@samsung.com> wrote:

>  Hello!
> 
> > this is wrong since dword is too small for values of high memory
> > use aml_qword_memory() instead
> 
>  Thanks, will fix it.
> 
> > since window is at fixed position and it's not possible for guest to
> > move base address of the range, make AddressMaximum the same as AddressMinimum i.e.
> 
>  But it is anyway not possible. On ARM hardware PCI range addresses are normally hardwired (at least
> on simple machine which we are emulating), and the OS doesn't have control over it. And i'm not sure
> whether some hotplugged memory can be overlaid on top of it. This is because on PC PCI is a core
> system bus, which represents all address space, and RAM is kind of plugged into it. On ARM another
> buses are used for system core, and PCI controller, from the point of view of that bus, is a single
> device, which occupies some space. So AFAIK having something else on top of PCI hole on ARM would be
> abnormal.

I'm not suggesting mapping something on top of the window;
the suggestion was just to use the same value for the
AddressMaximum and AddressMinimum arguments in the aml_qword_memory() call.

> 
> Kind regards,
> Pavel Fedin
> Expert Engineer
> Samsung Electronics Research center Russia
> 
> 
>
Pavel Fedin July 29, 2015, 12:02 p.m. UTC | #17
Hello!

> I don't suggest to have something mapped on top of the window,
> Suggestion was to just use the same value for
> AddressMaximum and AddressMinimum arguments in aml_qword_memory() call.

 And this would shrink the region down to zero, right? For what reason? The physical region would still
be there (imagining here that this is real HW).

Kind regards,
Pavel Fedin
Expert Engineer
Samsung Electronics Research center Russia
Igor Mammedov July 29, 2015, 12:05 p.m. UTC | #18
On Wed, 29 Jul 2015 13:03:57 +0300
Pavel Fedin <p.fedin@samsung.com> wrote:

>  Hello!
> 
> > >  I'm not sure that ARM architecture has this machine_done callback.
> > Maybe this would help you,
> >    git grep machine_done
> 
>  Heh, i was not patient enough. I have already discovered this by myself, and yes, "virt" uses the
> same mechanism. But, still, i perhaps can have problems with testing it.
> 
> > >  So, can we leave fixed layout for now?
> > I suppose we could, it just means that we will have to add version-ed machines like
> > it's done on x86 to keep memory layout on old machine type the same so that
> > hardware won't change under guest's feet unexpectedly.
> 
>  Why?
>  First, "virt" is completely virtual hardware. It does not exist in real world.
>  Second, ARM architecture is flexible by definition. Guest is never expected to use hardcoded
> addresses, it is expected to read device tree. And, if base address of RAM changes, or base address
> of PCI region changes, nothing bad should happen.
Now imagine that a guest was started on a host with the 'old' layout and then
migrated to a host with the 'new' layout. Layout stability matters; that's why
we support versioned machine types and the whole compat nightmare even when the basic
machine is the same. The same applies to ARM's virt machine.

> 
> > Also in light of guests with huge memory size, 512Gb  gap for RAM seems too small,
> > what are limitations of ARM64 regarding max supported physical address bits?
> 
>  40 bits IIRC.
> 
> > Could we put this 64 bit PCI hole at the end of address space, leaving the rest of
> > address space for RAM or whatever?
> 
>  I've done exactly this. Start = 512GB and size = 512GB. 0x8000000000...0xFFFFFFFFFF. I've done this
> after Paolo's comment that 2G is still small. And i agree because single ivshmem could be used by
> lots of VMs.
> 
> Kind regards,
> Pavel Fedin
> Expert Engineer
> Samsung Electronics Research center Russia
> 
>
Pavel Fedin July 29, 2015, 12:13 p.m. UTC | #19
Hello!

> Now imagine that guest was started on host with 'old' layout and then has been
> migrated to a host with 'new' layout.

 Aaarrgghh, yes, I totally forgot about migration. Stupid me...

Kind regards,
Pavel Fedin
Expert Engineer
Samsung Electronics Research center Russia
Peter Maydell July 29, 2015, 12:35 p.m. UTC | #20
On 29 July 2015 at 13:05, Igor Mammedov <imammedo@redhat.com> wrote:
> Now imagine that guest was started on host with 'old' layout and then has been
> migrated to a host with 'new' layout. Stability of layout matters, that's why
> we support versioned machine types and compat nightmare even if basic machine
> is the same, the same applies to ARM's virt machine.

Note that we don't support versioned machines or cross-version
migration on ARM *yet*. We will at some point but I am putting
this off for as long as I can exactly because it makes it
very painful to make changes to QEMU.

(Roughly, I plan to avoid cross-version migration until the
point where somebody comes along and says (a) they absolutely
need this and (b) they are prepared to support it in terms of
the testing effort etc. I don't think we have really ironed
out all the bugs in same-version migration on ARM yet...)

thanks
-- PMM
Igor Mammedov July 29, 2015, 1:24 p.m. UTC | #21
On Wed, 29 Jul 2015 15:02:00 +0300
Pavel Fedin <p.fedin@samsung.com> wrote:

>  Hello!
> 
> > I don't suggest to have something mapped on top of the window,
> > Suggestion was to just use the same value for
> > AddressMaximum and AddressMinimum arguments in aml_qword_memory() call.
> 
>  And this will shrink the region down to zero, right? For what reason? Physical region would still
> be there (here we imagine this is a real HW).
Nope, it won't shrink anything. It will prevent the OS from potentially
relocating the region.

AddressMaximum is the maximum possible BASE address of the range.
I read 'base address' as the starting address of the range;
it has nothing to do with the size.

See the spec, ACPI 6.0, section 19.6.104 QWordMemory.

> 
> Kind regards,
> Pavel Fedin
> Expert Engineer
> Samsung Electronics Research center Russia
> 
>
Pavel Fedin July 29, 2015, 2:01 p.m. UTC | #22
Hello! I have studied the problem. It is a kernel bug, and it is still not fixed, at least as of 4.1.

> (1) We should confirm whether this really is a guest kernel
> bug (as opposed to the device tree QEMU emits not being
> in spec)

 The problem is in of_pci_range_to_resource(): http://lxr.free-electrons.com/source/drivers/of/address.c#L313
 Note line 333: res->start = range->cpu_addr; that is where the problem is. It occurs when CONFIG_ARM_LPAE is disabled. Inside struct resource, 'start' and 'end' are of type resource_size_t, which is an alias of phys_addr_t:
--- cut ---
#ifdef CONFIG_PHYS_ADDR_T_64BIT
typedef u64 phys_addr_t;
#else
typedef u32 phys_addr_t;
#endif

typedef phys_addr_t resource_size_t;
--- cut ---
 The config option chain is as follows: CONFIG_ARM_LPAE => CONFIG_ARCH_PHYS_ADDR_T_64BIT => CONFIG_PHYS_ADDR_T_64BIT.
 This function should check that range->cpu_addr fits into 32 bits when LPAE is disabled.
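 A tiny standalone illustration of the truncation being described (plain user-space C mimicking the non-LPAE type widths, not kernel code):

--- cut ---
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

/* Without CONFIG_ARM_LPAE, phys_addr_t (and hence resource_size_t) is
 * 32 bits wide, so storing the 64-bit CPU address from the DT "ranges"
 * entry silently drops the high word. */
typedef uint32_t resource_size_t;   /* phys_addr_t without LPAE */

int main(void)
{
    uint64_t cpu_addr = 0x8000000000ULL;   /* VIRT_PCIE_MMIO_HIGH base */
    uint64_t size     = 0x8000000000ULL;   /* 512 GB window */

    resource_size_t start = cpu_addr;            /* truncates to 0 */
    resource_size_t end   = cpu_addr + size - 1; /* truncates to 0xffffffff */

    /* Prints "resource: 0x0 - 0xffffffff", i.e. exactly the bogus
     * 0x00000000 - 0xFFFFFFFF window mentioned earlier in the thread. */
    printf("resource: 0x%" PRIx32 " - 0x%" PRIx32 "\n", start, end);
    return 0;
}
--- cut ---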

> (2) If it is a kernel bug, submit a patch to fix it

 Will do it.

> (3) Consider a workaround for older guests anyway. The
> scope of that workaround would depend on exactly which
> guests are affected, which is presumably something we
> figured out during step (1).

 The problem occurs when LPAE is disabled in the kernel. What is your verdict, then? Do we need an option, or do we just ignore those poor guys with such old configs?

Kind regards,
Pavel Fedin
Expert Engineer
Samsung Electronics Research center Russia
Pavel Fedin Aug. 3, 2015, 7:03 a.m. UTC | #23
Hi! I have done some additional investigation...

> (1) We should confirm whether this really is a guest kernel
> bug (as opposed to the device tree QEMU emits not being
> in spec)
> (2) If it is a kernel bug, submit a patch to fix it

 It is a bug, but not a major one. The bug is only that the kernel tries to map the device anyway, even though it ends up with wrong, truncated physical addresses. But if we fix that, the only remaining option for the kernel is not to use the device at all, because in the general case the device is outside the usable address space: we cannot access it and therefore cannot drive it properly.
 Therefore, a non-LPAE kernel will not be able to use a PCI controller with high MMIO anyway.

> (3) Consider a workaround for older guests anyway. The
> scope of that workaround would depend on exactly which
> guests are affected, which is presumably something we
> figured out during step (1).

 The behavior I explained above causes boot problems if our configuration assumes that we boot off an emulated PCI device, because the PCI controller becomes unusable. Therefore, if we want to stay compatible with such guests (which would indeed be old), we need an option that allows the high MMIO region to be turned off.
 Now the question is: what do we want as the default for 32-bit virt? For 64 bits the default would obviously be "ON"; what about 32 bits?

Kind regards,
Pavel Fedin
Expert Engineer
Samsung Electronics Research center Russia
Peter Maydell Aug. 3, 2015, 7:56 a.m. UTC | #24
On 3 August 2015 at 08:03, Pavel Fedin <p.fedin@samsung.com> wrote:
>  Hi! I have done an additional study...
>
>> (1) We should confirm whether this really is a guest kernel
>> bug (as opposed to the device tree QEMU emits not being
>> in spec)
>> (2) If it is a kernel bug, submit a patch to fix it
>
>  It is a bug, but not major bug. The bug is only that the kernel
> tries to map the device anyway, despite it takes wrong, truncated
 > physical addresses. But, if we fix it, then the only option for
> the kernel is not to try to use this device at all. Because, in
> general case, this would mean that the device is outside of usable
> address space, and we cannot access it, therefore cannot drive it
> properly.
>  Therefore, the non-LPAE-enabled kernel will not be able to use PCI
> controller with high MMIO anyway.

Right, you can't use a device that's not in the accessible range
for you. But that's always going to be the case -- a non-LPAE
kernel just can't get at anything outside the 32-bit physical
address window. If you want a high-MMIO window to actually
work you're going to need LPAE enabled.

What I thought you meant was that a non-LPAE kernel didn't
work at all if we told it about the high-MMIO window (which
would mean we'd need to *not* put that in the dtb if we
wanted to avoid breaking non-LPAE guests that didn't care
about the other window.)

>  The behavior which i explained above causes boot problems if our
> configuration assumes that we boot off emulated PCI device. Because
> PCI controller becomes unusable.

...which is what you're saying here.

Which is it?

-- PMM
Pavel Fedin Aug. 3, 2015, 8:09 a.m. UTC | #25
Hi!

> What I thought you meant was that a non-LPAE kernel didn't
> work at all if we told it about the high-MMIO window (which
> would mean we'd need to *not* put that in the dtb if we
> wanted to avoid breaking non-LPAE guests that didn't care
> about the other window.)

 The current generic PCI driver is not that smart. It simply tries to map all resources using devm_request_resource() in a loop; if a single call fails, the driver decides that it cannot work and bails out. It does not try to ignore inaccessible regions.

> >  The behavior which i explained above causes boot problems if our
> > configuration assumes that we boot off emulated PCI device. Because
> > PCI controller becomes unusable.
> 
> ...which is what you're saying here.
> 
> Which is it?

 I don't understand this last question...
 OK, from scratch: imagine we already have a qemu installation running a non-LPAE 32-bit guest. This guest is configured with a PCI bus and a SCSI drive on that bus, and it boots off that drive. Now, if we implement a high MMIO window in qemu that is unconditionally enabled, this guest will break after a qemu upgrade, because its pre-existing kernel cannot work around the inaccessible PCI region.
 To keep this guest working, we need a way to disable the new MMIO region in qemu, or at least to omit it from the device tree.
 Is this clear enough now?

Kind regards,
Pavel Fedin
Expert Engineer
Samsung Electronics Research center Russia
Peter Maydell Aug. 3, 2015, 9:48 a.m. UTC | #26
On 3 August 2015 at 09:09, Pavel Fedin <p.fedin@samsung.com> wrote:
>  Hi!
>
>> What I thought you meant was that a non-LPAE kernel didn't
>> work at all if we told it about the high-MMIO window (which
>> would mean we'd need to *not* put that in the dtb if we
>> wanted to avoid breaking non-LPAE guests that didn't care
>> about the other window.)
>
>  Current generic PCI driver is not so smart. It simply tries to map all resources using devm_request_resource() in a loop. If a single call fails, the driver thinks that it cannot work and fails. It does not try to ignore inaccessible regions.

Then this is a kernel bug and the kernel should be fixed.

>> >  The behavior which i explained above causes boot problems if our
>> > configuration assumes that we boot off emulated PCI device. Because
>> > PCI controller becomes unusable.
>>
>> ...which is what you're saying here.
>>
>> Which is it?
>
>  I don't understand this last question...

I was confused about whether what happened was
(a) the high region is just ignored
(b) the presence of the high region in the device tree blob causes
things which previously worked to stop working

It sounds like the answer is (b).

>  In order to keep this guest working, we need a possibility to
> disable the new MMIO region in qemu. At least to omit it from the
> device tree.

Yes, this is the workaround, which it sounds like we need.

thanks
-- PMM
Pavel Fedin Aug. 3, 2015, 10:20 a.m. UTC | #27
Hello!

> >  In order to keep this guest working, we need a possibility to
> > disable the new MMIO region in qemu. At least to omit it from the
> > device tree.
> 
> Yes, this is the workaround, which it sounds like we need.

 OK. Just to avoid sending yet another version that gets rejected: which behavior would you prefer for 32 bits?
a) Enable high MMIO by default; owners of old guests will have to add an option to disable it.
b) Disable it by default; if somebody wants to use it on 32 bits, they enable it explicitly.

Kind regards,
Pavel Fedin
Expert Engineer
Samsung Electronics Research center Russia
Alexander Graf Aug. 3, 2015, 8:17 p.m. UTC | #28
> Am 03.08.2015 um 11:20 schrieb Pavel Fedin <p.fedin@samsung.com>:
> 
> Hello!
> 
>>> In order to keep this guest working, we need a possibility to
>>> disable the new MMIO region in qemu. At least to omit it from the
>>> device tree.
>> 
>> Yes, this is the workaround, which it sounds like we need.
> 
> Ok. Just to avoid sending one more version which will be rejected. What behavior for 32 bits would you prefer?
> a) Enable high MMIO by default, owners of old guests will have to add an option to disable it.

I prefer this option.

Alex

> b) Disable it by default. If somebody wants to use it on 32 bits, enable it explicitly.
> 
> Kind regards,
> Pavel Fedin
> Expert Engineer
> Samsung Electronics Research center Russia
> 
>

Patch

diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index f365140..020aad6 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -169,6 +169,8 @@  static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, int irq)
     hwaddr size_pio = memmap[VIRT_PCIE_PIO].size;
     hwaddr base_ecam = memmap[VIRT_PCIE_ECAM].base;
     hwaddr size_ecam = memmap[VIRT_PCIE_ECAM].size;
+    hwaddr base_mmio_high = memmap[VIRT_PCIE_MMIO_HIGH].base;
+    hwaddr size_mmio_high = memmap[VIRT_PCIE_MMIO_HIGH].size;
     int nr_pcie_buses = size_ecam / PCIE_MMCFG_SIZE_MIN;
 
     Aml *dev = aml_device("%s", "PCI0");
@@ -234,6 +236,12 @@  static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, int irq)
                      AML_ENTIRE_RANGE, 0x0000, 0x0000, size_pio - 1, base_pio,
                      size_pio));
 
+    aml_append(rbuf,
+        aml_dword_memory(AML_POS_DECODE, AML_MIN_FIXED, AML_MAX_FIXED,
+                         AML_NON_CACHEABLE, AML_READ_WRITE, 0x0000,
+                         base_mmio_high, base_mmio_high + size_mmio_high - 1,
+                         0x0000, size_mmio_high));
+
     aml_append(method, aml_name_decl("RBUF", rbuf));
     aml_append(method, aml_return(rbuf));
     aml_append(dev, method);
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index e53ef4c..c20b3b8 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -124,6 +124,7 @@  static const MemMapEntry a15memmap[] = {
     [VIRT_PCIE_PIO] =           { 0x3eff0000, 0x00010000 },
     [VIRT_PCIE_ECAM] =          { 0x3f000000, 0x01000000 },
     [VIRT_MEM] =                { 0x40000000, 30ULL * 1024 * 1024 * 1024 },
+    [VIRT_PCIE_MMIO_HIGH] =   { 0x8000000000, 0x8000000000 },
 };
 
 static const int a15irqmap[] = {
@@ -758,6 +759,8 @@  static void create_pcie(const VirtBoardInfo *vbi, qemu_irq *pic)
     hwaddr size_pio = vbi->memmap[VIRT_PCIE_PIO].size;
     hwaddr base_ecam = vbi->memmap[VIRT_PCIE_ECAM].base;
     hwaddr size_ecam = vbi->memmap[VIRT_PCIE_ECAM].size;
+    hwaddr base_mmio_high = vbi->memmap[VIRT_PCIE_MMIO_HIGH].base;
+    hwaddr size_mmio_high = vbi->memmap[VIRT_PCIE_MMIO_HIGH].size;
     hwaddr base = base_mmio;
     int nr_pcie_buses = size_ecam / PCIE_MMCFG_SIZE_MIN;
     int irq = vbi->irqmap[VIRT_PCIE];
@@ -793,6 +796,12 @@  static void create_pcie(const VirtBoardInfo *vbi, qemu_irq *pic)
     /* Map IO port space */
     sysbus_mmio_map(SYS_BUS_DEVICE(dev), 2, base_pio);
 
+    /* High MMIO space */
+    mmio_alias = g_new0(MemoryRegion, 1);
+    memory_region_init_alias(mmio_alias, OBJECT(dev), "pcie-mmio-high",
+                             mmio_reg, base_mmio_high, size_mmio_high);
+    memory_region_add_subregion(get_system_memory(), base_mmio_high, mmio_alias);
+
     for (i = 0; i < GPEX_NUM_IRQS; i++) {
         sysbus_connect_irq(SYS_BUS_DEVICE(dev), i, pic[irq + i]);
     }
@@ -818,7 +827,9 @@  static void create_pcie(const VirtBoardInfo *vbi, qemu_irq *pic)
                                  1, FDT_PCI_RANGE_IOPORT, 2, 0,
                                  2, base_pio, 2, size_pio,
                                  1, FDT_PCI_RANGE_MMIO, 2, base_mmio,
-                                 2, base_mmio, 2, size_mmio);
+                                 2, base_mmio, 2, size_mmio,
+                                 1, FDT_PCI_RANGE_MMIO, 2, base_mmio_high,
+                                 2, base_mmio_high, 2, size_mmio_high);
 
     qemu_fdt_setprop_cell(vbi->fdt, nodename, "#interrupt-cells", 1);
     create_pcie_irq_map(vbi, vbi->gic_phandle, irq, nodename);
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index 852efb9..1d43598 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -60,6 +60,7 @@  enum {
     VIRT_PCIE_PIO,
     VIRT_PCIE_ECAM,
     VIRT_PLATFORM_BUS,
+    VIRT_PCIE_MMIO_HIGH,
 };
 
 typedef struct MemMapEntry {