
[RFC,v3,06/15] hw/arm/virt: Allocate device_memory

Message ID 1530602398-16127-7-git-send-email-eric.auger@redhat.com
State New
Series ARM virt: PCDIMM/NVDIMM at 2TB

Commit Message

Eric Auger July 3, 2018, 7:19 a.m. UTC
We define a new hotpluggable RAM region (aka. device memory).
Its base is at 2TB GPA. This obviously requires 42b IPA support
in KVM/ARM, FW and the guest kernel. At the moment the device
memory region is at most 2TB.

This is largely inspired by the device memory initialization in
the pc machine code.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
---
 hw/arm/virt.c         | 104 ++++++++++++++++++++++++++++++++++++--------------
 include/hw/arm/arm.h  |   2 +
 include/hw/arm/virt.h |   1 +
 3 files changed, 79 insertions(+), 28 deletions(-)
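
As a rough usage sketch (not part of this patch; it assumes the pc-dimm
wiring added by later patches in this series, and the sizes/ids below are
made up for illustration):

    qemu-system-aarch64 -M virt,gic-version=3 -cpu host -enable-kvm \
        -m 4G,maxmem=36G,slots=2 ...

    (qemu) object_add memory-backend-ram,id=mem1,size=16G
    (qemu) device_add pc-dimm,id=dimm1,memdev=mem1

This patch itself only allocates the device memory container at 2TB; the
plugged DIMMs would then land inside that region.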

Comments

David Hildenbrand July 3, 2018, 6:25 p.m. UTC | #1
On 03.07.2018 09:19, Eric Auger wrote:
> We define a new hotpluggable RAM region (aka. device memory).
> Its base is 2TB GPA. This obviously requires 42b IPA support
> in KVM/ARM, FW and guest kernel. At the moment the device
> memory region is max 2TB.

Maybe a stupid question, but why exactly does it have to start at 2TB
(and not e.g. at 1TB)?

> 
> This is largely inspired of device memory initialization in
> pc machine code.
> 
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
> ---
>  hw/arm/virt.c         | 104 ++++++++++++++++++++++++++++++++++++--------------
>  include/hw/arm/arm.h  |   2 +
>  include/hw/arm/virt.h |   1 +
>  3 files changed, 79 insertions(+), 28 deletions(-)
> 
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index 5a4d0bf..6fefb78 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -59,6 +59,7 @@
>  #include "qapi/visitor.h"
>  #include "standard-headers/linux/input.h"
>  #include "hw/arm/smmuv3.h"
> +#include "hw/acpi/acpi.h"
>  
>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
>      static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
> @@ -94,34 +95,25 @@
>  
>  #define PLATFORM_BUS_NUM_IRQS 64
>  
> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this means
> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
> - * address space unallocated and free for future use between 256G and 512G.
> - * If we need to provide more RAM to VMs in the future then we need to:
> - *  * allocate a second bank of RAM starting at 2TB and working up
> - *  * fix the DT and ACPI table generation code in QEMU to correctly
> - *    report two split lumps of RAM to the guest
> - *  * fix KVM in the host kernel to allow guests with >40 bit address spaces
> - * (We don't want to fill all the way up to 512GB with RAM because
> - * we might want it for non-RAM purposes later. Conversely it seems
> - * reasonable to assume that anybody configuring a VM with a quarter
> - * of a terabyte of RAM will be doing it on a host with more than a
> - * terabyte of physical address space.)
> - */
> -#define RAMLIMIT_GB 255
> -#define RAMLIMIT_BYTES (RAMLIMIT_GB * 1024ULL * 1024 * 1024)
> +#define SZ_64K 0x10000
> +#define SZ_1G (1024ULL * 1024 * 1024)
>  
>  /* Addresses and sizes of our components.
> - * 0..128MB is space for a flash device so we can run bootrom code such as UEFI.
> - * 128MB..256MB is used for miscellaneous device I/O.
> - * 256MB..1GB is reserved for possible future PCI support (ie where the
> - * PCI memory window will go if we add a PCI host controller).
> - * 1GB and up is RAM (which may happily spill over into the
> - * high memory region beyond 4GB).
> - * This represents a compromise between how much RAM can be given to
> - * a 32 bit VM and leaving space for expansion and in particular for PCI.
> - * Note that devices should generally be placed at multiples of 0x10000,
> + * 0..128MB is space for a flash device so we can run bootrom code such as UEFI,
> + * 128MB..256MB is used for miscellaneous device I/O,
> + * 256MB..1GB is used for PCI host controller,
> + * 1GB..256GB is RAM (not hotpluggable),
> + * 256GB..512GB: is left for device I/O (non RAM purpose),
> + * 512GB..1TB: high mem PCI MMIO region,
> + * 2TB..4TB is used for hot-pluggable DIMM (assumes 42b GPA is supported).
> + *
> + * Note that IO devices should generally be placed at multiples of 0x10000,
>   * to accommodate guests using 64K pages.
> + *
> + * Conversely it seems reasonable to assume that anybody configuring a VM
> + * with a quarter of a terabyte of RAM will be doing it on a host with more
> + * than a terabyte of physical address space.)
> + *
>   */
>  static const MemMapEntry a15memmap[] = {
>      /* Space up to 0x8000000 is reserved for a boot ROM */
> @@ -148,12 +140,13 @@ static const MemMapEntry a15memmap[] = {
>      [VIRT_PCIE_MMIO] =          { 0x10000000, 0x2eff0000 },
>      [VIRT_PCIE_PIO] =           { 0x3eff0000, 0x00010000 },
>      [VIRT_PCIE_ECAM] =          { 0x3f000000, 0x01000000 },
> -    [VIRT_MEM] =                { 0x40000000, RAMLIMIT_BYTES },
> +    [VIRT_MEM] =                { SZ_1G , 255 * SZ_1G },
>      /* Additional 64 MB redist region (can contain up to 512 redistributors) */
>      [VIRT_GIC_REDIST2] =        { 0x4000000000ULL, 0x4000000 },
>      [VIRT_PCIE_ECAM_HIGH] =     { 0x4010000000ULL, 0x10000000 },
>      /* Second PCIe window, 512GB wide at the 512GB boundary */
> -    [VIRT_PCIE_MMIO_HIGH] =   { 0x8000000000ULL, 0x8000000000ULL },
> +    [VIRT_PCIE_MMIO_HIGH] =     { 512 * SZ_1G, 512 * SZ_1G },
> +    [VIRT_HOTPLUG_MEM] =        { 2048 * SZ_1G, 2048 * SZ_1G },
>  };
>  
>  static const int a15irqmap[] = {
> @@ -1223,6 +1216,58 @@ static void create_secure_ram(VirtMachineState *vms,
>      g_free(nodename);
>  }
>  
> +static void create_device_memory(VirtMachineState *vms, MemoryRegion *sysmem)
> +{
> +    MachineState *ms = MACHINE(vms);
> +    uint64_t device_memory_size;
> +    uint64_t align = SZ_64K;
> +
> +    /* always allocate the device memory information */
> +    ms->device_memory = g_malloc0(sizeof(*ms->device_memory));
> +
> +    if (vms->max_vm_phys_shift < 42) {
> +        /* device memory starts at 2TB whereas this VM supports less than
> +         * 2TB GPA */
> +        if (ms->maxram_size > ms->ram_size || ms->ram_slots) {
> +            MachineClass *mc = MACHINE_GET_CLASS(ms);
> +
> +            error_report("\"-memory 'slots|maxmem'\" is not supported by %s "
> +                         "since KVM does not support more than 41b IPA",
> +                         mc->name);
> +            exit(EXIT_FAILURE);
> +        }
> +        return;
> +    }
> +
> +    if (ms->ram_slots > ACPI_MAX_RAM_SLOTS) {
> +        error_report("unsupported number of memory slots: %"PRIu64,
> +                     ms->ram_slots);
> +        exit(EXIT_FAILURE);
> +    }
> +
> +    if (QEMU_ALIGN_UP(ms->maxram_size, align) != ms->maxram_size) {
> +        error_report("maximum memory size must be aligned to multiple of 0x%"
> +                     PRIx64, align);
> +            exit(EXIT_FAILURE);
> +    }
> +
> +    ms->device_memory->base = vms->memmap[VIRT_HOTPLUG_MEM].base;
> +    device_memory_size = ms->maxram_size - ms->ram_size;
> +
> +    if (device_memory_size > vms->memmap[VIRT_HOTPLUG_MEM].size) {
> +        error_report("unsupported amount of maximum memory: " RAM_ADDR_FMT,
> +                         ms->maxram_size);
> +        exit(EXIT_FAILURE);
> +    }
> +
> +    memory_region_init(&ms->device_memory->mr, OBJECT(vms),
> +                       "device-memory", device_memory_size);
> +    memory_region_add_subregion(sysmem, ms->device_memory->base,
> +                                &ms->device_memory->mr);
> +    vms->bootinfo.device_memory_start = ms->device_memory->base;
> +    vms->bootinfo.device_memory_size = device_memory_size;
> +}
> +
>  static void *machvirt_dtb(const struct arm_boot_info *binfo, int *fdt_size)
>  {
>      const VirtMachineState *board = container_of(binfo, VirtMachineState,
> @@ -1430,7 +1475,8 @@ static void machvirt_init(MachineState *machine)
>      vms->smp_cpus = smp_cpus;
>  
>      if (machine->ram_size > vms->memmap[VIRT_MEM].size) {
> -        error_report("mach-virt: cannot model more than %dGB RAM", RAMLIMIT_GB);
> +        error_report("mach-virt: cannot model more than %dGB RAM",
> +                     (int)(vms->memmap[VIRT_MEM].size / SZ_1G));
>          exit(1);
>      }
>  
> @@ -1525,6 +1571,8 @@ static void machvirt_init(MachineState *machine)
>                                           machine->ram_size);
>      memory_region_add_subregion(sysmem, vms->memmap[VIRT_MEM].base, ram);
>  
> +    create_device_memory(vms, sysmem);
> +
>      create_flash(vms, sysmem, secure_sysmem ? secure_sysmem : sysmem);
>  
>      create_gic(vms, pic);
> diff --git a/include/hw/arm/arm.h b/include/hw/arm/arm.h
> index ffed392..76269e6 100644
> --- a/include/hw/arm/arm.h
> +++ b/include/hw/arm/arm.h
> @@ -116,6 +116,8 @@ struct arm_boot_info {
>      bool secure_board_setup;
>  
>      arm_endianness endianness;
> +    hwaddr device_memory_start;
> +    hwaddr device_memory_size;
>  };
>  
>  /**
> diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
> index 91f6de2..173938d 100644
> --- a/include/hw/arm/virt.h
> +++ b/include/hw/arm/virt.h
> @@ -78,6 +78,7 @@ enum {
>      VIRT_GPIO,
>      VIRT_SECURE_UART,
>      VIRT_SECURE_MEM,
> +    VIRT_HOTPLUG_MEM,
>  };
>  
>  typedef enum VirtIOMMUType {
>
Eric Auger July 3, 2018, 7:27 p.m. UTC | #2
Hi David,
On 07/03/2018 08:25 PM, David Hildenbrand wrote:
> On 03.07.2018 09:19, Eric Auger wrote:
>> We define a new hotpluggable RAM region (aka. device memory).
>> Its base is 2TB GPA. This obviously requires 42b IPA support
>> in KVM/ARM, FW and guest kernel. At the moment the device
>> memory region is max 2TB.
> 
> Maybe a stupid question, but why exactly does it have to start at 2TB
> (and not e.g. at 1TB)?
not a stupid question. See tentative answer below.
> 
>>
>> This is largely inspired of device memory initialization in
>> pc machine code.
>>
>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
>> ---
>>  hw/arm/virt.c         | 104 ++++++++++++++++++++++++++++++++++++--------------
>>  include/hw/arm/arm.h  |   2 +
>>  include/hw/arm/virt.h |   1 +
>>  3 files changed, 79 insertions(+), 28 deletions(-)
>>
>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>> index 5a4d0bf..6fefb78 100644
>> --- a/hw/arm/virt.c
>> +++ b/hw/arm/virt.c
>> @@ -59,6 +59,7 @@
>>  #include "qapi/visitor.h"
>>  #include "standard-headers/linux/input.h"
>>  #include "hw/arm/smmuv3.h"
>> +#include "hw/acpi/acpi.h"
>>  
>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
>> @@ -94,34 +95,25 @@
>>  
>>  #define PLATFORM_BUS_NUM_IRQS 64
>>  
>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this means
>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
>> - * address space unallocated and free for future use between 256G and 512G.
>> - * If we need to provide more RAM to VMs in the future then we need to:
>> - *  * allocate a second bank of RAM starting at 2TB and working up
I acknowledge this comment was the main justification. Now if you look at

Principles of ARM Memory Maps
http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf
chapter 2.3 you will find that when adding PA bits, you always leave
space for reserved space and mapped IO.

On the other hand, if you look at chapter 5, "Proposed 44-bit and 48bit
Address Maps", we should logically put the additional RAM at 8TB if we
want to comply with that doc.

Peter, was there any other justification why we should put the RAM at 2TB?

Thanks

Eric


>> - *  * fix the DT and ACPI table generation code in QEMU to correctly
>> - *    report two split lumps of RAM to the guest
>> - *  * fix KVM in the host kernel to allow guests with >40 bit address spaces
>> - * (We don't want to fill all the way up to 512GB with RAM because
>> - * we might want it for non-RAM purposes later. Conversely it seems
>> - * reasonable to assume that anybody configuring a VM with a quarter
>> - * of a terabyte of RAM will be doing it on a host with more than a
>> - * terabyte of physical address space.)
>> - */
>> -#define RAMLIMIT_GB 255
>> -#define RAMLIMIT_BYTES (RAMLIMIT_GB * 1024ULL * 1024 * 1024)
>> +#define SZ_64K 0x10000
>> +#define SZ_1G (1024ULL * 1024 * 1024)
>>  
>>  /* Addresses and sizes of our components.
>> - * 0..128MB is space for a flash device so we can run bootrom code such as UEFI.
>> - * 128MB..256MB is used for miscellaneous device I/O.
>> - * 256MB..1GB is reserved for possible future PCI support (ie where the
>> - * PCI memory window will go if we add a PCI host controller).
>> - * 1GB and up is RAM (which may happily spill over into the
>> - * high memory region beyond 4GB).
>> - * This represents a compromise between how much RAM can be given to
>> - * a 32 bit VM and leaving space for expansion and in particular for PCI.
>> - * Note that devices should generally be placed at multiples of 0x10000,
>> + * 0..128MB is space for a flash device so we can run bootrom code such as UEFI,
>> + * 128MB..256MB is used for miscellaneous device I/O,
>> + * 256MB..1GB is used for PCI host controller,
>> + * 1GB..256GB is RAM (not hotpluggable),
>> + * 256GB..512GB: is left for device I/O (non RAM purpose),
>> + * 512GB..1TB: high mem PCI MMIO region,
>> + * 2TB..4TB is used for hot-pluggable DIMM (assumes 42b GPA is supported).
>> + *
>> + * Note that IO devices should generally be placed at multiples of 0x10000,
>>   * to accommodate guests using 64K pages.
>> + *
>> + * Conversely it seems reasonable to assume that anybody configuring a VM
>> + * with a quarter of a terabyte of RAM will be doing it on a host with more
>> + * than a terabyte of physical address space.)
>> + *
>>   */
>>  static const MemMapEntry a15memmap[] = {
>>      /* Space up to 0x8000000 is reserved for a boot ROM */
>> @@ -148,12 +140,13 @@ static const MemMapEntry a15memmap[] = {
>>      [VIRT_PCIE_MMIO] =          { 0x10000000, 0x2eff0000 },
>>      [VIRT_PCIE_PIO] =           { 0x3eff0000, 0x00010000 },
>>      [VIRT_PCIE_ECAM] =          { 0x3f000000, 0x01000000 },
>> -    [VIRT_MEM] =                { 0x40000000, RAMLIMIT_BYTES },
>> +    [VIRT_MEM] =                { SZ_1G , 255 * SZ_1G },
>>      /* Additional 64 MB redist region (can contain up to 512 redistributors) */
>>      [VIRT_GIC_REDIST2] =        { 0x4000000000ULL, 0x4000000 },
>>      [VIRT_PCIE_ECAM_HIGH] =     { 0x4010000000ULL, 0x10000000 },
>>      /* Second PCIe window, 512GB wide at the 512GB boundary */
>> -    [VIRT_PCIE_MMIO_HIGH] =   { 0x8000000000ULL, 0x8000000000ULL },
>> +    [VIRT_PCIE_MMIO_HIGH] =     { 512 * SZ_1G, 512 * SZ_1G },
>> +    [VIRT_HOTPLUG_MEM] =        { 2048 * SZ_1G, 2048 * SZ_1G },
>>  };
>>  
>>  static const int a15irqmap[] = {
>> @@ -1223,6 +1216,58 @@ static void create_secure_ram(VirtMachineState *vms,
>>      g_free(nodename);
>>  }
>>  
>> +static void create_device_memory(VirtMachineState *vms, MemoryRegion *sysmem)
>> +{
>> +    MachineState *ms = MACHINE(vms);
>> +    uint64_t device_memory_size;
>> +    uint64_t align = SZ_64K;
>> +
>> +    /* always allocate the device memory information */
>> +    ms->device_memory = g_malloc0(sizeof(*ms->device_memory));
>> +
>> +    if (vms->max_vm_phys_shift < 42) {
>> +        /* device memory starts at 2TB whereas this VM supports less than
>> +         * 2TB GPA */
>> +        if (ms->maxram_size > ms->ram_size || ms->ram_slots) {
>> +            MachineClass *mc = MACHINE_GET_CLASS(ms);
>> +
>> +            error_report("\"-memory 'slots|maxmem'\" is not supported by %s "
>> +                         "since KVM does not support more than 41b IPA",
>> +                         mc->name);
>> +            exit(EXIT_FAILURE);
>> +        }
>> +        return;
>> +    }
>> +
>> +    if (ms->ram_slots > ACPI_MAX_RAM_SLOTS) {
>> +        error_report("unsupported number of memory slots: %"PRIu64,
>> +                     ms->ram_slots);
>> +        exit(EXIT_FAILURE);
>> +    }
>> +
>> +    if (QEMU_ALIGN_UP(ms->maxram_size, align) != ms->maxram_size) {
>> +        error_report("maximum memory size must be aligned to multiple of 0x%"
>> +                     PRIx64, align);
>> +            exit(EXIT_FAILURE);
>> +    }
>> +
>> +    ms->device_memory->base = vms->memmap[VIRT_HOTPLUG_MEM].base;
>> +    device_memory_size = ms->maxram_size - ms->ram_size;
>> +
>> +    if (device_memory_size > vms->memmap[VIRT_HOTPLUG_MEM].size) {
>> +        error_report("unsupported amount of maximum memory: " RAM_ADDR_FMT,
>> +                         ms->maxram_size);
>> +        exit(EXIT_FAILURE);
>> +    }
>> +
>> +    memory_region_init(&ms->device_memory->mr, OBJECT(vms),
>> +                       "device-memory", device_memory_size);
>> +    memory_region_add_subregion(sysmem, ms->device_memory->base,
>> +                                &ms->device_memory->mr);
>> +    vms->bootinfo.device_memory_start = ms->device_memory->base;
>> +    vms->bootinfo.device_memory_size = device_memory_size;
>> +}
>> +
>>  static void *machvirt_dtb(const struct arm_boot_info *binfo, int *fdt_size)
>>  {
>>      const VirtMachineState *board = container_of(binfo, VirtMachineState,
>> @@ -1430,7 +1475,8 @@ static void machvirt_init(MachineState *machine)
>>      vms->smp_cpus = smp_cpus;
>>  
>>      if (machine->ram_size > vms->memmap[VIRT_MEM].size) {
>> -        error_report("mach-virt: cannot model more than %dGB RAM", RAMLIMIT_GB);
>> +        error_report("mach-virt: cannot model more than %dGB RAM",
>> +                     (int)(vms->memmap[VIRT_MEM].size / SZ_1G));
>>          exit(1);
>>      }
>>  
>> @@ -1525,6 +1571,8 @@ static void machvirt_init(MachineState *machine)
>>                                           machine->ram_size);
>>      memory_region_add_subregion(sysmem, vms->memmap[VIRT_MEM].base, ram);
>>  
>> +    create_device_memory(vms, sysmem);
>> +
>>      create_flash(vms, sysmem, secure_sysmem ? secure_sysmem : sysmem);
>>  
>>      create_gic(vms, pic);
>> diff --git a/include/hw/arm/arm.h b/include/hw/arm/arm.h
>> index ffed392..76269e6 100644
>> --- a/include/hw/arm/arm.h
>> +++ b/include/hw/arm/arm.h
>> @@ -116,6 +116,8 @@ struct arm_boot_info {
>>      bool secure_board_setup;
>>  
>>      arm_endianness endianness;
>> +    hwaddr device_memory_start;
>> +    hwaddr device_memory_size;
>>  };
>>  
>>  /**
>> diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
>> index 91f6de2..173938d 100644
>> --- a/include/hw/arm/virt.h
>> +++ b/include/hw/arm/virt.h
>> @@ -78,6 +78,7 @@ enum {
>>      VIRT_GPIO,
>>      VIRT_SECURE_UART,
>>      VIRT_SECURE_MEM,
>> +    VIRT_HOTPLUG_MEM,
>>  };
>>  
>>  typedef enum VirtIOMMUType {
>>
> 
>
David Hildenbrand July 4, 2018, 12:05 p.m. UTC | #3
On 03.07.2018 21:27, Auger Eric wrote:
> Hi David,
> On 07/03/2018 08:25 PM, David Hildenbrand wrote:
>> On 03.07.2018 09:19, Eric Auger wrote:
>>> We define a new hotpluggable RAM region (aka. device memory).
>>> Its base is 2TB GPA. This obviously requires 42b IPA support
>>> in KVM/ARM, FW and guest kernel. At the moment the device
>>> memory region is max 2TB.
>>
>> Maybe a stupid question, but why exactly does it have to start at 2TB
>> (and not e.g. at 1TB)?
> not a stupid question. See tentative answer below.
>>
>>>
>>> This is largely inspired of device memory initialization in
>>> pc machine code.
>>>
>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
>>> ---
>>>  hw/arm/virt.c         | 104 ++++++++++++++++++++++++++++++++++++--------------
>>>  include/hw/arm/arm.h  |   2 +
>>>  include/hw/arm/virt.h |   1 +
>>>  3 files changed, 79 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>> index 5a4d0bf..6fefb78 100644
>>> --- a/hw/arm/virt.c
>>> +++ b/hw/arm/virt.c
>>> @@ -59,6 +59,7 @@
>>>  #include "qapi/visitor.h"
>>>  #include "standard-headers/linux/input.h"
>>>  #include "hw/arm/smmuv3.h"
>>> +#include "hw/acpi/acpi.h"
>>>  
>>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
>>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
>>> @@ -94,34 +95,25 @@
>>>  
>>>  #define PLATFORM_BUS_NUM_IRQS 64
>>>  
>>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this means
>>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
>>> - * address space unallocated and free for future use between 256G and 512G.
>>> - * If we need to provide more RAM to VMs in the future then we need to:
>>> - *  * allocate a second bank of RAM starting at 2TB and working up
> I acknowledge this comment was the main justification. Now if you look at
> 
> Principles of ARM Memory Maps
> http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf
> chapter 2.3 you will find that when adding PA bits, you always leave
> space for reserved space and mapped IO.

Thanks for the pointer!

So ... we can fit

a) 2GB at 2GB
b) 32GB at 32GB
c) 512GB at 512GB
d) 8TB at 8TB
e) 128TB at 128TB

(this is a nice rule of thumb if I understand it correctly :) )

We should strive for device memory (maxram_size - ram_size) to fit
exactly into one of these slots (otherwise things get nasty).

Depending on the ram_size, we might have simpler setups and can support
more configurations, no?

E.g. ram_size <= 34GB, device_memory <= 512GB
-> move ram into a) and b)
-> move device memory into c)

We should make up our mind right from the beginning how our setup will
look, so we can avoid (possibly complicated) compatibility handling
later on.
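
As a rough sketch of that fit check (purely illustrative, reusing SZ_1G
from the patch; not proposed code):

/* Pick the smallest "size N at address N" slot from the rule of thumb
 * above (2GB@2GB, 32GB@32GB, 512GB@512GB, 8TB@8TB, 128TB@128TB) that can
 * hold the whole device memory area; returns 0 if nothing fits. */
static hwaddr pick_device_memory_base(uint64_t device_memory_size)
{
    static const hwaddr slots[] = {
        2 * SZ_1G, 32 * SZ_1G, 512 * SZ_1G,
        8192 * SZ_1G, 131072 * SZ_1G,
    };
    int i;

    for (i = 0; i < ARRAY_SIZE(slots); i++) {
        if (device_memory_size <= slots[i]) {
            return slots[i];    /* for these slots, base == size */
        }
    }
    return 0;
}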

> 
> On the other hand, if you look at chapter 5, "Proposed 44-bit and 48bit
> Address Maps", we should logically put the additional RAM at 8TB if we
> want to comply with that doc.

I agree, 2TB is in the reserved area.

> 
> Peter, was there any other justification why we should put the RAM at 2TB?
> 
> Thanks
> 
> Eric
Eric Auger July 5, 2018, 11:42 a.m. UTC | #4
Hi David,

On 07/04/2018 02:05 PM, David Hildenbrand wrote:
> On 03.07.2018 21:27, Auger Eric wrote:
>> Hi David,
>> On 07/03/2018 08:25 PM, David Hildenbrand wrote:
>>> On 03.07.2018 09:19, Eric Auger wrote:
>>>> We define a new hotpluggable RAM region (aka. device memory).
>>>> Its base is 2TB GPA. This obviously requires 42b IPA support
>>>> in KVM/ARM, FW and guest kernel. At the moment the device
>>>> memory region is max 2TB.
>>>
>>> Maybe a stupid question, but why exactly does it have to start at 2TB
>>> (and not e.g. at 1TB)?
>> not a stupid question. See tentative answer below.
>>>
>>>>
>>>> This is largely inspired of device memory initialization in
>>>> pc machine code.
>>>>
>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
>>>> ---
>>>>  hw/arm/virt.c         | 104 ++++++++++++++++++++++++++++++++++++--------------
>>>>  include/hw/arm/arm.h  |   2 +
>>>>  include/hw/arm/virt.h |   1 +
>>>>  3 files changed, 79 insertions(+), 28 deletions(-)
>>>>
>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>> index 5a4d0bf..6fefb78 100644
>>>> --- a/hw/arm/virt.c
>>>> +++ b/hw/arm/virt.c
>>>> @@ -59,6 +59,7 @@
>>>>  #include "qapi/visitor.h"
>>>>  #include "standard-headers/linux/input.h"
>>>>  #include "hw/arm/smmuv3.h"
>>>> +#include "hw/acpi/acpi.h"
>>>>  
>>>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
>>>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
>>>> @@ -94,34 +95,25 @@
>>>>  
>>>>  #define PLATFORM_BUS_NUM_IRQS 64
>>>>  
>>>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this means
>>>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
>>>> - * address space unallocated and free for future use between 256G and 512G.
>>>> - * If we need to provide more RAM to VMs in the future then we need to:
>>>> - *  * allocate a second bank of RAM starting at 2TB and working up
>> I acknowledge this comment was the main justification. Now if you look at
>>
>> Principles of ARM Memory Maps
>> http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf
>> chapter 2.3 you will find that when adding PA bits, you always leave
>> space for reserved space and mapped IO.
> 
> Thanks for the pointer!
> 
> So ... we can fit
> 
> a) 2GB at 2GB
> b) 32GB at 32GB
> c) 512GB at 512GB
> d) 8TB at 8TB
> e) 128TB at 128TB
> 
> (this is a nice rule of thumb if I understand it correctly :) )
> 
> We should strive for device memory (maxram_size - ram_size) to fit
> exactly into one of these slots (otherwise things get nasty).
> 
> Depending on the ram_size, we might have simpler setups and can support
> more configurations, no?
> 
> E.g. ram_size <= 34GB, device_memory <= 512GB
> -> move ram into a) and b)
> -> move device memory into c)

The issue is machvirt doesn't comply with that document.
At the moment we have
0 -> 1GB MMIO
1GB -> 256GB RAM
256GB -> 512GB is theoretically reserved for IO but most is free.
512GB -> 1T is reserved for ECAM MMIO range. This is the top of our
existing 40b GPA space.

We don't want to change this address map due to legacy reasons.

Another question! Do you know if it would be possible to have the
device_memory region split into several discontinuous segments?

Thanks

Eric
> 
> We should make up our mind right from the beginning how our setup will
> look, so we can avoid (possibly complicated) compatibility handling
> later on.
> 
>>
>> On the other hand, if you look at chapter 5, "Proposed 44-bit and 48bit
>> Address Maps", we should logically put the additional RAM at 8TB if we
>> want to comply with that doc.
> 
> I agree, 2TB es in the reserved area.
> 
>>
>> Peter, was there any other justification why we should put the RAM at 2TB?
>>
>> Thanks
>>
>> Eric
> 
>
David Hildenbrand July 5, 2018, 11:54 a.m. UTC | #5
On 05.07.2018 13:42, Auger Eric wrote:
> Hi David,
> 
> On 07/04/2018 02:05 PM, David Hildenbrand wrote:
>> On 03.07.2018 21:27, Auger Eric wrote:
>>> Hi David,
>>> On 07/03/2018 08:25 PM, David Hildenbrand wrote:
>>>> On 03.07.2018 09:19, Eric Auger wrote:
>>>>> We define a new hotpluggable RAM region (aka. device memory).
>>>>> Its base is 2TB GPA. This obviously requires 42b IPA support
>>>>> in KVM/ARM, FW and guest kernel. At the moment the device
>>>>> memory region is max 2TB.
>>>>
>>>> Maybe a stupid question, but why exactly does it have to start at 2TB
>>>> (and not e.g. at 1TB)?
>>> not a stupid question. See tentative answer below.
>>>>
>>>>>
>>>>> This is largely inspired of device memory initialization in
>>>>> pc machine code.
>>>>>
>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
>>>>> ---
>>>>>  hw/arm/virt.c         | 104 ++++++++++++++++++++++++++++++++++++--------------
>>>>>  include/hw/arm/arm.h  |   2 +
>>>>>  include/hw/arm/virt.h |   1 +
>>>>>  3 files changed, 79 insertions(+), 28 deletions(-)
>>>>>
>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>>> index 5a4d0bf..6fefb78 100644
>>>>> --- a/hw/arm/virt.c
>>>>> +++ b/hw/arm/virt.c
>>>>> @@ -59,6 +59,7 @@
>>>>>  #include "qapi/visitor.h"
>>>>>  #include "standard-headers/linux/input.h"
>>>>>  #include "hw/arm/smmuv3.h"
>>>>> +#include "hw/acpi/acpi.h"
>>>>>  
>>>>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
>>>>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
>>>>> @@ -94,34 +95,25 @@
>>>>>  
>>>>>  #define PLATFORM_BUS_NUM_IRQS 64
>>>>>  
>>>>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this means
>>>>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
>>>>> - * address space unallocated and free for future use between 256G and 512G.
>>>>> - * If we need to provide more RAM to VMs in the future then we need to:
>>>>> - *  * allocate a second bank of RAM starting at 2TB and working up
>>> I acknowledge this comment was the main justification. Now if you look at
>>>
>>> Principles of ARM Memory Maps
>>> http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf
>>> chapter 2.3 you will find that when adding PA bits, you always leave
>>> space for reserved space and mapped IO.
>>
>> Thanks for the pointer!
>>
>> So ... we can fit
>>
>> a) 2GB at 2GB
>> b) 32GB at 32GB
>> c) 512GB at 512GB
>> d) 8TB at 8TB
>> e) 128TB at 128TB
>>
>> (this is a nice rule of thumb if I understand it correctly :) )
>>
>> We should strive for device memory (maxram_size - ram_size) to fit
>> exactly into one of these slots (otherwise things get nasty).
>>
>> Depending on the ram_size, we might have simpler setups and can support
>> more configurations, no?
>>
>> E.g. ram_size <= 34GB, device_memory <= 512GB
>> -> move ram into a) and b)
>> -> move device memory into c)
> 
> The issue is machvirt doesn't comply with that document.
> At the moment we have
> 0 -> 1GB MMIO
> 1GB -> 256GB RAM
> 256GB -> 512GB is theoretically reserved for IO but most is free.
> 512GB -> 1T is reserved for ECAM MMIO range. This is the top of our
> existing 40b GPA space.
> 
> We don't want to change this address map due to legacy reasons.
> 

Thanks, good to know!

> Another question! do you know if it would be possible to have
> device_memory region split into several discontinuous segments?

It can be implemented for sure, but I would try to avoid that, as it
makes certain configurations impossible (and very end user unfriendly).

E.g. (numbers completely made up, but it should show what I mean)

-m 20G,maxmem=120G:
-> Try to add a DIMM with 100G -> error.
-> But we can add e.g. two DIMMs with 40G and 60G.

This exposes internal details to the end user. And the end user has no
idea what is going on.

So I think we should try our best to keep that area consecutive.

> 
> Thanks
> 
> Eric
Eric Auger July 5, 2018, noon UTC | #6
Hi David,

On 07/05/2018 01:54 PM, David Hildenbrand wrote:
> On 05.07.2018 13:42, Auger Eric wrote:
>> Hi David,
>>
>> On 07/04/2018 02:05 PM, David Hildenbrand wrote:
>>> On 03.07.2018 21:27, Auger Eric wrote:
>>>> Hi David,
>>>> On 07/03/2018 08:25 PM, David Hildenbrand wrote:
>>>>> On 03.07.2018 09:19, Eric Auger wrote:
>>>>>> We define a new hotpluggable RAM region (aka. device memory).
>>>>>> Its base is 2TB GPA. This obviously requires 42b IPA support
>>>>>> in KVM/ARM, FW and guest kernel. At the moment the device
>>>>>> memory region is max 2TB.
>>>>>
>>>>> Maybe a stupid question, but why exactly does it have to start at 2TB
>>>>> (and not e.g. at 1TB)?
>>>> not a stupid question. See tentative answer below.
>>>>>
>>>>>>
>>>>>> This is largely inspired of device memory initialization in
>>>>>> pc machine code.
>>>>>>
>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
>>>>>> ---
>>>>>>  hw/arm/virt.c         | 104 ++++++++++++++++++++++++++++++++++++--------------
>>>>>>  include/hw/arm/arm.h  |   2 +
>>>>>>  include/hw/arm/virt.h |   1 +
>>>>>>  3 files changed, 79 insertions(+), 28 deletions(-)
>>>>>>
>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>>>> index 5a4d0bf..6fefb78 100644
>>>>>> --- a/hw/arm/virt.c
>>>>>> +++ b/hw/arm/virt.c
>>>>>> @@ -59,6 +59,7 @@
>>>>>>  #include "qapi/visitor.h"
>>>>>>  #include "standard-headers/linux/input.h"
>>>>>>  #include "hw/arm/smmuv3.h"
>>>>>> +#include "hw/acpi/acpi.h"
>>>>>>  
>>>>>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
>>>>>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
>>>>>> @@ -94,34 +95,25 @@
>>>>>>  
>>>>>>  #define PLATFORM_BUS_NUM_IRQS 64
>>>>>>  
>>>>>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this means
>>>>>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
>>>>>> - * address space unallocated and free for future use between 256G and 512G.
>>>>>> - * If we need to provide more RAM to VMs in the future then we need to:
>>>>>> - *  * allocate a second bank of RAM starting at 2TB and working up
>>>> I acknowledge this comment was the main justification. Now if you look at
>>>>
>>>> Principles of ARM Memory Maps
>>>> http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf
>>>> chapter 2.3 you will find that when adding PA bits, you always leave
>>>> space for reserved space and mapped IO.
>>>
>>> Thanks for the pointer!
>>>
>>> So ... we can fit
>>>
>>> a) 2GB at 2GB
>>> b) 32GB at 32GB
>>> c) 512GB at 512GB
>>> d) 8TB at 8TB
>>> e) 128TB at 128TB
>>>
>>> (this is a nice rule of thumb if I understand it correctly :) )
>>>
>>> We should strive for device memory (maxram_size - ram_size) to fit
>>> exactly into one of these slots (otherwise things get nasty).
>>>
>>> Depending on the ram_size, we might have simpler setups and can support
>>> more configurations, no?
>>>
>>> E.g. ram_size <= 34GB, device_memory <= 512GB
>>> -> move ram into a) and b)
>>> -> move device memory into c)
>>
>> The issue is machvirt doesn't comply with that document.
>> At the moment we have
>> 0 -> 1GB MMIO
>> 1GB -> 256GB RAM
>> 256GB -> 512GB is theoretically reserved for IO but most is free.
>> 512GB -> 1T is reserved for ECAM MMIO range. This is the top of our
>> existing 40b GPA space.
>>
>> We don't want to change this address map due to legacy reasons.
>>
> 
> Thanks, good to know!
> 
>> Another question! do you know if it would be possible to have
>> device_memory region split into several discontinuous segments?
> 
> It can be implemented for sure, but I would try to avoid that, as it
> makes certain configurations impossible (and very end user unfriendly).
> 
> E.g. (numbers completely made up, but it should show what I mean)
> 
> -m 20G,maxmem=120G:
> -> Try to add a DIMM with 100G -> error.
> -> But we can add e.g. two DIMMs with 40G and 60G.
> 
> This exposes internal details to the end user. And the end user has no
> idea what is going on.
> 
> So I think we should try our best to keep that area consecutive.

Actually I didn't sufficiently detail the context. I would like:
1) one segment to be exposed to the end-user through the slot|maxmem stuff
(what this series targets), and
2) another segment used to instantiate PC-DIMMs for internal needs, as a
replacement for part of the 1GB -> 256GB static RAM. This was the purpose
of Shameer's original series:

[1] [RFC v2 0/6] hw/arm: Add support for non-contiguous iova regions
http://patchwork.ozlabs.org/cover/914694/
This approach is not yet validated though.

The rationale is that sometimes you must have "holes" in RAM, as some GPAs
match reserved IOVAs for assigned devices.

Thanks

Eric

> 
>>
>> Thanks
>>
>> Eric
> 
>
David Hildenbrand July 5, 2018, 12:09 p.m. UTC | #7
On 05.07.2018 14:00, Auger Eric wrote:
> Hi David,
> 
> On 07/05/2018 01:54 PM, David Hildenbrand wrote:
>> On 05.07.2018 13:42, Auger Eric wrote:
>>> Hi David,
>>>
>>> On 07/04/2018 02:05 PM, David Hildenbrand wrote:
>>>> On 03.07.2018 21:27, Auger Eric wrote:
>>>>> Hi David,
>>>>> On 07/03/2018 08:25 PM, David Hildenbrand wrote:
>>>>>> On 03.07.2018 09:19, Eric Auger wrote:
>>>>>>> We define a new hotpluggable RAM region (aka. device memory).
>>>>>>> Its base is 2TB GPA. This obviously requires 42b IPA support
>>>>>>> in KVM/ARM, FW and guest kernel. At the moment the device
>>>>>>> memory region is max 2TB.
>>>>>>
>>>>>> Maybe a stupid question, but why exactly does it have to start at 2TB
>>>>>> (and not e.g. at 1TB)?
>>>>> not a stupid question. See tentative answer below.
>>>>>>
>>>>>>>
>>>>>>> This is largely inspired of device memory initialization in
>>>>>>> pc machine code.
>>>>>>>
>>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>>>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
>>>>>>> ---
>>>>>>>  hw/arm/virt.c         | 104 ++++++++++++++++++++++++++++++++++++--------------
>>>>>>>  include/hw/arm/arm.h  |   2 +
>>>>>>>  include/hw/arm/virt.h |   1 +
>>>>>>>  3 files changed, 79 insertions(+), 28 deletions(-)
>>>>>>>
>>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>>>>> index 5a4d0bf..6fefb78 100644
>>>>>>> --- a/hw/arm/virt.c
>>>>>>> +++ b/hw/arm/virt.c
>>>>>>> @@ -59,6 +59,7 @@
>>>>>>>  #include "qapi/visitor.h"
>>>>>>>  #include "standard-headers/linux/input.h"
>>>>>>>  #include "hw/arm/smmuv3.h"
>>>>>>> +#include "hw/acpi/acpi.h"
>>>>>>>  
>>>>>>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
>>>>>>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
>>>>>>> @@ -94,34 +95,25 @@
>>>>>>>  
>>>>>>>  #define PLATFORM_BUS_NUM_IRQS 64
>>>>>>>  
>>>>>>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this means
>>>>>>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
>>>>>>> - * address space unallocated and free for future use between 256G and 512G.
>>>>>>> - * If we need to provide more RAM to VMs in the future then we need to:
>>>>>>> - *  * allocate a second bank of RAM starting at 2TB and working up
>>>>> I acknowledge this comment was the main justification. Now if you look at
>>>>>
>>>>> Principles of ARM Memory Maps
>>>>> http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf
>>>>> chapter 2.3 you will find that when adding PA bits, you always leave
>>>>> space for reserved space and mapped IO.
>>>>
>>>> Thanks for the pointer!
>>>>
>>>> So ... we can fit
>>>>
>>>> a) 2GB at 2GB
>>>> b) 32GB at 32GB
>>>> c) 512GB at 512GB
>>>> d) 8TB at 8TB
>>>> e) 128TB at 128TB
>>>>
>>>> (this is a nice rule of thumb if I understand it correctly :) )
>>>>
>>>> We should strive for device memory (maxram_size - ram_size) to fit
>>>> exactly into one of these slots (otherwise things get nasty).
>>>>
>>>> Depending on the ram_size, we might have simpler setups and can support
>>>> more configurations, no?
>>>>
>>>> E.g. ram_size <= 34GB, device_memory <= 512GB
>>>> -> move ram into a) and b)
>>>> -> move device memory into c)
>>>
>>> The issue is machvirt doesn't comply with that document.
>>> At the moment we have
>>> 0 -> 1GB MMIO
>>> 1GB -> 256GB RAM
>>> 256GB -> 512GB is theoretically reserved for IO but most is free.
>>> 512GB -> 1T is reserved for ECAM MMIO range. This is the top of our
>>> existing 40b GPA space.
>>>
>>> We don't want to change this address map due to legacy reasons.
>>>
>>
>> Thanks, good to know!
>>
>>> Another question! do you know if it would be possible to have
>>> device_memory region split into several discontinuous segments?
>>
>> It can be implemented for sure, but I would try to avoid that, as it
>> makes certain configurations impossible (and very end user unfriendly).
>>
>> E.g. (numbers completely made up, but it should show what I mean)
>>
>> -m 20G,maxmem=120G:
>> -> Try to add a DIMM with 100G -> error.
>> -> But we can add e.g. two DIMMs with 40G and 60G.
>>
>> This exposes internal details to the end user. And the end user has no
>> idea what is going on.
>>
>> So I think we should try our best to keep that area consecutive.
> 
> Actually I didn't sufficiently detail the context. I would like
> 1) 1 segment to be exposed to the end-user through slot|maxmem stuff
> (what this series targets) and
> 2) another segment used to instantiate PC-DIMM for internal needs as
> replacement of part of the 1GB -> 256GB static RAM. This was the purpose
> of Shameer's original series

I am not sure if PC-DIMMs are exactly what you want for internal purposes.

> 
> [1] [RFC v2 0/6] hw/arm: Add support for non-contiguous iova regions
> http://patchwork.ozlabs.org/cover/914694/
> This approach is not yet validated though.
> 
> The rationale is sometimes you must have "holes" in RAM as some GPAs
> match reserved IOVAs for assigned devices.

So if I understand it correctly, all you want is some memory region that
a) contains only initially defined memory
b) can have some holes in it

This is exactly what x86 already does (pc_memory_init): Simply construct
your own memory region leaving holes in it.


memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
                         0, pcms->below_4g_mem_size);
memory_region_add_subregion(system_memory, 0, ram_below_4g);
...
if (pcms->above_4g_mem_size > 0)
    memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
    ...
    memory_region_add_subregion(system_memory, 0x100000000ULL,
    ...

They "indicate" these different GPA areas using the e820 map to the guest.

Would that also work for you?
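
For illustration only, a rough sketch of what that aliasing could look like
on machvirt (the function name, the hole_base/hole_size parameters and the
alias names are invented here, not something this series proposes):

static void create_ram_with_hole(VirtMachineState *vms, MemoryRegion *sysmem,
                                 MemoryRegion *ram, /* backing RAM region */
                                 hwaddr hole_base, hwaddr hole_size)
{
    MemoryRegion *below = g_new(MemoryRegion, 1);
    MemoryRegion *above = g_new(MemoryRegion, 1);
    hwaddr base = vms->memmap[VIRT_MEM].base;
    uint64_t below_size = hole_base - base;
    uint64_t above_size = memory_region_size(ram) - below_size;

    /* alias the part of "ram" that sits below the hole */
    memory_region_init_alias(below, NULL, "ram-below-hole", ram, 0, below_size);
    memory_region_add_subregion(sysmem, base, below);

    /* alias the remainder of "ram" above the hole */
    if (above_size) {
        memory_region_init_alias(above, NULL, "ram-above-hole", ram,
                                 below_size, above_size);
        memory_region_add_subregion(sysmem, hole_base + hole_size, above);
    }
    /* the DT/ACPI code then has to describe two separate memory ranges */
}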

> 
> Thanks
> 
> Eric
> 
>>
>>>
>>> Thanks
>>>
>>> Eric
>>
>>
Eric Auger July 5, 2018, 12:17 p.m. UTC | #8
Hi David,

On 07/05/2018 02:09 PM, David Hildenbrand wrote:
> On 05.07.2018 14:00, Auger Eric wrote:
>> Hi David,
>>
>> On 07/05/2018 01:54 PM, David Hildenbrand wrote:
>>> On 05.07.2018 13:42, Auger Eric wrote:
>>>> Hi David,
>>>>
>>>> On 07/04/2018 02:05 PM, David Hildenbrand wrote:
>>>>> On 03.07.2018 21:27, Auger Eric wrote:
>>>>>> Hi David,
>>>>>> On 07/03/2018 08:25 PM, David Hildenbrand wrote:
>>>>>>> On 03.07.2018 09:19, Eric Auger wrote:
>>>>>>>> We define a new hotpluggable RAM region (aka. device memory).
>>>>>>>> Its base is 2TB GPA. This obviously requires 42b IPA support
>>>>>>>> in KVM/ARM, FW and guest kernel. At the moment the device
>>>>>>>> memory region is max 2TB.
>>>>>>>
>>>>>>> Maybe a stupid question, but why exactly does it have to start at 2TB
>>>>>>> (and not e.g. at 1TB)?
>>>>>> not a stupid question. See tentative answer below.
>>>>>>>
>>>>>>>>
>>>>>>>> This is largely inspired of device memory initialization in
>>>>>>>> pc machine code.
>>>>>>>>
>>>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>>>>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
>>>>>>>> ---
>>>>>>>>  hw/arm/virt.c         | 104 ++++++++++++++++++++++++++++++++++++--------------
>>>>>>>>  include/hw/arm/arm.h  |   2 +
>>>>>>>>  include/hw/arm/virt.h |   1 +
>>>>>>>>  3 files changed, 79 insertions(+), 28 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>>>>>> index 5a4d0bf..6fefb78 100644
>>>>>>>> --- a/hw/arm/virt.c
>>>>>>>> +++ b/hw/arm/virt.c
>>>>>>>> @@ -59,6 +59,7 @@
>>>>>>>>  #include "qapi/visitor.h"
>>>>>>>>  #include "standard-headers/linux/input.h"
>>>>>>>>  #include "hw/arm/smmuv3.h"
>>>>>>>> +#include "hw/acpi/acpi.h"
>>>>>>>>  
>>>>>>>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
>>>>>>>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
>>>>>>>> @@ -94,34 +95,25 @@
>>>>>>>>  
>>>>>>>>  #define PLATFORM_BUS_NUM_IRQS 64
>>>>>>>>  
>>>>>>>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this means
>>>>>>>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
>>>>>>>> - * address space unallocated and free for future use between 256G and 512G.
>>>>>>>> - * If we need to provide more RAM to VMs in the future then we need to:
>>>>>>>> - *  * allocate a second bank of RAM starting at 2TB and working up
>>>>>> I acknowledge this comment was the main justification. Now if you look at
>>>>>>
>>>>>> Principles of ARM Memory Maps
>>>>>> http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf
>>>>>> chapter 2.3 you will find that when adding PA bits, you always leave
>>>>>> space for reserved space and mapped IO.
>>>>>
>>>>> Thanks for the pointer!
>>>>>
>>>>> So ... we can fit
>>>>>
>>>>> a) 2GB at 2GB
>>>>> b) 32GB at 32GB
>>>>> c) 512GB at 512GB
>>>>> d) 8TB at 8TB
>>>>> e) 128TB at 128TB
>>>>>
>>>>> (this is a nice rule of thumb if I understand it correctly :) )
>>>>>
>>>>> We should strive for device memory (maxram_size - ram_size) to fit
>>>>> exactly into one of these slots (otherwise things get nasty).
>>>>>
>>>>> Depending on the ram_size, we might have simpler setups and can support
>>>>> more configurations, no?
>>>>>
>>>>> E.g. ram_size <= 34GB, device_memory <= 512GB
>>>>> -> move ram into a) and b)
>>>>> -> move device memory into c)
>>>>
>>>> The issue is machvirt doesn't comply with that document.
>>>> At the moment we have
>>>> 0 -> 1GB MMIO
>>>> 1GB -> 256GB RAM
>>>> 256GB -> 512GB is theoretically reserved for IO but most is free.
>>>> 512GB -> 1T is reserved for ECAM MMIO range. This is the top of our
>>>> existing 40b GPA space.
>>>>
>>>> We don't want to change this address map due to legacy reasons.
>>>>
>>>
>>> Thanks, good to know!
>>>
>>>> Another question! do you know if it would be possible to have
>>>> device_memory region split into several discontinuous segments?
>>>
>>> It can be implemented for sure, but I would try to avoid that, as it
>>> makes certain configurations impossible (and very end user unfriendly).
>>>
>>> E.g. (numbers completely made up, but it should show what I mean)
>>>
>>> -m 20G,maxmem=120G:
>>> -> Try to add a DIMM with 100G -> error.
>>> -> But we can add e.g. two DIMMs with 40G and 60G.
>>>
>>> This exposes internal details to the end user. And the end user has no
>>> idea what is going on.
>>>
>>> So I think we should try our best to keep that area consecutive.
>>
>> Actually I didn't sufficiently detail the context. I would like
>> 1) 1 segment to be exposed to the end-user through slot|maxmem stuff
>> (what this series targets) and
>> 2) another segment used to instantiate PC-DIMM for internal needs as
>> replacement of part of the 1GB -> 256GB static RAM. This was the purpose
>> of Shameer's original series
> 
> I am not sure if PC-DIMMs are exactly what you want for internal purposes.
> 
>>
>> [1] [RFC v2 0/6] hw/arm: Add support for non-contiguous iova regions
>> http://patchwork.ozlabs.org/cover/914694/
>> This approach is not yet validated though.
>>
>> The rationale is sometimes you must have "holes" in RAM as some GPAs
>> match reserved IOVAs for assigned devices.
> 
> So if I understand it correctly, all you want is some memory region that
> a) contains only initially defined memory
> b) can have some holes in it
> 
> This is exactly what x86 already does (pc_memory_init): Simply construct
> your own memory region leaving holes in it.
> 
> 
> memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
>                          0, pcms->below_4g_mem_size);
> memory_region_add_subregion(system_memory, 0, ram_below_4g);
> ...
> if (pcms->above_4g_mem_size > 0)
>     memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
>     ...
>     memory_region_add_subregion(system_memory, 0x100000000ULL,
>     ...
> 
> They "indicate" these different GPA areas using the e820 map to the guest.
> 
> Would that also work for you?

I would tentatively say yes. Still, I am not sure that, if we were
to actually put holes in the 1GB-256GB RAM segment, PC-DIMM would be the
natural choice. Also, the reserved IOVA issue affects the device_memory
region area as well, I think. I am skeptical that we can put holes in the
static RAM and device_memory regions like that.

Thanks!

Eric
> 
>>
>> Thanks
>>
>> Eric
>>
>>>
>>>>
>>>> Thanks
>>>>
>>>> Eric
>>>
>>>
> 
>
Shameerali Kolothum Thodi July 5, 2018, 1:19 p.m. UTC | #9
> -----Original Message-----
> From: Auger Eric [mailto:eric.auger@redhat.com]
> Sent: 05 July 2018 13:18
> To: David Hildenbrand <david@redhat.com>; eric.auger.pro@gmail.com;
> qemu-devel@nongnu.org; qemu-arm@nongnu.org; peter.maydell@linaro.org;
> Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> imammedo@redhat.com
> Cc: wei@redhat.com; drjones@redhat.com; david@gibson.dropbear.id.au;
> dgilbert@redhat.com; agraf@suse.de
> Subject: Re: [Qemu-devel] [RFC v3 06/15] hw/arm/virt: Allocate
> device_memory
> 
> Hi David,
> 
> On 07/05/2018 02:09 PM, David Hildenbrand wrote:
> > On 05.07.2018 14:00, Auger Eric wrote:
> >> Hi David,
> >>
> >> On 07/05/2018 01:54 PM, David Hildenbrand wrote:
> >>> On 05.07.2018 13:42, Auger Eric wrote:
> >>>> Hi David,
> >>>>
> >>>> On 07/04/2018 02:05 PM, David Hildenbrand wrote:
> >>>>> On 03.07.2018 21:27, Auger Eric wrote:
> >>>>>> Hi David,
> >>>>>> On 07/03/2018 08:25 PM, David Hildenbrand wrote:
> >>>>>>> On 03.07.2018 09:19, Eric Auger wrote:
> >>>>>>>> We define a new hotpluggable RAM region (aka. device memory).
> >>>>>>>> Its base is 2TB GPA. This obviously requires 42b IPA support
> >>>>>>>> in KVM/ARM, FW and guest kernel. At the moment the device
> >>>>>>>> memory region is max 2TB.
> >>>>>>>
> >>>>>>> Maybe a stupid question, but why exactly does it have to start at 2TB
> >>>>>>> (and not e.g. at 1TB)?
> >>>>>> not a stupid question. See tentative answer below.
> >>>>>>>
> >>>>>>>>
> >>>>>>>> This is largely inspired of device memory initialization in
> >>>>>>>> pc machine code.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> >>>>>>>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
> >>>>>>>> ---
> >>>>>>>>  hw/arm/virt.c         | 104
> ++++++++++++++++++++++++++++++++++++--------------
> >>>>>>>>  include/hw/arm/arm.h  |   2 +
> >>>>>>>>  include/hw/arm/virt.h |   1 +
> >>>>>>>>  3 files changed, 79 insertions(+), 28 deletions(-)
> >>>>>>>>
> >>>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> >>>>>>>> index 5a4d0bf..6fefb78 100644
> >>>>>>>> --- a/hw/arm/virt.c
> >>>>>>>> +++ b/hw/arm/virt.c
> >>>>>>>> @@ -59,6 +59,7 @@
> >>>>>>>>  #include "qapi/visitor.h"
> >>>>>>>>  #include "standard-headers/linux/input.h"
> >>>>>>>>  #include "hw/arm/smmuv3.h"
> >>>>>>>> +#include "hw/acpi/acpi.h"
> >>>>>>>>
> >>>>>>>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
> >>>>>>>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc,
> \
> >>>>>>>> @@ -94,34 +95,25 @@
> >>>>>>>>
> >>>>>>>>  #define PLATFORM_BUS_NUM_IRQS 64
> >>>>>>>>
> >>>>>>>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this
> means
> >>>>>>>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
> >>>>>>>> - * address space unallocated and free for future use between 256G
> and 512G.
> >>>>>>>> - * If we need to provide more RAM to VMs in the future then we
> need to:
> >>>>>>>> - *  * allocate a second bank of RAM starting at 2TB and working up
> >>>>>> I acknowledge this comment was the main justification. Now if you look
> at
> >>>>>>
> >>>>>> Principles of ARM Memory Maps
> >>>>>>
> http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_princ
> iples_of_arm_memory_maps.pdf
> >>>>>> chapter 2.3 you will find that when adding PA bits, you always leave
> >>>>>> space for reserved space and mapped IO.
> >>>>>
> >>>>> Thanks for the pointer!
> >>>>>
> >>>>> So ... we can fit
> >>>>>
> >>>>> a) 2GB at 2GB
> >>>>> b) 32GB at 32GB
> >>>>> c) 512GB at 512GB
> >>>>> d) 8TB at 8TB
> >>>>> e) 128TB at 128TB
> >>>>>
> >>>>> (this is a nice rule of thumb if I understand it correctly :) )
> >>>>>
> >>>>> We should strive for device memory (maxram_size - ram_size) to fit
> >>>>> exactly into one of these slots (otherwise things get nasty).
> >>>>>
> >>>>> Depending on the ram_size, we might have simpler setups and can
> support
> >>>>> more configurations, no?
> >>>>>
> >>>>> E.g. ram_size <= 34GB, device_memory <= 512GB
> >>>>> -> move ram into a) and b)
> >>>>> -> move device memory into c)
> >>>>
> >>>> The issue is machvirt doesn't comply with that document.
> >>>> At the moment we have
> >>>> 0 -> 1GB MMIO
> >>>> 1GB -> 256GB RAM
> >>>> 256GB -> 512GB is theoretically reserved for IO but most is free.
> >>>> 512GB -> 1T is reserved for ECAM MMIO range. This is the top of our
> >>>> existing 40b GPA space.
> >>>>
> >>>> We don't want to change this address map due to legacy reasons.
> >>>>
> >>>
> >>> Thanks, good to know!
> >>>
> >>>> Another question! do you know if it would be possible to have
> >>>> device_memory region split into several discontinuous segments?
> >>>
> >>> It can be implemented for sure, but I would try to avoid that, as it
> >>> makes certain configurations impossible (and very end user unfriendly).
> >>>
> >>> E.g. (numbers completely made up, but it should show what I mean)
> >>>
> >>> -m 20G,maxmem=120G:
> >>> -> Try to add a DIMM with 100G -> error.
> >>> -> But we can add e.g. two DIMMs with 40G and 60G.
> >>>
> >>> This exposes internal details to the end user. And the end user has no
> >>> idea what is going on.
> >>>
> >>> So I think we should try our best to keep that area consecutive.
> >>
> >> Actually I didn't sufficiently detail the context. I would like
> >> 1) 1 segment to be exposed to the end-user through slot|maxmem stuff
> >> (what this series targets) and
> >> 2) another segment used to instantiate PC-DIMM for internal needs as
> >> replacement of part of the 1GB -> 256GB static RAM. This was the purpose
> >> of Shameer's original series
> >
> > I am not sure if PC-DIMMs are exactly what you want for internal purposes.
> >
> >>
> >> [1] [RFC v2 0/6] hw/arm: Add support for non-contiguous iova regions
> >> http://patchwork.ozlabs.org/cover/914694/
> >> This approach is not yet validated though.
> >>
> >> The rationale is sometimes you must have "holes" in RAM as some GPAs
> >> match reserved IOVAs for assigned devices.
> >
> > So if I understand it correctly, all you want is some memory region that
> > a) contains only initially defined memory
> > b) can have some holes in it
> >
> > This is exactly what x86 already does (pc_memory_init): Simply construct
> > your own memory region leaving holes in it.
> >
> >
> > memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
> >                          0, pcms->below_4g_mem_size);
> > memory_region_add_subregion(system_memory, 0, ram_below_4g);
> > ...
> > if (pcms->above_4g_mem_size > 0)
> >     memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> >     ...
> >     memory_region_add_subregion(system_memory, 0x100000000ULL,
> >     ...
> >
> > They "indicate" these different GPA areas using the e820 map to the guest.
> >
> > Would that also work for you?
> 
> I would tentatively say yes. Effectively I am not sure that if we were
> to actually put holes in the 1G-256GB RAM segment, PC-DIMM would be the
> natural choice. Also the reserved IOVA issue impacts the device_memory
> region area I think. I am skeptical about the fact we can put holes in
> static RAM and device_memory regions like that.

The first approach [1] we had to address the holes in memory used the
memory alias technique mentioned above. Based on Drew's review, the
pc-dimm way of handling it was introduced. I think the main argument was
that it would be useful when we eventually support hotplug. But since that
is added anyway as part of this series, I am not sure we gain any other
benefit from modeling it as pc-dimm. Maybe I am missing something here.

Thanks,
Shameer

[1]. https://lists.gnu.org/archive/html/qemu-arm/2018-04/msg00243.html


> Thanks!
> 
> Eric
> >
> >>
> >> Thanks
> >>
> >> Eric
> >>
> >>>
> >>>>
> >>>> Thanks
> >>>>
> >>>> Eric
> >>>
> >>>
> >
> >
Eric Auger July 5, 2018, 2:27 p.m. UTC | #10
Hi Shameer,

On 07/05/2018 03:19 PM, Shameerali Kolothum Thodi wrote:
> 
>> -----Original Message-----
>> From: Auger Eric [mailto:eric.auger@redhat.com]
>> Sent: 05 July 2018 13:18
>> To: David Hildenbrand <david@redhat.com>; eric.auger.pro@gmail.com;
>> qemu-devel@nongnu.org; qemu-arm@nongnu.org; peter.maydell@linaro.org;
>> Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
>> imammedo@redhat.com
>> Cc: wei@redhat.com; drjones@redhat.com; david@gibson.dropbear.id.au;
>> dgilbert@redhat.com; agraf@suse.de
>> Subject: Re: [Qemu-devel] [RFC v3 06/15] hw/arm/virt: Allocate
>> device_memory
>>
>> Hi David,
>>
>> On 07/05/2018 02:09 PM, David Hildenbrand wrote:
>>> On 05.07.2018 14:00, Auger Eric wrote:
>>>> Hi David,
>>>>
>>>> On 07/05/2018 01:54 PM, David Hildenbrand wrote:
>>>>> On 05.07.2018 13:42, Auger Eric wrote:
>>>>>> Hi David,
>>>>>>
>>>>>> On 07/04/2018 02:05 PM, David Hildenbrand wrote:
>>>>>>> On 03.07.2018 21:27, Auger Eric wrote:
>>>>>>>> Hi David,
>>>>>>>> On 07/03/2018 08:25 PM, David Hildenbrand wrote:
>>>>>>>>> On 03.07.2018 09:19, Eric Auger wrote:
>>>>>>>>>> We define a new hotpluggable RAM region (aka. device memory).
>>>>>>>>>> Its base is 2TB GPA. This obviously requires 42b IPA support
>>>>>>>>>> in KVM/ARM, FW and guest kernel. At the moment the device
>>>>>>>>>> memory region is max 2TB.
>>>>>>>>>
>>>>>>>>> Maybe a stupid question, but why exactly does it have to start at 2TB
>>>>>>>>> (and not e.g. at 1TB)?
>>>>>>>> not a stupid question. See tentative answer below.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> This is largely inspired of device memory initialization in
>>>>>>>>>> pc machine code.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>>>>>>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
>>>>>>>>>> ---
>>>>>>>>>>  hw/arm/virt.c         | 104
>> ++++++++++++++++++++++++++++++++++++--------------
>>>>>>>>>>  include/hw/arm/arm.h  |   2 +
>>>>>>>>>>  include/hw/arm/virt.h |   1 +
>>>>>>>>>>  3 files changed, 79 insertions(+), 28 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>>>>>>>> index 5a4d0bf..6fefb78 100644
>>>>>>>>>> --- a/hw/arm/virt.c
>>>>>>>>>> +++ b/hw/arm/virt.c
>>>>>>>>>> @@ -59,6 +59,7 @@
>>>>>>>>>>  #include "qapi/visitor.h"
>>>>>>>>>>  #include "standard-headers/linux/input.h"
>>>>>>>>>>  #include "hw/arm/smmuv3.h"
>>>>>>>>>> +#include "hw/acpi/acpi.h"
>>>>>>>>>>
>>>>>>>>>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
>>>>>>>>>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc,
>> \
>>>>>>>>>> @@ -94,34 +95,25 @@
>>>>>>>>>>
>>>>>>>>>>  #define PLATFORM_BUS_NUM_IRQS 64
>>>>>>>>>>
>>>>>>>>>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this
>> means
>>>>>>>>>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
>>>>>>>>>> - * address space unallocated and free for future use between 256G
>> and 512G.
>>>>>>>>>> - * If we need to provide more RAM to VMs in the future then we
>> need to:
>>>>>>>>>> - *  * allocate a second bank of RAM starting at 2TB and working up
>>>>>>>> I acknowledge this comment was the main justification. Now if you look
>> at
>>>>>>>>
>>>>>>>> Principles of ARM Memory Maps
>>>>>>>>
>> http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_princ
>> iples_of_arm_memory_maps.pdf
>>>>>>>> chapter 2.3 you will find that when adding PA bits, you always leave
>>>>>>>> space for reserved space and mapped IO.
>>>>>>>
>>>>>>> Thanks for the pointer!
>>>>>>>
>>>>>>> So ... we can fit
>>>>>>>
>>>>>>> a) 2GB at 2GB
>>>>>>> b) 32GB at 32GB
>>>>>>> c) 512GB at 512GB
>>>>>>> d) 8TB at 8TB
>>>>>>> e) 128TB at 128TB
>>>>>>>
>>>>>>> (this is a nice rule of thumb if I understand it correctly :) )
>>>>>>>
>>>>>>> We should strive for device memory (maxram_size - ram_size) to fit
>>>>>>> exactly into one of these slots (otherwise things get nasty).
>>>>>>>
>>>>>>> Depending on the ram_size, we might have simpler setups and can
>> support
>>>>>>> more configurations, no?
>>>>>>>
>>>>>>> E.g. ram_size <= 34GB, device_memory <= 512GB
>>>>>>> -> move ram into a) and b)
>>>>>>> -> move device memory into c)
>>>>>>
>>>>>> The issue is machvirt doesn't comply with that document.
>>>>>> At the moment we have
>>>>>> 0 -> 1GB MMIO
>>>>>> 1GB -> 256GB RAM
>>>>>> 256GB -> 512GB is theoretically reserved for IO but most is free.
>>>>>> 512GB -> 1T is reserved for ECAM MMIO range. This is the top of our
>>>>>> existing 40b GPA space.
>>>>>>
>>>>>> We don't want to change this address map due to legacy reasons.
>>>>>>
>>>>>
>>>>> Thanks, good to know!
>>>>>
>>>>>> Another question! do you know if it would be possible to have
>>>>>> device_memory region split into several discontinuous segments?
>>>>>
>>>>> It can be implemented for sure, but I would try to avoid that, as it
>>>>> makes certain configurations impossible (and very end user unfriendly).
>>>>>
>>>>> E.g. (numbers completely made up, but it should show what I mean)
>>>>>
>>>>> -m 20G,maxmem=120G:
>>>>> -> Try to add a DIMM with 100G -> error.
>>>>> -> But we can add e.g. two DIMMs with 40G and 60G.
>>>>>
>>>>> This exposes internal details to the end user. And the end user has no
>>>>> idea what is going on.
>>>>>
>>>>> So I think we should try our best to keep that area consecutive.
>>>>
>>>> Actually I didn't sufficiently detail the context. I would like
>>>> 1) 1 segment to be exposed to the end-user through slot|maxmem stuff
>>>> (what this series targets) and
>>>> 2) another segment used to instantiate PC-DIMM for internal needs as
>>>> replacement of part of the 1GB -> 256GB static RAM. This was the purpose
>>>> of Shameer's original series
>>>
>>> I am not sure if PC-DIMMs are exactly what you want for internal purposes.
>>>
>>>>
>>>> [1] [RFC v2 0/6] hw/arm: Add support for non-contiguous iova regions
>>>> http://patchwork.ozlabs.org/cover/914694/
>>>> This approach is not yet validated though.
>>>>
>>>> The rationale is sometimes you must have "holes" in RAM as some GPAs
>>>> match reserved IOVAs for assigned devices.
>>>
>>> So if I understand it correctly, all you want is some memory region that
>>> a) contains only initially defined memory
>>> b) can have some holes in it
>>>
>>> This is exactly what x86 already does (pc_memory_init): Simply construct
>>> your own memory region leaving holes in it.
>>>
>>>
>>> memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
>>>                          0, pcms->below_4g_mem_size);
>>> memory_region_add_subregion(system_memory, 0, ram_below_4g);
>>> ...
>>> if (pcms->above_4g_mem_size > 0)
>>>     memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
>>>     ...
>>>     memory_region_add_subregion(system_memory, 0x100000000ULL,
>>>     ...
>>>
>>> They "indicate" these different GPA areas using the e820 map to the guest.
>>>
>>> Would that also work for you?
>>
>> I would tentatively say yes. Effectively I am not sure that if we were
>> to actually put holes in the 1G-256GB RAM segment, PC-DIMM would be the
>> natural choice. Also the reserved IOVA issue impacts the device_memory
>> region area I think. I am skeptical about the fact we can put holes in
>> static RAM and device_memory regions like that.
> 
> The first approach[1] we had to address the holes in memory was using
> the memory alias way mentioned above.  And based on Drew's review, the
> pc-dimm way of handling was introduced. I think the main argument was that
> it will be useful when we eventually support hotplug.

That's my understanding too.

>  But since that is added
> anyway as part of this series, I am not sure we have any other benefit in
> modeling it as pc-dimm. May be I am missing something here.

I tentatively agree with you. I was trying to understand whether the
device_memory region also fit the original needs, but I think the
standard alias approach is better suited to hole creation.

Thanks

Eric
> 
> Thanks,
> Shameer
> 
> [1]. https://lists.gnu.org/archive/html/qemu-arm/2018-04/msg00243.html
> 
> 
>> Thanks!
>>
>> Eric
>>>
>>>>
>>>> Thanks
>>>>
>>>> Eric
>>>>
>>>>>
>>>>>>
>>>>>> Thanks
>>>>>>
>>>>>> Eric
>>>>>
>>>>>
>>>
>>>
Igor Mammedov July 11, 2018, 1:17 p.m. UTC | #11
On Thu, 5 Jul 2018 16:27:05 +0200
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Shameer,
> 
> On 07/05/2018 03:19 PM, Shameerali Kolothum Thodi wrote:
> >   
> >>
> >> Hi David,
> >>
> >> On 07/05/2018 02:09 PM, David Hildenbrand wrote:  
> >>> On 05.07.2018 14:00, Auger Eric wrote:  
> >>>> Hi David,
> >>>>
> >>>> On 07/05/2018 01:54 PM, David Hildenbrand wrote:  
> >>>>> On 05.07.2018 13:42, Auger Eric wrote:  
> >>>>>> Hi David,
> >>>>>>
> >>>>>> On 07/04/2018 02:05 PM, David Hildenbrand wrote:  
> >>>>>>> On 03.07.2018 21:27, Auger Eric wrote:  
> >>>>>>>> Hi David,
> >>>>>>>> On 07/03/2018 08:25 PM, David Hildenbrand wrote:  
> >>>>>>>>> On 03.07.2018 09:19, Eric Auger wrote:  
> >>>>>>>>>> We define a new hotpluggable RAM region (aka. device memory).
> >>>>>>>>>> Its base is 2TB GPA. This obviously requires 42b IPA support
> >>>>>>>>>> in KVM/ARM, FW and guest kernel. At the moment the device
> >>>>>>>>>> memory region is max 2TB.  
> >>>>>>>>>
> >>>>>>>>> Maybe a stupid question, but why exactly does it have to start at 2TB
> >>>>>>>>> (and not e.g. at 1TB)?  
> >>>>>>>> not a stupid question. See tentative answer below.  
> >>>>>>>>>  
> >>>>>>>>>>
> >>>>>>>>>> This is largely inspired of device memory initialization in
> >>>>>>>>>> pc machine code.
> >>>>>>>>>>
> >>>>>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> >>>>>>>>>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
> >>>>>>>>>> ---
> >>>>>>>>>>  hw/arm/virt.c         | 104  
> >> ++++++++++++++++++++++++++++++++++++--------------  
> >>>>>>>>>>  include/hw/arm/arm.h  |   2 +
> >>>>>>>>>>  include/hw/arm/virt.h |   1 +
> >>>>>>>>>>  3 files changed, 79 insertions(+), 28 deletions(-)
> >>>>>>>>>>
> >>>>>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> >>>>>>>>>> index 5a4d0bf..6fefb78 100644
> >>>>>>>>>> --- a/hw/arm/virt.c
> >>>>>>>>>> +++ b/hw/arm/virt.c
> >>>>>>>>>> @@ -59,6 +59,7 @@
> >>>>>>>>>>  #include "qapi/visitor.h"
> >>>>>>>>>>  #include "standard-headers/linux/input.h"
> >>>>>>>>>>  #include "hw/arm/smmuv3.h"
> >>>>>>>>>> +#include "hw/acpi/acpi.h"
> >>>>>>>>>>
> >>>>>>>>>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
> >>>>>>>>>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc,  
> >> \  
> >>>>>>>>>> @@ -94,34 +95,25 @@
> >>>>>>>>>>
> >>>>>>>>>>  #define PLATFORM_BUS_NUM_IRQS 64
> >>>>>>>>>>
> >>>>>>>>>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this  
> >> means  
> >>>>>>>>>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
> >>>>>>>>>> - * address space unallocated and free for future use between 256G  
> >> and 512G.  
> >>>>>>>>>> - * If we need to provide more RAM to VMs in the future then we  
> >> need to:  
> >>>>>>>>>> - *  * allocate a second bank of RAM starting at 2TB and working up  
> >>>>>>>> I acknowledge this comment was the main justification. Now if you look  
> >> at  
> >>>>>>>>
> >>>>>>>> Principles of ARM Memory Maps
> >>>>>>>>  
> >> http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_princ
> >> iples_of_arm_memory_maps.pdf  
> >>>>>>>> chapter 2.3 you will find that when adding PA bits, you always leave
> >>>>>>>> space for reserved space and mapped IO.  
> >>>>>>>
> >>>>>>> Thanks for the pointer!
> >>>>>>>
> >>>>>>> So ... we can fit
> >>>>>>>
> >>>>>>> a) 2GB at 2GB
> >>>>>>> b) 32GB at 32GB
> >>>>>>> c) 512GB at 512GB
> >>>>>>> d) 8TB at 8TB
> >>>>>>> e) 128TB at 128TB
> >>>>>>>
> >>>>>>> (this is a nice rule of thumb if I understand it correctly :) )
> >>>>>>>
> >>>>>>> We should strive for device memory (maxram_size - ram_size) to fit
> >>>>>>> exactly into one of these slots (otherwise things get nasty).
> >>>>>>>
> >>>>>>> Depending on the ram_size, we might have simpler setups and can  
> >> support  
> >>>>>>> more configurations, no?
> >>>>>>>
> >>>>>>> E.g. ram_size <= 34GB, device_memory <= 512GB  
> >>>>>>> -> move ram into a) and b)
> >>>>>>> -> move device memory into c)  
> >>>>>>
> >>>>>> The issue is machvirt doesn't comply with that document.
> >>>>>> At the moment we have
> >>>>>> 0 -> 1GB MMIO
> >>>>>> 1GB -> 256GB RAM
> >>>>>> 256GB -> 512GB is theoretically reserved for IO but most is free.
> >>>>>> 512GB -> 1T is reserved for ECAM MMIO range. This is the top of our
> >>>>>> existing 40b GPA space.
> >>>>>>
> >>>>>> We don't want to change this address map due to legacy reasons.
> >>>>>>  
> >>>>>
> >>>>> Thanks, good to know!
> >>>>>  
> >>>>>> Another question! do you know if it would be possible to have
> >>>>>> device_memory region split into several discontinuous segments?  
> >>>>>
> >>>>> It can be implemented for sure, but I would try to avoid that, as it
> >>>>> makes certain configurations impossible (and very end user unfriendly).
> >>>>>
> >>>>> E.g. (numbers completely made up, but it should show what I mean)
> >>>>>
> >>>>> -m 20G,maxmem=120G:  
> >>>>> -> Try to add a DIMM with 100G -> error.
> >>>>> -> But we can add e.g. two DIMMs with 40G and 60G.  
> >>>>>
> >>>>> This exposes internal details to the end user. And the end user has no
> >>>>> idea what is going on.
> >>>>>
> >>>>> So I think we should try our best to keep that area consecutive.  
> >>>>
> >>>> Actually I didn't sufficiently detail the context. I would like
> >>>> 1) 1 segment to be exposed to the end-user through slot|maxmem stuff
> >>>> (what this series targets) and
> >>>> 2) another segment used to instantiate PC-DIMM for internal needs as
> >>>> replacement of part of the 1GB -> 256GB static RAM. This was the purpose
> >>>> of Shameer's original series  
> >>>
> >>> I am not sure if PC-DIMMs are exactly what you want for internal purposes.
> >>>  
> >>>>
> >>>> [1] [RFC v2 0/6] hw/arm: Add support for non-contiguous iova regions
> >>>> http://patchwork.ozlabs.org/cover/914694/
> >>>> This approach is not yet validated though.
> >>>>
> >>>> The rationale is sometimes you must have "holes" in RAM as some GPAs
> >>>> match reserved IOVAs for assigned devices.  
> >>>
> >>> So if I understand it correctly, all you want is some memory region that
> >>> a) contains only initially defined memory
> >>> b) can have some holes in it
> >>>
> >>> This is exactly what x86 already does (pc_memory_init): Simply construct
> >>> your own memory region leaving holes in it.
> >>>
> >>>
> >>> memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
> >>>                          0, pcms->below_4g_mem_size);
> >>> memory_region_add_subregion(system_memory, 0, ram_below_4g);
> >>> ...
> >>> if (pcms->above_4g_mem_size > 0)
> >>>     memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> >>>     ...
> >>>     memory_region_add_subregion(system_memory, 0x100000000ULL,
> >>>     ...
> >>>
> >>> They "indicate" these different GPA areas using the e820 map to the guest.
> >>>
> >>> Would that also work for you?  
> >>
> >> I would tentatively say yes. Effectively I am not sure that if we were
> >> to actually put holes in the 1G-256GB RAM segment, PC-DIMM would be the
> >> natural choice. Also the reserved IOVA issue impacts the device_memory
> >> region area I think. I am skeptical about the fact we can put holes in
> >> static RAM and device_memory regions like that.
Could we just use a single device_memory region for both initial and
hotpluggable RAM if we make the base RAM address dynamic?
In that case RAM could start wherever there is free space for maxmem
(if there is free space in lowmem, put device_memory there, otherwise put
it somewhere in highmem) and we would not care whether a reserved IOVA is
in the way or not.

*I don't have a clue about iommus, so here goes a stupid question*
I agree with Peter that the whole IOVA thing looks broken when the host
layout dictates the guest's one.
Shouldn't there be an iommu somewhere that remaps the host map into a
guest-specific one? (so the guest would model the board we need and be
migratable instead of mimicking the host hw)


> > The first approach[1] we had to address the holes in memory was using
> > the memory alias way mentioned above.  And based on Drew's review, the
> > pc-dimm way of handling was introduced. I think the main argument was that
> > it will be useful when we eventually support hotplug.  
> 
> That's my understanding too.
not only hotplug:

  a RAM memory region that is split by aliases is difficult to handle,
  as it creates a nonlinear GPA<->HVA mapping instead of the 1:1 mapping
  of a pc-dimm. So if one needs to build an HVA<->GPA map for a given
  MemoryRegion, with aliases one has to fetch the list of MemorySections
  that belong to it and build the map from that, versus a simple
  (addr + offset) in the 1:1 case (see the sketch below);

  aliases also mean complicated machine-specific SRAT/e820 code due to
  the holes /grep 'the memory map is a bit tricky'/
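
To illustrate the first point, a toy comparison (not QEMU code; the
Section struct below merely stands in for the MemorySection list one
would have to walk):

#include <stddef.h>
#include <stdint.h>

static void *gpa_to_hva_1to1(void *hva_base, uint64_t gpa_base, uint64_t gpa)
{
    /* one linear pc-dimm style mapping: hva = base + (gpa - region start) */
    return (char *)hva_base + (gpa - gpa_base);
}

typedef struct {
    uint64_t gpa_start;   /* guest address where this alias is mapped */
    uint64_t ram_offset;  /* offset of the alias into the backing RAM block */
    uint64_t size;
} Section;

static void *gpa_to_hva_aliased(void *hva_base, const Section *secs, int n,
                                uint64_t gpa)
{
    /* alias-split region: walk every section carved out of the RAM block */
    for (int i = 0; i < n; i++) {
        if (gpa >= secs[i].gpa_start &&
            gpa < secs[i].gpa_start + secs[i].size) {
            return (char *)hva_base + secs[i].ram_offset +
                   (gpa - secs[i].gpa_start);
        }
    }
    return NULL; /* the gpa falls into one of the holes */
}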

> >  But since that is added
> > anyway as part of this series, I am not sure we have any other benefit in
> > modeling it as pc-dimm. May be I am missing something here.  
> 
> I tentatively agree with you. I was trying to understand if the
> device_memory region was fitting the original needs too but I think
> standard alias approach is more adapted to hole creation.
Aliases are an easy way to start with, but as compat knobs grow
(based on the PC experience, grep 'Calculate ram split')
it becomes quite a pain to maintain a manually, implicitly laid-out set
of aliases without breaking it by accident.
We probably won't be able to get rid of aliases on PC for legacy
reasons, but why introduce the same pain to the virt board?

Well, a magical conversion from -m X to 2..y memory regions (aliases or not)
isn't going to be easy in either case, especially once "-numa memdev|mem"
is taken into account.
I'd rather use a single pc-dimm approach for both initial and hotpluggable
RAM, and then use the device_memory framework to enumerate RAM wherever
needed (ACPI/DT) in a uniform way.
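
As a rough sketch of that direction (not the code from this series; ms is
the MachineState, and device_memory_base and sysmem are placeholders for a
base chosen at runtime and the system memory region):

/* sketch only: one container covering boot + hotpluggable RAM, sized
 * from maxram_size and placed at a dynamically chosen base */
MemoryRegion *device_mem = g_new(MemoryRegion, 1);

memory_region_init(device_mem, OBJECT(ms), "device-memory",
                   ms->maxram_size);
memory_region_add_subregion(sysmem, device_memory_base, device_mem);

/* the boot RAM would then be realized as a pc-dimm plugged at offset 0
 * of this container, with hotplugged dimms stacking above it */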


> Thanks
> 
> Eric
> > 
> > Thanks,
> > Shameer
> > 
> > [1]. https://lists.gnu.org/archive/html/qemu-arm/2018-04/msg00243.html
> > 
> >   
> >> Thanks!
> >>
> >> Eric  
> >>>  
> >>>>
> >>>> Thanks
> >>>>
> >>>> Eric
> >>>>  
> >>>>>  
> >>>>>>
> >>>>>> Thanks
> >>>>>>
> >>>>>> Eric  
> >>>>>
> >>>>>  
> >>>
> >>>
Eric Auger July 12, 2018, 2:22 p.m. UTC | #12
Hi Igor,

On 07/11/2018 03:17 PM, Igor Mammedov wrote:
> On Thu, 5 Jul 2018 16:27:05 +0200
> Auger Eric <eric.auger@redhat.com> wrote:
> 
>> Hi Shameer,
>>
>> On 07/05/2018 03:19 PM, Shameerali Kolothum Thodi wrote:
>>>   
>>>>
>>>> Hi David,
>>>>
>>>> On 07/05/2018 02:09 PM, David Hildenbrand wrote:  
>>>>> On 05.07.2018 14:00, Auger Eric wrote:  
>>>>>> Hi David,
>>>>>>
>>>>>> On 07/05/2018 01:54 PM, David Hildenbrand wrote:  
>>>>>>> On 05.07.2018 13:42, Auger Eric wrote:  
>>>>>>>> Hi David,
>>>>>>>>
>>>>>>>> On 07/04/2018 02:05 PM, David Hildenbrand wrote:  
>>>>>>>>> On 03.07.2018 21:27, Auger Eric wrote:  
>>>>>>>>>> Hi David,
>>>>>>>>>> On 07/03/2018 08:25 PM, David Hildenbrand wrote:  
>>>>>>>>>>> On 03.07.2018 09:19, Eric Auger wrote:  
>>>>>>>>>>>> We define a new hotpluggable RAM region (aka. device memory).
>>>>>>>>>>>> Its base is 2TB GPA. This obviously requires 42b IPA support
>>>>>>>>>>>> in KVM/ARM, FW and guest kernel. At the moment the device
>>>>>>>>>>>> memory region is max 2TB.  
>>>>>>>>>>>
>>>>>>>>>>> Maybe a stupid question, but why exactly does it have to start at 2TB
>>>>>>>>>>> (and not e.g. at 1TB)?  
>>>>>>>>>> not a stupid question. See tentative answer below.  
>>>>>>>>>>>  
>>>>>>>>>>>>
>>>>>>>>>>>> This is largely inspired of device memory initialization in
>>>>>>>>>>>> pc machine code.
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>>>>>>>>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
>>>>>>>>>>>> ---
>>>>>>>>>>>>  hw/arm/virt.c         | 104  
>>>> ++++++++++++++++++++++++++++++++++++--------------  
>>>>>>>>>>>>  include/hw/arm/arm.h  |   2 +
>>>>>>>>>>>>  include/hw/arm/virt.h |   1 +
>>>>>>>>>>>>  3 files changed, 79 insertions(+), 28 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>>>>>>>>>> index 5a4d0bf..6fefb78 100644
>>>>>>>>>>>> --- a/hw/arm/virt.c
>>>>>>>>>>>> +++ b/hw/arm/virt.c
>>>>>>>>>>>> @@ -59,6 +59,7 @@
>>>>>>>>>>>>  #include "qapi/visitor.h"
>>>>>>>>>>>>  #include "standard-headers/linux/input.h"
>>>>>>>>>>>>  #include "hw/arm/smmuv3.h"
>>>>>>>>>>>> +#include "hw/acpi/acpi.h"
>>>>>>>>>>>>
>>>>>>>>>>>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
>>>>>>>>>>>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc,  
>>>> \  
>>>>>>>>>>>> @@ -94,34 +95,25 @@
>>>>>>>>>>>>
>>>>>>>>>>>>  #define PLATFORM_BUS_NUM_IRQS 64
>>>>>>>>>>>>
>>>>>>>>>>>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this  
>>>> means  
>>>>>>>>>>>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
>>>>>>>>>>>> - * address space unallocated and free for future use between 256G  
>>>> and 512G.  
>>>>>>>>>>>> - * If we need to provide more RAM to VMs in the future then we  
>>>> need to:  
>>>>>>>>>>>> - *  * allocate a second bank of RAM starting at 2TB and working up  
>>>>>>>>>> I acknowledge this comment was the main justification. Now if you look  
>>>> at  
>>>>>>>>>>
>>>>>>>>>> Principles of ARM Memory Maps
>>>>>>>>>>  
>>>> http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_princ
>>>> iples_of_arm_memory_maps.pdf  
>>>>>>>>>> chapter 2.3 you will find that when adding PA bits, you always leave
>>>>>>>>>> space for reserved space and mapped IO.  
>>>>>>>>>
>>>>>>>>> Thanks for the pointer!
>>>>>>>>>
>>>>>>>>> So ... we can fit
>>>>>>>>>
>>>>>>>>> a) 2GB at 2GB
>>>>>>>>> b) 32GB at 32GB
>>>>>>>>> c) 512GB at 512GB
>>>>>>>>> d) 8TB at 8TB
>>>>>>>>> e) 128TB at 128TB
>>>>>>>>>
>>>>>>>>> (this is a nice rule of thumb if I understand it correctly :) )
>>>>>>>>>
>>>>>>>>> We should strive for device memory (maxram_size - ram_size) to fit
>>>>>>>>> exactly into one of these slots (otherwise things get nasty).
>>>>>>>>>
>>>>>>>>> Depending on the ram_size, we might have simpler setups and can  
>>>> support  
>>>>>>>>> more configurations, no?
>>>>>>>>>
>>>>>>>>> E.g. ram_size <= 34GB, device_memory <= 512GB  
>>>>>>>>> -> move ram into a) and b)
>>>>>>>>> -> move device memory into c)  
>>>>>>>>
>>>>>>>> The issue is machvirt doesn't comply with that document.
>>>>>>>> At the moment we have
>>>>>>>> 0 -> 1GB MMIO
>>>>>>>> 1GB -> 256GB RAM
>>>>>>>> 256GB -> 512GB is theoretically reserved for IO but most is free.
>>>>>>>> 512GB -> 1T is reserved for ECAM MMIO range. This is the top of our
>>>>>>>> existing 40b GPA space.
>>>>>>>>
>>>>>>>> We don't want to change this address map due to legacy reasons.
>>>>>>>>  
>>>>>>>
>>>>>>> Thanks, good to know!
>>>>>>>  
>>>>>>>> Another question! do you know if it would be possible to have
>>>>>>>> device_memory region split into several discontinuous segments?  
>>>>>>>
>>>>>>> It can be implemented for sure, but I would try to avoid that, as it
>>>>>>> makes certain configurations impossible (and very end user unfriendly).
>>>>>>>
>>>>>>> E.g. (numbers completely made up, but it should show what I mean)
>>>>>>>
>>>>>>> -m 20G,maxmem=120G:  
>>>>>>> -> Try to add a DIMM with 100G -> error.
>>>>>>> -> But we can add e.g. two DIMMs with 40G and 60G.  
>>>>>>>
>>>>>>> This exposes internal details to the end user. And the end user has no
>>>>>>> idea what is going on.
>>>>>>>
>>>>>>> So I think we should try our best to keep that area consecutive.  
>>>>>>
>>>>>> Actually I didn't sufficiently detail the context. I would like
>>>>>> 1) 1 segment to be exposed to the end-user through slot|maxmem stuff
>>>>>> (what this series targets) and
>>>>>> 2) another segment used to instantiate PC-DIMM for internal needs as
>>>>>> replacement of part of the 1GB -> 256GB static RAM. This was the purpose
>>>>>> of Shameer's original series  
>>>>>
>>>>> I am not sure if PC-DIMMs are exactly what you want for internal purposes.
>>>>>  
>>>>>>
>>>>>> [1] [RFC v2 0/6] hw/arm: Add support for non-contiguous iova regions
>>>>>> http://patchwork.ozlabs.org/cover/914694/
>>>>>> This approach is not yet validated though.
>>>>>>
>>>>>> The rationale is sometimes you must have "holes" in RAM as some GPAs
>>>>>> match reserved IOVAs for assigned devices.  
>>>>>
>>>>> So if I understand it correctly, all you want is some memory region that
>>>>> a) contains only initially defined memory
>>>>> b) can have some holes in it
>>>>>
>>>>> This is exactly what x86 already does (pc_memory_init): Simply construct
>>>>> your own memory region leaving holes in it.
>>>>>
>>>>>
>>>>> memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
>>>>>                          0, pcms->below_4g_mem_size);
>>>>> memory_region_add_subregion(system_memory, 0, ram_below_4g);
>>>>> ...
>>>>> if (pcms->above_4g_mem_size > 0)
>>>>>     memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
>>>>>     ...
>>>>>     memory_region_add_subregion(system_memory, 0x100000000ULL,
>>>>>     ...
>>>>>
>>>>> They "indicate" these different GPA areas using the e820 map to the guest.
>>>>>
>>>>> Would that also work for you?  
>>>>
>>>> I would tentatively say yes. Effectively I am not sure that if we were
>>>> to actually put holes in the 1G-256GB RAM segment, PC-DIMM would be the
>>>> natural choice. Also the reserved IOVA issue impacts the device_memory
>>>> region area I think. I am skeptical about the fact we can put holes in
>>>> static RAM and device_memory regions like that.
> Could we just use a single device_memory region for both initial+hotpluggable
> RAM if we make base RAM address dynamic?
This assumes the FW supports a dynamic RAM base. If I understand correctly
this is not the case today. There is also the problem of migration: how
would you migrate between guests whose RAM is not laid out at the same
place? I understood hotplug memory relied on a dedicated device_memory
region. So do you mean we would have 2 contiguous regions?
> In this case RAM could start wherever there is a free space for maxmem
> (if there is free space in lowmem, put device_memory there, otherwise put
> it somewhere in high mem) and we won't care if there is IOVA or not.
> 
> *I don't have a clue about iommus, so here goes a stupid question*
> I agree with Peter that whole IOVA thing looks broken, when host
> layout dictates the guest's one.
> Shouldn't be there somewhere an iommu that would remap host map into
> guest specific one? (so guest would model board we need and be migrate-able
> instead of mimicking host hw)
The issue is related to the IOVAs programmed by the guest into the
assigned host devices. DMA requests issued by the assigned devices using
those IOVAs are supposed to reach guest RAM. But due to the host
topology they won't (host PCI host bridge windows or MSI reserved
regions). Adding a vIOMMU on the guest side does allow IOVAs != GPAs,
but the guest is exposed to that change and the extra translation stage
adds a huge performance penalty. And eventually the IOVA allocator used
in the guest should theoretically be aware of the host reserved regions
as well.
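
For context, those host reserved regions are visible to user space through
the iommu_groups sysfs ABI ("start end type" per line); a small stand-alone
example, where the group number 0 is just a placeholder:

#include <stdio.h>

int main(void)
{
    /* illustration only: list the reserved regions of iommu group 0;
     * guest RAM placed over any of these ranges would not be reachable
     * by DMA from a device assigned out of that group */
    FILE *f = fopen("/sys/kernel/iommu_groups/0/reserved_regions", "r");
    unsigned long long start, end;
    char type[32];

    if (!f) {
        perror("reserved_regions");
        return 1;
    }
    while (fscanf(f, "%llx %llx %31s", &start, &end, type) == 3) {
        printf("reserved: 0x%llx - 0x%llx (%s)\n", start, end, type);
    }
    fclose(f);
    return 0;
}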

> 
> 
>>> The first approach[1] we had to address the holes in memory was using
>>> the memory alias way mentioned above.  And based on Drew's review, the
>>> pc-dimm way of handling was introduced. I think the main argument was that
>>> it will be useful when we eventually support hotplug.  
>>
>> That's my understanding too.
> not only hotplug,
> 
>   a RAM memory region that's split by aliases is difficult to handle
>   as it creates nonlinear GPA<->HVA mapping instead of
>   1:1 mapping of pc-dimm,
>   so if one needs to build HVA<->GPA map for a given MemoryRegion
>   in case of aliases one would have to get list of MemorySections
>   that belong to it and build map from that vs (addr + offset) in
>   case of simple 1:1 mapping.
> 
>   complicated machine specific SRAT/e820 code due to holes
>   /grep 'the memory map is a bit tricky'/
> 
>>>  But since that is added
>>> anyway as part of this series, I am not sure we have any other benefit in
>>> modeling it as pc-dimm. May be I am missing something here.  
>>
>> I tentatively agree with you. I was trying to understand if the
>> device_memory region was fitting the original needs too but I think
>> standard alias approach is more adapted to hole creation.
> Aliases are easy way to start with, but as compat knobs grow
> (based on PC experience,  grep 'Calculate ram split')
> It's quite a pain to maintain manual implicit aliases layout
> without breaking it by accident.
> We probably won't be able to get rid of aliases on PC for legacy
> reasons but why introduce the same pain to virt board.
> 
> Well, magical conversion from -m X to 2..y memory regions (aliases or not)
> aren't going to be easy in both cases, especially if one would take into
> account "-numa memdev|mem".
> I'd rather use a single pc-dimm approach for both /initial and hotpluggble RAM/
> and then use device_memory framework to enumerate RAM wherever needed (ACPI/DT)
> in inform way.
We have 2 problems:
- supporting more RAM. This can be achieved by adding a single memory
region backed by DIMMs;
- managing the IOVA reserved regions. I don't think we have a consensus on
the solution at the moment. What about migration between 2 guests having a
different memory topology?

Thanks

Eric
> 
> 
>> Thanks
>>
>> Eric
>>>
>>> Thanks,
>>> Shameer
>>>
>>> [1]. https://lists.gnu.org/archive/html/qemu-arm/2018-04/msg00243.html
>>>
>>>   
>>>> Thanks!
>>>>
>>>> Eric  
>>>>>  
>>>>>>
>>>>>> Thanks
>>>>>>
>>>>>> Eric
>>>>>>  
>>>>>>>  
>>>>>>>>
>>>>>>>> Thanks
>>>>>>>>
>>>>>>>> Eric  
>>>>>>>
>>>>>>>  
>>>>>
>>>>>  
> 
>
Andrew Jones July 12, 2018, 2:45 p.m. UTC | #13
On Thu, Jul 12, 2018 at 04:22:05PM +0200, Auger Eric wrote:
> Hi Igor,
> 
> On 07/11/2018 03:17 PM, Igor Mammedov wrote:
> > On Thu, 5 Jul 2018 16:27:05 +0200
> > Auger Eric <eric.auger@redhat.com> wrote:
> > 
> >> Hi Shameer,
> >>
> >> On 07/05/2018 03:19 PM, Shameerali Kolothum Thodi wrote:
> >>>   
> >>>>
> >>>> Hi David,
> >>>>
> >>>> On 07/05/2018 02:09 PM, David Hildenbrand wrote:  
> >>>>> On 05.07.2018 14:00, Auger Eric wrote:  
> >>>>>> Hi David,
> >>>>>>
> >>>>>> On 07/05/2018 01:54 PM, David Hildenbrand wrote:  
> >>>>>>> On 05.07.2018 13:42, Auger Eric wrote:  
> >>>>>>>> Hi David,
> >>>>>>>>
> >>>>>>>> On 07/04/2018 02:05 PM, David Hildenbrand wrote:  
> >>>>>>>>> On 03.07.2018 21:27, Auger Eric wrote:  
> >>>>>>>>>> Hi David,
> >>>>>>>>>> On 07/03/2018 08:25 PM, David Hildenbrand wrote:  
> >>>>>>>>>>> On 03.07.2018 09:19, Eric Auger wrote:  
> >>>>>>>>>>>> We define a new hotpluggable RAM region (aka. device memory).
> >>>>>>>>>>>> Its base is 2TB GPA. This obviously requires 42b IPA support
> >>>>>>>>>>>> in KVM/ARM, FW and guest kernel. At the moment the device
> >>>>>>>>>>>> memory region is max 2TB.  
> >>>>>>>>>>>
> >>>>>>>>>>> Maybe a stupid question, but why exactly does it have to start at 2TB
> >>>>>>>>>>> (and not e.g. at 1TB)?  
> >>>>>>>>>> not a stupid question. See tentative answer below.  
> >>>>>>>>>>>  
> >>>>>>>>>>>>
> >>>>>>>>>>>> This is largely inspired of device memory initialization in
> >>>>>>>>>>>> pc machine code.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> >>>>>>>>>>>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
> >>>>>>>>>>>> ---
> >>>>>>>>>>>>  hw/arm/virt.c         | 104  
> >>>> ++++++++++++++++++++++++++++++++++++--------------  
> >>>>>>>>>>>>  include/hw/arm/arm.h  |   2 +
> >>>>>>>>>>>>  include/hw/arm/virt.h |   1 +
> >>>>>>>>>>>>  3 files changed, 79 insertions(+), 28 deletions(-)
> >>>>>>>>>>>>
> >>>>>>>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> >>>>>>>>>>>> index 5a4d0bf..6fefb78 100644
> >>>>>>>>>>>> --- a/hw/arm/virt.c
> >>>>>>>>>>>> +++ b/hw/arm/virt.c
> >>>>>>>>>>>> @@ -59,6 +59,7 @@
> >>>>>>>>>>>>  #include "qapi/visitor.h"
> >>>>>>>>>>>>  #include "standard-headers/linux/input.h"
> >>>>>>>>>>>>  #include "hw/arm/smmuv3.h"
> >>>>>>>>>>>> +#include "hw/acpi/acpi.h"
> >>>>>>>>>>>>
> >>>>>>>>>>>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
> >>>>>>>>>>>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc,  
> >>>> \  
> >>>>>>>>>>>> @@ -94,34 +95,25 @@
> >>>>>>>>>>>>
> >>>>>>>>>>>>  #define PLATFORM_BUS_NUM_IRQS 64
> >>>>>>>>>>>>
> >>>>>>>>>>>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this  
> >>>> means  
> >>>>>>>>>>>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
> >>>>>>>>>>>> - * address space unallocated and free for future use between 256G  
> >>>> and 512G.  
> >>>>>>>>>>>> - * If we need to provide more RAM to VMs in the future then we  
> >>>> need to:  
> >>>>>>>>>>>> - *  * allocate a second bank of RAM starting at 2TB and working up  
> >>>>>>>>>> I acknowledge this comment was the main justification. Now if you look  
> >>>> at  
> >>>>>>>>>>
> >>>>>>>>>> Principles of ARM Memory Maps
> >>>>>>>>>>  
> >>>> http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_princ
> >>>> iples_of_arm_memory_maps.pdf  
> >>>>>>>>>> chapter 2.3 you will find that when adding PA bits, you always leave
> >>>>>>>>>> space for reserved space and mapped IO.  
> >>>>>>>>>
> >>>>>>>>> Thanks for the pointer!
> >>>>>>>>>
> >>>>>>>>> So ... we can fit
> >>>>>>>>>
> >>>>>>>>> a) 2GB at 2GB
> >>>>>>>>> b) 32GB at 32GB
> >>>>>>>>> c) 512GB at 512GB
> >>>>>>>>> d) 8TB at 8TB
> >>>>>>>>> e) 128TB at 128TB
> >>>>>>>>>
> >>>>>>>>> (this is a nice rule of thumb if I understand it correctly :) )
> >>>>>>>>>
> >>>>>>>>> We should strive for device memory (maxram_size - ram_size) to fit
> >>>>>>>>> exactly into one of these slots (otherwise things get nasty).
> >>>>>>>>>
> >>>>>>>>> Depending on the ram_size, we might have simpler setups and can  
> >>>> support  
> >>>>>>>>> more configurations, no?
> >>>>>>>>>
> >>>>>>>>> E.g. ram_size <= 34GB, device_memory <= 512GB  
> >>>>>>>>> -> move ram into a) and b)
> >>>>>>>>> -> move device memory into c)  
> >>>>>>>>
> >>>>>>>> The issue is machvirt doesn't comply with that document.
> >>>>>>>> At the moment we have
> >>>>>>>> 0 -> 1GB MMIO
> >>>>>>>> 1GB -> 256GB RAM
> >>>>>>>> 256GB -> 512GB is theoretically reserved for IO but most is free.
> >>>>>>>> 512GB -> 1T is reserved for ECAM MMIO range. This is the top of our
> >>>>>>>> existing 40b GPA space.
> >>>>>>>>
> >>>>>>>> We don't want to change this address map due to legacy reasons.
> >>>>>>>>  
> >>>>>>>
> >>>>>>> Thanks, good to know!
> >>>>>>>  
> >>>>>>>> Another question! do you know if it would be possible to have
> >>>>>>>> device_memory region split into several discontinuous segments?  
> >>>>>>>
> >>>>>>> It can be implemented for sure, but I would try to avoid that, as it
> >>>>>>> makes certain configurations impossible (and very end user unfriendly).
> >>>>>>>
> >>>>>>> E.g. (numbers completely made up, but it should show what I mean)
> >>>>>>>
> >>>>>>> -m 20G,maxmem=120G:  
> >>>>>>> -> Try to add a DIMM with 100G -> error.
> >>>>>>> -> But we can add e.g. two DIMMs with 40G and 60G.  
> >>>>>>>
> >>>>>>> This exposes internal details to the end user. And the end user has no
> >>>>>>> idea what is going on.
> >>>>>>>
> >>>>>>> So I think we should try our best to keep that area consecutive.  
> >>>>>>
> >>>>>> Actually I didn't sufficiently detail the context. I would like
> >>>>>> 1) 1 segment to be exposed to the end-user through slot|maxmem stuff
> >>>>>> (what this series targets) and
> >>>>>> 2) another segment used to instantiate PC-DIMM for internal needs as
> >>>>>> replacement of part of the 1GB -> 256GB static RAM. This was the purpose
> >>>>>> of Shameer's original series  
> >>>>>
> >>>>> I am not sure if PC-DIMMs are exactly what you want for internal purposes.
> >>>>>  
> >>>>>>
> >>>>>> [1] [RFC v2 0/6] hw/arm: Add support for non-contiguous iova regions
> >>>>>> http://patchwork.ozlabs.org/cover/914694/
> >>>>>> This approach is not yet validated though.
> >>>>>>
> >>>>>> The rationale is sometimes you must have "holes" in RAM as some GPAs
> >>>>>> match reserved IOVAs for assigned devices.  
> >>>>>
> >>>>> So if I understand it correctly, all you want is some memory region that
> >>>>> a) contains only initially defined memory
> >>>>> b) can have some holes in it
> >>>>>
> >>>>> This is exactly what x86 already does (pc_memory_init): Simply construct
> >>>>> your own memory region leaving holes in it.
> >>>>>
> >>>>>
> >>>>> memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
> >>>>>                          0, pcms->below_4g_mem_size);
> >>>>> memory_region_add_subregion(system_memory, 0, ram_below_4g);
> >>>>> ...
> >>>>> if (pcms->above_4g_mem_size > 0)
> >>>>>     memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> >>>>>     ...
> >>>>>     memory_region_add_subregion(system_memory, 0x100000000ULL,
> >>>>>     ...
> >>>>>
> >>>>> They "indicate" these different GPA areas using the e820 map to the guest.
> >>>>>
> >>>>> Would that also work for you?  
> >>>>
> >>>> I would tentatively say yes. Effectively I am not sure that if we were
> >>>> to actually put holes in the 1G-256GB RAM segment, PC-DIMM would be the
> >>>> natural choice. Also the reserved IOVA issue impacts the device_memory
> >>>> region area I think. I am skeptical about the fact we can put holes in
> >>>> static RAM and device_memory regions like that.
> > Could we just use a single device_memory region for both initial+hotpluggable
> > RAM if we make base RAM address dynamic?
> This assumes FW does support dynamic RAM base. If I understand correctly
> this is not the case. 

It's not currently the case, but I've added prototyping this near the top
of my TODO. So stay tuned.

> Also there is the problematic of migration. How
> would you migrate between guests whose RAM is not laid out at the same
> place?

I'm not sure what you mean here. Boot a guest with a new memory map,
probably by explicitly asking for it with a new machine property,
which means a new virt machine version. Then migrate at will to any
host that supports that machine type.

> I understood hotplug memory relied on a specific device_memory
> region. So do you mean we would have 2 contiguous regions?

I think Igor wants one contiguous region for RAM, where additional
space can be reserved for hotplugging.

> > In this case RAM could start wherever there is a free space for maxmem
> > (if there is free space in lowmem, put device_memory there, otherwise put
> > it somewhere in high mem) and we won't care if there is IOVA or not.
> > 
> > *I don't have a clue about iommus, so here goes a stupid question*
> > I agree with Peter that whole IOVA thing looks broken, when host
> > layout dictates the guest's one.
> > Shouldn't be there somewhere an iommu that would remap host map into
> > guest specific one? (so guest would model board we need and be migrate-able
> > instead of mimicking host hw)
> The issue is related to IOVAs programmed by the guest in the host
> assigned devices. DMA requests issued by the assigned devices using
> those IOVAs are supposed to reach the guest RAM. But due to the host
> topology they won't (host PCI host bridge windows or MSI reserved
> regions). Adding a vIOMMU on guest side effectively allows to use IOVAs
> != GPAs but guest is exposed to that change. Adding this extra
> translation stage adds a huge performance penalty. And eventually the
> IOVA allocator used in the guest should theoretically be aware of host
> reserved regions as well.
> 
> > 
> > 
> >>> The first approach[1] we had to address the holes in memory was using
> >>> the memory alias way mentioned above.  And based on Drew's review, the
> >>> pc-dimm way of handling was introduced. I think the main argument was that
> >>> it will be useful when we eventually support hotplug.  
> >>
> >> That's my understanding too.
> > not only hotplug,
> > 
> >   a RAM memory region that's split by aliases is difficult to handle
> >   as it creates nonlinear GPA<->HVA mapping instead of
> >   1:1 mapping of pc-dimm,
> >   so if one needs to build HVA<->GPA map for a given MemoryRegion
> >   in case of aliases one would have to get list of MemorySections
> >   that belong to it and build map from that vs (addr + offset) in
> >   case of simple 1:1 mapping.
> > 
> >   complicated machine specific SRAT/e820 code due to holes
> >   /grep 'the memory map is a bit tricky'/
> > 
> >>>  But since that is added
> >>> anyway as part of this series, I am not sure we have any other benefit in
> >>> modeling it as pc-dimm. May be I am missing something here.  
> >>
> >> I tentatively agree with you. I was trying to understand if the
> >> device_memory region was fitting the original needs too but I think
> >> standard alias approach is more adapted to hole creation.
> > Aliases are easy way to start with, but as compat knobs grow
> > (based on PC experience,  grep 'Calculate ram split')
> > It's quite a pain to maintain manual implicit aliases layout
> > without breaking it by accident.
> > We probably won't be able to get rid of aliases on PC for legacy
> > reasons but why introduce the same pain to virt board.
> > 
> > Well, magical conversion from -m X to 2..y memory regions (aliases or not)
> > aren't going to be easy in both cases, especially if one would take into
> > account "-numa memdev|mem".
> > I'd rather use a single pc-dimm approach for both /initial and hotpluggble RAM/
> > and then use device_memory framework to enumerate RAM wherever needed (ACPI/DT)
> > in inform way.
> We have 2 problematics:
> - support more RAM. This can be achieved by adding a single memory
> region based on DIMMs
> - manage IOVA reserved regions. I don't think we have a consensus on the
> solution at the moment. What about migration between 2 guests having a
> different memory topogy?

With a dynamic RAM base (pretty easy to do in QEMU, but it requires a FW
change - now on my TODO), one only needs to pick a contiguous region within
the guest physical address limits that has the requested size and does not
overlap any host reserved regions (I think). I'm still not sure what the
migration concern is.
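
A sketch of that placement search (not QEMU code, just the logic described
above, with made-up types): given the list of host reserved regions, pick
the lowest aligned base whose [base, base + size) window avoids all of them
and still fits under the PA limit.

#include <stdbool.h>
#include <stdint.h>

typedef struct { uint64_t start, end; } Region;   /* end is inclusive */

static uint64_t find_ram_base(uint64_t floor, uint64_t limit, uint64_t size,
                              uint64_t align, const Region *rsv, int n)
{
    uint64_t base = (floor + align - 1) & ~(align - 1);

    for (;;) {
        bool clash = false;

        for (int i = 0; i < n; i++) {
            if (base <= rsv[i].end && rsv[i].start <= base + size - 1) {
                /* overlap: retry just above this reserved region */
                base = (rsv[i].end + align) & ~(align - 1);
                clash = true;
                break;
            }
        }
        if (!clash) {
            return (base + size - 1 < limit) ? base : 0; /* 0 == no fit */
        }
    }
}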

Thanks,
drew

> 
> Thanks
> 
> Eric
> > 
> > 
> >> Thanks
> >>
> >> Eric
> >>>
> >>> Thanks,
> >>> Shameer
> >>>
> >>> [1]. https://lists.gnu.org/archive/html/qemu-arm/2018-04/msg00243.html
> >>>
> >>>   
> >>>> Thanks!
> >>>>
> >>>> Eric  
> >>>>>  
> >>>>>>
> >>>>>> Thanks
> >>>>>>
> >>>>>> Eric
> >>>>>>  
> >>>>>>>  
> >>>>>>>>
> >>>>>>>> Thanks
> >>>>>>>>
> >>>>>>>> Eric  
> >>>>>>>
> >>>>>>>  
> >>>>>
> >>>>>  
> > 
> > 
>
Eric Auger July 12, 2018, 2:53 p.m. UTC | #14
Hi Drew,

On 07/12/2018 04:45 PM, Andrew Jones wrote:
> On Thu, Jul 12, 2018 at 04:22:05PM +0200, Auger Eric wrote:
>> Hi Igor,
>>
>> On 07/11/2018 03:17 PM, Igor Mammedov wrote:
>>> On Thu, 5 Jul 2018 16:27:05 +0200
>>> Auger Eric <eric.auger@redhat.com> wrote:
>>>
>>>> Hi Shameer,
>>>>
>>>> On 07/05/2018 03:19 PM, Shameerali Kolothum Thodi wrote:
>>>>>   
>>>>>>
>>>>>> Hi David,
>>>>>>
>>>>>> On 07/05/2018 02:09 PM, David Hildenbrand wrote:  
>>>>>>> On 05.07.2018 14:00, Auger Eric wrote:  
>>>>>>>> Hi David,
>>>>>>>>
>>>>>>>> On 07/05/2018 01:54 PM, David Hildenbrand wrote:  
>>>>>>>>> On 05.07.2018 13:42, Auger Eric wrote:  
>>>>>>>>>> Hi David,
>>>>>>>>>>
>>>>>>>>>> On 07/04/2018 02:05 PM, David Hildenbrand wrote:  
>>>>>>>>>>> On 03.07.2018 21:27, Auger Eric wrote:  
>>>>>>>>>>>> Hi David,
>>>>>>>>>>>> On 07/03/2018 08:25 PM, David Hildenbrand wrote:  
>>>>>>>>>>>>> On 03.07.2018 09:19, Eric Auger wrote:  
>>>>>>>>>>>>>> We define a new hotpluggable RAM region (aka. device memory).
>>>>>>>>>>>>>> Its base is 2TB GPA. This obviously requires 42b IPA support
>>>>>>>>>>>>>> in KVM/ARM, FW and guest kernel. At the moment the device
>>>>>>>>>>>>>> memory region is max 2TB.  
>>>>>>>>>>>>>
>>>>>>>>>>>>> Maybe a stupid question, but why exactly does it have to start at 2TB
>>>>>>>>>>>>> (and not e.g. at 1TB)?  
>>>>>>>>>>>> not a stupid question. See tentative answer below.  
>>>>>>>>>>>>>  
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> This is largely inspired of device memory initialization in
>>>>>>>>>>>>>> pc machine code.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>>>>>>>>>>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>  hw/arm/virt.c         | 104  
>>>>>> ++++++++++++++++++++++++++++++++++++--------------  
>>>>>>>>>>>>>>  include/hw/arm/arm.h  |   2 +
>>>>>>>>>>>>>>  include/hw/arm/virt.h |   1 +
>>>>>>>>>>>>>>  3 files changed, 79 insertions(+), 28 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>>>>>>>>>>>> index 5a4d0bf..6fefb78 100644
>>>>>>>>>>>>>> --- a/hw/arm/virt.c
>>>>>>>>>>>>>> +++ b/hw/arm/virt.c
>>>>>>>>>>>>>> @@ -59,6 +59,7 @@
>>>>>>>>>>>>>>  #include "qapi/visitor.h"
>>>>>>>>>>>>>>  #include "standard-headers/linux/input.h"
>>>>>>>>>>>>>>  #include "hw/arm/smmuv3.h"
>>>>>>>>>>>>>> +#include "hw/acpi/acpi.h"
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
>>>>>>>>>>>>>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc,  
>>>>>> \  
>>>>>>>>>>>>>> @@ -94,34 +95,25 @@
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>  #define PLATFORM_BUS_NUM_IRQS 64
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this  
>>>>>> means  
>>>>>>>>>>>>>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
>>>>>>>>>>>>>> - * address space unallocated and free for future use between 256G  
>>>>>> and 512G.  
>>>>>>>>>>>>>> - * If we need to provide more RAM to VMs in the future then we  
>>>>>> need to:  
>>>>>>>>>>>>>> - *  * allocate a second bank of RAM starting at 2TB and working up  
>>>>>>>>>>>> I acknowledge this comment was the main justification. Now if you look  
>>>>>> at  
>>>>>>>>>>>>
>>>>>>>>>>>> Principles of ARM Memory Maps
>>>>>>>>>>>>  
>>>>>> http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_princ
>>>>>> iples_of_arm_memory_maps.pdf  
>>>>>>>>>>>> chapter 2.3 you will find that when adding PA bits, you always leave
>>>>>>>>>>>> space for reserved space and mapped IO.  
>>>>>>>>>>>
>>>>>>>>>>> Thanks for the pointer!
>>>>>>>>>>>
>>>>>>>>>>> So ... we can fit
>>>>>>>>>>>
>>>>>>>>>>> a) 2GB at 2GB
>>>>>>>>>>> b) 32GB at 32GB
>>>>>>>>>>> c) 512GB at 512GB
>>>>>>>>>>> d) 8TB at 8TB
>>>>>>>>>>> e) 128TB at 128TB
>>>>>>>>>>>
>>>>>>>>>>> (this is a nice rule of thumb if I understand it correctly :) )
>>>>>>>>>>>
>>>>>>>>>>> We should strive for device memory (maxram_size - ram_size) to fit
>>>>>>>>>>> exactly into one of these slots (otherwise things get nasty).
>>>>>>>>>>>
>>>>>>>>>>> Depending on the ram_size, we might have simpler setups and can  
>>>>>> support  
>>>>>>>>>>> more configurations, no?
>>>>>>>>>>>
>>>>>>>>>>> E.g. ram_size <= 34GB, device_memory <= 512GB  
>>>>>>>>>>> -> move ram into a) and b)
>>>>>>>>>>> -> move device memory into c)  
>>>>>>>>>>
>>>>>>>>>> The issue is machvirt doesn't comply with that document.
>>>>>>>>>> At the moment we have
>>>>>>>>>> 0 -> 1GB MMIO
>>>>>>>>>> 1GB -> 256GB RAM
>>>>>>>>>> 256GB -> 512GB is theoretically reserved for IO but most is free.
>>>>>>>>>> 512GB -> 1T is reserved for ECAM MMIO range. This is the top of our
>>>>>>>>>> existing 40b GPA space.
>>>>>>>>>>
>>>>>>>>>> We don't want to change this address map due to legacy reasons.
>>>>>>>>>>  
>>>>>>>>>
>>>>>>>>> Thanks, good to know!
>>>>>>>>>  
>>>>>>>>>> Another question! do you know if it would be possible to have
>>>>>>>>>> device_memory region split into several discontinuous segments?  
>>>>>>>>>
>>>>>>>>> It can be implemented for sure, but I would try to avoid that, as it
>>>>>>>>> makes certain configurations impossible (and very end user unfriendly).
>>>>>>>>>
>>>>>>>>> E.g. (numbers completely made up, but it should show what I mean)
>>>>>>>>>
>>>>>>>>> -m 20G,maxmem=120G:  
>>>>>>>>> -> Try to add a DIMM with 100G -> error.
>>>>>>>>> -> But we can add e.g. two DIMMs with 40G and 60G.  
>>>>>>>>>
>>>>>>>>> This exposes internal details to the end user. And the end user has no
>>>>>>>>> idea what is going on.
>>>>>>>>>
>>>>>>>>> So I think we should try our best to keep that area consecutive.  
>>>>>>>>
>>>>>>>> Actually I didn't sufficiently detail the context. I would like
>>>>>>>> 1) 1 segment to be exposed to the end-user through slot|maxmem stuff
>>>>>>>> (what this series targets) and
>>>>>>>> 2) another segment used to instantiate PC-DIMM for internal needs as
>>>>>>>> replacement of part of the 1GB -> 256GB static RAM. This was the purpose
>>>>>>>> of Shameer's original series  
>>>>>>>
>>>>>>> I am not sure if PC-DIMMs are exactly what you want for internal purposes.
>>>>>>>  
>>>>>>>>
>>>>>>>> [1] [RFC v2 0/6] hw/arm: Add support for non-contiguous iova regions
>>>>>>>> http://patchwork.ozlabs.org/cover/914694/
>>>>>>>> This approach is not yet validated though.
>>>>>>>>
>>>>>>>> The rationale is sometimes you must have "holes" in RAM as some GPAs
>>>>>>>> match reserved IOVAs for assigned devices.  
>>>>>>>
>>>>>>> So if I understand it correctly, all you want is some memory region that
>>>>>>> a) contains only initially defined memory
>>>>>>> b) can have some holes in it
>>>>>>>
>>>>>>> This is exactly what x86 already does (pc_memory_init): Simply construct
>>>>>>> your own memory region leaving holes in it.
>>>>>>>
>>>>>>>
>>>>>>> memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
>>>>>>>                          0, pcms->below_4g_mem_size);
>>>>>>> memory_region_add_subregion(system_memory, 0, ram_below_4g);
>>>>>>> ...
>>>>>>> if (pcms->above_4g_mem_size > 0)
>>>>>>>     memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
>>>>>>>     ...
>>>>>>>     memory_region_add_subregion(system_memory, 0x100000000ULL,
>>>>>>>     ...
>>>>>>>
>>>>>>> They "indicate" these different GPA areas using the e820 map to the guest.
>>>>>>>
>>>>>>> Would that also work for you?  
>>>>>>
>>>>>> I would tentatively say yes. Effectively I am not sure that if we were
>>>>>> to actually put holes in the 1G-256GB RAM segment, PC-DIMM would be the
>>>>>> natural choice. Also the reserved IOVA issue impacts the device_memory
>>>>>> region area I think. I am skeptical about the fact we can put holes in
>>>>>> static RAM and device_memory regions like that.
>>> Could we just use a single device_memory region for both initial+hotpluggable
>>> RAM if we make base RAM address dynamic?
>> This assumes FW does support dynamic RAM base. If I understand correctly
>> this is not the case. 
> 
> It's not currently the case, but I've adding prototyping this near the top
> of my TODO. So stay tuned.
ok
> 
>> Also there is the problematic of migration. How
>> would you migrate between guests whose RAM is not laid out at the same
>> place?
> 
> I'm not sure what you mean here. Boot a guest with a new memory map,
> probably by explicitly asking for it with a new machine property,
> which means a new virt machine version. Then migrate at will to any
> host that supports that machine type.
My concern was rather about the holes in the memory map matching the
reserved regions.
> 
>> I understood hotplug memory relied on a specific device_memory
>> region. So do you mean we would have 2 contiguous regions?
> 
> I think Igor wants one contiguous region for RAM, where additional
> space can be reserved for hotplugging.
This is not compliant with the 2012 ARM white paper, although I don't
really know whether that document is truly authoritative (I did not get
any reply).

Thanks

Eric
> 
>>> In this case RAM could start wherever there is a free space for maxmem
>>> (if there is free space in lowmem, put device_memory there, otherwise put
>>> it somewhere in high mem) and we won't care if there is IOVA or not.
>>>
>>> *I don't have a clue about iommus, so here goes a stupid question*
>>> I agree with Peter that whole IOVA thing looks broken, when host
>>> layout dictates the guest's one.
>>> Shouldn't be there somewhere an iommu that would remap host map into
>>> guest specific one? (so guest would model board we need and be migrate-able
>>> instead of mimicking host hw)
>> The issue is related to IOVAs programmed by the guest in the host
>> assigned devices. DMA requests issued by the assigned devices using
>> those IOVAs are supposed to reach the guest RAM. But due to the host
>> topology they won't (host PCI host bridge windows or MSI reserved
>> regions). Adding a vIOMMU on guest side effectively allows to use IOVAs
>> != GPAs but guest is exposed to that change. Adding this extra
>> translation stage adds a huge performance penalty. And eventually the
>> IOVA allocator used in the guest should theoretically be aware of host
>> reserved regions as well.
>>
>>>
>>>
>>>>> The first approach[1] we had to address the holes in memory was using
>>>>> the memory alias way mentioned above.  And based on Drew's review, the
>>>>> pc-dimm way of handling was introduced. I think the main argument was that
>>>>> it will be useful when we eventually support hotplug.  
>>>>
>>>> That's my understanding too.
>>> not only hotplug,
>>>
>>>   a RAM memory region that's split by aliases is difficult to handle
>>>   as it creates nonlinear GPA<->HVA mapping instead of
>>>   1:1 mapping of pc-dimm,
>>>   so if one needs to build HVA<->GPA map for a given MemoryRegion
>>>   in case of aliases one would have to get list of MemorySections
>>>   that belong to it and build map from that vs (addr + offset) in
>>>   case of simple 1:1 mapping.
>>>
>>>   complicated machine specific SRAT/e820 code due to holes
>>>   /grep 'the memory map is a bit tricky'/
>>>
>>>>>  But since that is added
>>>>> anyway as part of this series, I am not sure we have any other benefit in
>>>>> modeling it as pc-dimm. May be I am missing something here.  
>>>>
>>>> I tentatively agree with you. I was trying to understand if the
>>>> device_memory region was fitting the original needs too but I think
>>>> standard alias approach is more adapted to hole creation.
>>> Aliases are easy way to start with, but as compat knobs grow
>>> (based on PC experience,  grep 'Calculate ram split')
>>> It's quite a pain to maintain manual implicit aliases layout
>>> without breaking it by accident.
>>> We probably won't be able to get rid of aliases on PC for legacy
>>> reasons but why introduce the same pain to virt board.
>>>
>>> Well, magical conversion from -m X to 2..y memory regions (aliases or not)
>>> aren't going to be easy in both cases, especially if one would take into
>>> account "-numa memdev|mem".
>>> I'd rather use a single pc-dimm approach for both /initial and hotpluggble RAM/
>>> and then use device_memory framework to enumerate RAM wherever needed (ACPI/DT)
>>> in inform way.
>> We have 2 problematics:
>> - support more RAM. This can be achieved by adding a single memory
>> region based on DIMMs
>> - manage IOVA reserved regions. I don't think we have a consensus on the
>> solution at the moment. What about migration between 2 guests having a
>> different memory topogy?
> 
> With a dynamic RAM base (pretty easy to do in QEMU, but requires FW change
> - now on my TODO), then one only needs to pick a contiguous region within
> the guest physical address limits that has the requested size and does not
> overlap any host reserved regions (I think). I'm still not sure what the
> migration concern is.
> 
> Thanks,
> drew
> 
>>
>> Thanks
>>
>> Eric
>>>
>>>
>>>> Thanks
>>>>
>>>> Eric
>>>>>
>>>>> Thanks,
>>>>> Shameer
>>>>>
>>>>> [1]. https://lists.gnu.org/archive/html/qemu-arm/2018-04/msg00243.html
>>>>>
>>>>>   
>>>>>> Thanks!
>>>>>>
>>>>>> Eric  
>>>>>>>  
>>>>>>>>
>>>>>>>> Thanks
>>>>>>>>
>>>>>>>> Eric
>>>>>>>>  
>>>>>>>>>  
>>>>>>>>>>
>>>>>>>>>> Thanks
>>>>>>>>>>
>>>>>>>>>> Eric  
>>>>>>>>>
>>>>>>>>>  
>>>>>>>
>>>>>>>  
>>>
>>>
>>
Andrew Jones July 12, 2018, 3:15 p.m. UTC | #15
On Thu, Jul 12, 2018 at 04:53:01PM +0200, Auger Eric wrote:
> Hi Drew,
> 
> On 07/12/2018 04:45 PM, Andrew Jones wrote:
> > On Thu, Jul 12, 2018 at 04:22:05PM +0200, Auger Eric wrote:
> >> Hi Igor,
> >>
> >> On 07/11/2018 03:17 PM, Igor Mammedov wrote:
> >>> On Thu, 5 Jul 2018 16:27:05 +0200
> >>> Auger Eric <eric.auger@redhat.com> wrote:
> >>>
> >>>> Hi Shameer,
> >>>>
> >>>> On 07/05/2018 03:19 PM, Shameerali Kolothum Thodi wrote:
> >>>>>   
> >>>>>> -----Original Message-----
> >>>>>> From: Auger Eric [mailto:eric.auger@redhat.com]
> >>>>>> Sent: 05 July 2018 13:18
> >>>>>> To: David Hildenbrand <david@redhat.com>; eric.auger.pro@gmail.com;
> >>>>>> qemu-devel@nongnu.org; qemu-arm@nongnu.org; peter.maydell@linaro.org;
> >>>>>> Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> >>>>>> imammedo@redhat.com
> >>>>>> Cc: wei@redhat.com; drjones@redhat.com; david@gibson.dropbear.id.au;
> >>>>>> dgilbert@redhat.com; agraf@suse.de
> >>>>>> Subject: Re: [Qemu-devel] [RFC v3 06/15] hw/arm/virt: Allocate
> >>>>>> device_memory
> >>>>>>
> >>>>>> Hi David,
> >>>>>>
> >>>>>> On 07/05/2018 02:09 PM, David Hildenbrand wrote:  
> >>>>>>> On 05.07.2018 14:00, Auger Eric wrote:  
> >>>>>>>> Hi David,
> >>>>>>>>
> >>>>>>>> On 07/05/2018 01:54 PM, David Hildenbrand wrote:  
> >>>>>>>>> On 05.07.2018 13:42, Auger Eric wrote:  
> >>>>>>>>>> Hi David,
> >>>>>>>>>>
> >>>>>>>>>> On 07/04/2018 02:05 PM, David Hildenbrand wrote:  
> >>>>>>>>>>> On 03.07.2018 21:27, Auger Eric wrote:  
> >>>>>>>>>>>> Hi David,
> >>>>>>>>>>>> On 07/03/2018 08:25 PM, David Hildenbrand wrote:  
> >>>>>>>>>>>>> On 03.07.2018 09:19, Eric Auger wrote:  
> >>>>>>>>>>>>>> We define a new hotpluggable RAM region (aka. device memory).
> >>>>>>>>>>>>>> Its base is 2TB GPA. This obviously requires 42b IPA support
> >>>>>>>>>>>>>> in KVM/ARM, FW and guest kernel. At the moment the device
> >>>>>>>>>>>>>> memory region is max 2TB.  
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Maybe a stupid question, but why exactly does it have to start at 2TB
> >>>>>>>>>>>>> (and not e.g. at 1TB)?  
> >>>>>>>>>>>> not a stupid question. See tentative answer below.  
> >>>>>>>>>>>>>  
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> This is largely inspired of device memory initialization in
> >>>>>>>>>>>>>> pc machine code.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> >>>>>>>>>>>>>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
> >>>>>>>>>>>>>> ---
> >>>>>>>>>>>>>>  hw/arm/virt.c         | 104  
> >>>>>> ++++++++++++++++++++++++++++++++++++--------------  
> >>>>>>>>>>>>>>  include/hw/arm/arm.h  |   2 +
> >>>>>>>>>>>>>>  include/hw/arm/virt.h |   1 +
> >>>>>>>>>>>>>>  3 files changed, 79 insertions(+), 28 deletions(-)
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> >>>>>>>>>>>>>> index 5a4d0bf..6fefb78 100644
> >>>>>>>>>>>>>> --- a/hw/arm/virt.c
> >>>>>>>>>>>>>> +++ b/hw/arm/virt.c
> >>>>>>>>>>>>>> @@ -59,6 +59,7 @@
> >>>>>>>>>>>>>>  #include "qapi/visitor.h"
> >>>>>>>>>>>>>>  #include "standard-headers/linux/input.h"
> >>>>>>>>>>>>>>  #include "hw/arm/smmuv3.h"
> >>>>>>>>>>>>>> +#include "hw/acpi/acpi.h"
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
> >>>>>>>>>>>>>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc,  
> >>>>>> \  
> >>>>>>>>>>>>>> @@ -94,34 +95,25 @@
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>  #define PLATFORM_BUS_NUM_IRQS 64
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this  
> >>>>>> means  
> >>>>>>>>>>>>>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
> >>>>>>>>>>>>>> - * address space unallocated and free for future use between 256G  
> >>>>>> and 512G.  
> >>>>>>>>>>>>>> - * If we need to provide more RAM to VMs in the future then we  
> >>>>>> need to:  
> >>>>>>>>>>>>>> - *  * allocate a second bank of RAM starting at 2TB and working up  
> >>>>>>>>>>>> I acknowledge this comment was the main justification. Now if you look  
> >>>>>> at  
> >>>>>>>>>>>>
> >>>>>>>>>>>> Principles of ARM Memory Maps
> >>>>>>>>>>>>  
> >>>>>> http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_princ
> >>>>>> iples_of_arm_memory_maps.pdf  
> >>>>>>>>>>>> chapter 2.3 you will find that when adding PA bits, you always leave
> >>>>>>>>>>>> space for reserved space and mapped IO.  
> >>>>>>>>>>>
> >>>>>>>>>>> Thanks for the pointer!
> >>>>>>>>>>>
> >>>>>>>>>>> So ... we can fit
> >>>>>>>>>>>
> >>>>>>>>>>> a) 2GB at 2GB
> >>>>>>>>>>> b) 32GB at 32GB
> >>>>>>>>>>> c) 512GB at 512GB
> >>>>>>>>>>> d) 8TB at 8TB
> >>>>>>>>>>> e) 128TB at 128TB
> >>>>>>>>>>>
> >>>>>>>>>>> (this is a nice rule of thumb if I understand it correctly :) )
> >>>>>>>>>>>
> >>>>>>>>>>> We should strive for device memory (maxram_size - ram_size) to fit
> >>>>>>>>>>> exactly into one of these slots (otherwise things get nasty).
> >>>>>>>>>>>
> >>>>>>>>>>> Depending on the ram_size, we might have simpler setups and can  
> >>>>>> support  
> >>>>>>>>>>> more configurations, no?
> >>>>>>>>>>>
> >>>>>>>>>>> E.g. ram_size <= 34GB, device_memory <= 512GB  
> >>>>>>>>>>> -> move ram into a) and b)
> >>>>>>>>>>> -> move device memory into c)  
> >>>>>>>>>>
> >>>>>>>>>> The issue is machvirt doesn't comply with that document.
> >>>>>>>>>> At the moment we have
> >>>>>>>>>> 0 -> 1GB MMIO
> >>>>>>>>>> 1GB -> 256GB RAM
> >>>>>>>>>> 256GB -> 512GB is theoretically reserved for IO but most is free.
> >>>>>>>>>> 512GB -> 1T is reserved for ECAM MMIO range. This is the top of our
> >>>>>>>>>> existing 40b GPA space.
> >>>>>>>>>>
> >>>>>>>>>> We don't want to change this address map due to legacy reasons.
> >>>>>>>>>>  
> >>>>>>>>>
> >>>>>>>>> Thanks, good to know!
> >>>>>>>>>  
> >>>>>>>>>> Another question! do you know if it would be possible to have
> >>>>>>>>>> device_memory region split into several discontinuous segments?  
> >>>>>>>>>
> >>>>>>>>> It can be implemented for sure, but I would try to avoid that, as it
> >>>>>>>>> makes certain configurations impossible (and very end user unfriendly).
> >>>>>>>>>
> >>>>>>>>> E.g. (numbers completely made up, but it should show what I mean)
> >>>>>>>>>
> >>>>>>>>> -m 20G,maxmem=120G:  
> >>>>>>>>> -> Try to add a DIMM with 100G -> error.
> >>>>>>>>> -> But we can add e.g. two DIMMs with 40G and 60G.  
> >>>>>>>>>
> >>>>>>>>> This exposes internal details to the end user. And the end user has no
> >>>>>>>>> idea what is going on.
> >>>>>>>>>
> >>>>>>>>> So I think we should try our best to keep that area consecutive.  
> >>>>>>>>
> >>>>>>>> Actually I didn't sufficiently detail the context. I would like
> >>>>>>>> 1) 1 segment to be exposed to the end-user through slot|maxmem stuff
> >>>>>>>> (what this series targets) and
> >>>>>>>> 2) another segment used to instantiate PC-DIMM for internal needs as
> >>>>>>>> replacement of part of the 1GB -> 256GB static RAM. This was the purpose
> >>>>>>>> of Shameer's original series  
> >>>>>>>
> >>>>>>> I am not sure if PC-DIMMs are exactly what you want for internal purposes.
> >>>>>>>  
> >>>>>>>>
> >>>>>>>> [1] [RFC v2 0/6] hw/arm: Add support for non-contiguous iova regions
> >>>>>>>> http://patchwork.ozlabs.org/cover/914694/
> >>>>>>>> This approach is not yet validated though.
> >>>>>>>>
> >>>>>>>> The rationale is sometimes you must have "holes" in RAM as some GPAs
> >>>>>>>> match reserved IOVAs for assigned devices.  
> >>>>>>>
> >>>>>>> So if I understand it correctly, all you want is some memory region that
> >>>>>>> a) contains only initially defined memory
> >>>>>>> b) can have some holes in it
> >>>>>>>
> >>>>>>> This is exactly what x86 already does (pc_memory_init): Simply construct
> >>>>>>> your own memory region leaving holes in it.
> >>>>>>>
> >>>>>>>
> >>>>>>> memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
> >>>>>>>                          0, pcms->below_4g_mem_size);
> >>>>>>> memory_region_add_subregion(system_memory, 0, ram_below_4g);
> >>>>>>> ...
> >>>>>>> if (pcms->above_4g_mem_size > 0)
> >>>>>>>     memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> >>>>>>>     ...
> >>>>>>>     memory_region_add_subregion(system_memory, 0x100000000ULL,
> >>>>>>>     ...
> >>>>>>>
> >>>>>>> They "indicate" these different GPA areas using the e820 map to the guest.
> >>>>>>>
> >>>>>>> Would that also work for you?  
> >>>>>>
> >>>>>> I would tentatively say yes. Effectively I am not sure that if we were
> >>>>>> to actually put holes in the 1G-256GB RAM segment, PC-DIMM would be the
> >>>>>> natural choice. Also the reserved IOVA issue impacts the device_memory
> >>>>>> region area I think. I am skeptical about the fact we can put holes in
> >>>>>> static RAM and device_memory regions like that.
> >>> Could we just use a single device_memory region for both initial+hotpluggable
> >>> RAM if we make base RAM address dynamic?
> >> This assumes FW does support dynamic RAM base. If I understand correctly
> >> this is not the case. 
> > 
> > It's not currently the case, but I've adding prototyping this near the top
> > of my TODO. So stay tuned.
> ok
> > 
> >> Also there is the problematic of migration. How
> >> would you migrate between guests whose RAM is not laid out at the same
> >> place?
> > 
> > I'm not sure what you mean here. Boot a guest with a new memory map,
> > probably by explicitly asking for it with a new machine property,
> > which means a new virt machine version. Then migrate at will to any
> > host that supports that machine type.
> My concern rather was about holes in the memory map matching reserved
> regions.

Oh, I see. I don't think the reserved-host-memory-regions-messing-up-guest
problem can ever be solved for the migration case where the destination
host's reserved regions are not an equal set or a subset of the source
host's. So there's nothing we can do but force the upper management layers
to maintain migration candidate lists in these environments. A pre-check
could also be added, allowing migration to error out early when an overlap
is detected.
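
Roughly, that pre-check boils down to an interval overlap test between the
guest RAM ranges and the destination host's reserved regions. A minimal
sketch (the helper name and its use of the virt MemMapEntry {base, size}
pairs are made up here, this is not an existing API):

    static bool overlaps_reserved_region(const MemMapEntry *ram, int nr_ram,
                                         const MemMapEntry *rsvd, int nr_rsvd)
    {
        /* true if any guest RAM range intersects a host reserved region */
        for (int i = 0; i < nr_ram; i++) {
            for (int j = 0; j < nr_rsvd; j++) {
                if (ram[i].base < rsvd[j].base + rsvd[j].size &&
                    rsvd[j].base < ram[i].base + ram[i].size) {
                    return true;
                }
            }
        }
        return false;
    }

The destination would run this against the incoming guest's RAM layout and
refuse the migration before any state is transferred.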

> > 
> >> I understood hotplug memory relied on a specific device_memory
> >> region. So do you mean we would have 2 contiguous regions?
> > 
> > I think Igor wants one contiguous region for RAM, where additional
> > space can be reserved for hotplugging.
> This is not compliant with 2012 ARM white paper, although I don't really
> know if this document truly is a reference (did not get any reply).
> 
> Thanks
> 
> Eric
> > 
> >>> In this case RAM could start wherever there is a free space for maxmem
> >>> (if there is free space in lowmem, put device_memory there, otherwise put
> >>> it somewhere in high mem) and we won't care if there is IOVA or not.
> >>>
> >>> *I don't have a clue about iommus, so here goes a stupid question*
> >>> I agree with Peter that whole IOVA thing looks broken, when host
> >>> layout dictates the guest's one.
> >>> Shouldn't be there somewhere an iommu that would remap host map into
> >>> guest specific one? (so guest would model board we need and be migrate-able
> >>> instead of mimicking host hw)
> >> The issue is related to IOVAs programmed by the guest in the host
> >> assigned devices. DMA requests issued by the assigned devices using
> >> those IOVAs are supposed to reach the guest RAM. But due to the host
> >> topology they won't (host PCI host bridge windows or MSI reserved
> >> regions). Adding a vIOMMU on guest side effectively allows to use IOVAs
> >> != GPAs but guest is exposed to that change. Adding this extra
> >> translation stage adds a huge performance penalty. And eventually the
> >> IOVA allocator used in the guest should theoretically be aware of host
> >> reserved regions as well.
> >>
> >>>
> >>>
> >>>>> The first approach[1] we had to address the holes in memory was using
> >>>>> the memory alias way mentioned above.  And based on Drew's review, the
> >>>>> pc-dimm way of handling was introduced. I think the main argument was that
> >>>>> it will be useful when we eventually support hotplug.  
> >>>>
> >>>> That's my understanding too.
> >>> not only hotplug,
> >>>
> >>>   a RAM memory region that's split by aliases is difficult to handle
> >>>   as it creates nonlinear GPA<->HVA mapping instead of
> >>>   1:1 mapping of pc-dimm,
> >>>   so if one needs to build HVA<->GPA map for a given MemoryRegion
> >>>   in case of aliases one would have to get list of MemorySections
> >>>   that belong to it and build map from that vs (addr + offset) in
> >>>   case of simple 1:1 mapping.
> >>>
> >>>   complicated machine specific SRAT/e820 code due to holes
> >>>   /grep 'the memory map is a bit tricky'/
> >>>
> >>>>  But since that is added
> >>>>> anyway as part of this series, I am not sure we have any other benefit in
> >>>>> modeling it as pc-dimm. May be I am missing something here.  
> >>>>
> >>>> I tentatively agree with you. I was trying to understand if the
> >>>> device_memory region was fitting the original needs too but I think
> >>>> standard alias approach is more adapted to hole creation.
> >>> Aliases are easy way to start with, but as compat knobs grow
> >>> (based on PC experience,  grep 'Calculate ram split')
> >>> It's quite a pain to maintain manual implicit aliases layout
> >>> without breaking it by accident.
> >>> We probably won't be able to get rid of aliases on PC for legacy
> >>> reasons but why introduce the same pain to virt board.
> >>>
> >>> Well, magical conversion from -m X to 2..y memory regions (aliases or not)
> >>> aren't going to be easy in both cases, especially if one would take into
> >>> account "-numa memdev|mem".
> >>> I'd rather use a single pc-dimm approach for both /initial and hotpluggble RAM/
> >>> and then use device_memory framework to enumerate RAM wherever needed (ACPI/DT)
> >>> in inform way.
> >> We have 2 problems:
> >> - support more RAM. This can be achieved by adding a single memory
> >> region based on DIMMs
> >> - manage IOVA reserved regions. I don't think we have a consensus on the
> >> solution at the moment. What about migration between 2 guests having a
> >> different memory topology?
> > 
> > With a dynamic RAM base (pretty easy to do in QEMU, but requires FW change
> > - now on my TODO), then one only needs to pick a contiguous region within
> > the guest physical address limits that has the requested size and does not
> > overlap any host reserved regions (I think). I'm still not sure what the
> > migration concern is.
> > 
> > Thanks,
> > drew
> > 
> >>
> >> Thanks
> >>
> >> Eric
> >>>
> >>>
> >>>> Thanks
> >>>>
> >>>> Eric
> >>>>>
> >>>>> Thanks,
> >>>>> Shameer
> >>>>>
> >>>>> [1]. https://lists.gnu.org/archive/html/qemu-arm/2018-04/msg00243.html
> >>>>>
> >>>>>   
> >>>>>> Thanks!
> >>>>>>
> >>>>>> Eric  
> >>>>>>>  
> >>>>>>>>
> >>>>>>>> Thanks
> >>>>>>>>
> >>>>>>>> Eric
> >>>>>>>>  
> >>>>>>>>>  
> >>>>>>>>>>
> >>>>>>>>>> Thanks
> >>>>>>>>>>
> >>>>>>>>>> Eric  
> >>>>>>>>>
> >>>>>>>>>  
> >>>>>>>
> >>>>>>>  
> >>>
> >>>
> >>
>
Igor Mammedov July 18, 2018, 1 p.m. UTC | #16
On Thu, 12 Jul 2018 16:53:01 +0200
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Drew,
> 
> On 07/12/2018 04:45 PM, Andrew Jones wrote:
> > On Thu, Jul 12, 2018 at 04:22:05PM +0200, Auger Eric wrote:  
> >> Hi Igor,
> >>
> >> On 07/11/2018 03:17 PM, Igor Mammedov wrote:  
> >>> On Thu, 5 Jul 2018 16:27:05 +0200
> >>> Auger Eric <eric.auger@redhat.com> wrote:
> >>>  
> >>>> Hi Shameer,
> >>>>
> >>>> On 07/05/2018 03:19 PM, Shameerali Kolothum Thodi wrote:  
> >>>>>     
> >>>>>> -----Original Message-----
> >>>>>> From: Auger Eric [mailto:eric.auger@redhat.com]
> >>>>>> Sent: 05 July 2018 13:18
> >>>>>> To: David Hildenbrand <david@redhat.com>; eric.auger.pro@gmail.com;
> >>>>>> qemu-devel@nongnu.org; qemu-arm@nongnu.org; peter.maydell@linaro.org;
> >>>>>> Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> >>>>>> imammedo@redhat.com
> >>>>>> Cc: wei@redhat.com; drjones@redhat.com; david@gibson.dropbear.id.au;
> >>>>>> dgilbert@redhat.com; agraf@suse.de
> >>>>>> Subject: Re: [Qemu-devel] [RFC v3 06/15] hw/arm/virt: Allocate
> >>>>>> device_memory
> >>>>>>
> >>>>>> Hi David,
> >>>>>>
> >>>>>> On 07/05/2018 02:09 PM, David Hildenbrand wrote:    
> >>>>>>> On 05.07.2018 14:00, Auger Eric wrote:    
> >>>>>>>> Hi David,
> >>>>>>>>
> >>>>>>>> On 07/05/2018 01:54 PM, David Hildenbrand wrote:    
> >>>>>>>>> On 05.07.2018 13:42, Auger Eric wrote:    
> >>>>>>>>>> Hi David,
> >>>>>>>>>>
> >>>>>>>>>> On 07/04/2018 02:05 PM, David Hildenbrand wrote:    
> >>>>>>>>>>> On 03.07.2018 21:27, Auger Eric wrote:    
> >>>>>>>>>>>> Hi David,
> >>>>>>>>>>>> On 07/03/2018 08:25 PM, David Hildenbrand wrote:    
> >>>>>>>>>>>>> On 03.07.2018 09:19, Eric Auger wrote:    
> >>>>>>>>>>>>>> We define a new hotpluggable RAM region (aka. device memory).
> >>>>>>>>>>>>>> Its base is 2TB GPA. This obviously requires 42b IPA support
> >>>>>>>>>>>>>> in KVM/ARM, FW and guest kernel. At the moment the device
> >>>>>>>>>>>>>> memory region is max 2TB.    
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Maybe a stupid question, but why exactly does it have to start at 2TB
> >>>>>>>>>>>>> (and not e.g. at 1TB)?    
> >>>>>>>>>>>> not a stupid question. See tentative answer below.    
> >>>>>>>>>>>>>    
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> This is largely inspired of device memory initialization in
> >>>>>>>>>>>>>> pc machine code.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> >>>>>>>>>>>>>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
> >>>>>>>>>>>>>> ---
> >>>>>>>>>>>>>>  hw/arm/virt.c         | 104    
> >>>>>> ++++++++++++++++++++++++++++++++++++--------------    
> >>>>>>>>>>>>>>  include/hw/arm/arm.h  |   2 +
> >>>>>>>>>>>>>>  include/hw/arm/virt.h |   1 +
> >>>>>>>>>>>>>>  3 files changed, 79 insertions(+), 28 deletions(-)
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> >>>>>>>>>>>>>> index 5a4d0bf..6fefb78 100644
> >>>>>>>>>>>>>> --- a/hw/arm/virt.c
> >>>>>>>>>>>>>> +++ b/hw/arm/virt.c
> >>>>>>>>>>>>>> @@ -59,6 +59,7 @@
> >>>>>>>>>>>>>>  #include "qapi/visitor.h"
> >>>>>>>>>>>>>>  #include "standard-headers/linux/input.h"
> >>>>>>>>>>>>>>  #include "hw/arm/smmuv3.h"
> >>>>>>>>>>>>>> +#include "hw/acpi/acpi.h"
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
> >>>>>>>>>>>>>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc,    
> >>>>>> \    
> >>>>>>>>>>>>>> @@ -94,34 +95,25 @@
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>  #define PLATFORM_BUS_NUM_IRQS 64
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this    
> >>>>>> means    
> >>>>>>>>>>>>>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
> >>>>>>>>>>>>>> - * address space unallocated and free for future use between 256G    
> >>>>>> and 512G.    
> >>>>>>>>>>>>>> - * If we need to provide more RAM to VMs in the future then we    
> >>>>>> need to:    
> >>>>>>>>>>>>>> - *  * allocate a second bank of RAM starting at 2TB and working up    
> >>>>>>>>>>>> I acknowledge this comment was the main justification. Now if you look    
> >>>>>> at    
> >>>>>>>>>>>>
> >>>>>>>>>>>> Principles of ARM Memory Maps
> >>>>>>>>>>>>    
> >>>>>> http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_princ
> >>>>>> iples_of_arm_memory_maps.pdf    
> >>>>>>>>>>>> chapter 2.3 you will find that when adding PA bits, you always leave
> >>>>>>>>>>>> space for reserved space and mapped IO.    
> >>>>>>>>>>>
> >>>>>>>>>>> Thanks for the pointer!
> >>>>>>>>>>>
> >>>>>>>>>>> So ... we can fit
> >>>>>>>>>>>
> >>>>>>>>>>> a) 2GB at 2GB
> >>>>>>>>>>> b) 32GB at 32GB
> >>>>>>>>>>> c) 512GB at 512GB
> >>>>>>>>>>> d) 8TB at 8TB
> >>>>>>>>>>> e) 128TB at 128TB
> >>>>>>>>>>>
> >>>>>>>>>>> (this is a nice rule of thumb if I understand it correctly :) )
> >>>>>>>>>>>
> >>>>>>>>>>> We should strive for device memory (maxram_size - ram_size) to fit
> >>>>>>>>>>> exactly into one of these slots (otherwise things get nasty).
> >>>>>>>>>>>
> >>>>>>>>>>> Depending on the ram_size, we might have simpler setups and can    
> >>>>>> support    
> >>>>>>>>>>> more configurations, no?
> >>>>>>>>>>>
> >>>>>>>>>>> E.g. ram_size <= 34GB, device_memory <= 512GB    
> >>>>>>>>>>> -> move ram into a) and b)
> >>>>>>>>>>> -> move device memory into c)    
> >>>>>>>>>>
> >>>>>>>>>> The issue is machvirt doesn't comply with that document.
> >>>>>>>>>> At the moment we have
> >>>>>>>>>> 0 -> 1GB MMIO
> >>>>>>>>>> 1GB -> 256GB RAM
> >>>>>>>>>> 256GB -> 512GB is theoretically reserved for IO but most is free.
> >>>>>>>>>> 512GB -> 1T is reserved for ECAM MMIO range. This is the top of our
> >>>>>>>>>> existing 40b GPA space.
> >>>>>>>>>>
> >>>>>>>>>> We don't want to change this address map due to legacy reasons.
[...]

> >> Also there is the problematic of migration. How
> >> would you migrate between guests whose RAM is not laid out at the same
> >> place?  
> > 
> > I'm not sure what you mean here. Boot a guest with a new memory map,
> > probably by explicitly asking for it with a new machine property,
> > which means a new virt machine version. Then migrate at will to any
> > host that supports that machine type.  
> My concern rather was about holes in the memory map matching reserved
> regions.
> >   
> >> I understood hotplug memory relied on a specific device_memory
> >> region. So do you mean we would have 2 contiguous regions?  
> > 
> > I think Igor wants one contiguous region for RAM, where additional
> > space can be reserved for hotplugging.  
> This is not compliant with 2012 ARM white paper, although I don't really
> know if this document truly is a reference (did not get any reply).
It's up to QEMU to pick the layout. If maxmem is small (up to 256GB) we
could accommodate the legacy requirement and put a single device_memory
region in the 1GB-256GB GPA gap; if it's more we can move the whole
device_memory region to 2TB, 8TB, ... That keeps things manageable for us
and fits the specs (if such exist).
We should make selection of the next RAM base deterministic if possible when
the layout changes due to maxram size or IOVA, so that we won't need compat
knobs/checks to keep the machine migratable.
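
As a rough sketch of that kind of deterministic selection (function name
made up, reusing the SZ_1G constant from the patch, and assuming a single
device_memory region covering initial + hotpluggable RAM):

    static hwaddr select_device_memory_base(uint64_t device_memory_size)
    {
        /* Keep the legacy 1GB base while everything fits below 256GB,
         * otherwise jump to the next "size 2^n at address 2^n" slot.
         */
        if (device_memory_size <= 255 * SZ_1G) {
            return SZ_1G;               /* 1GB..256GB */
        } else if (device_memory_size <= 2048 * SZ_1G) {
            return 2048 * SZ_1G;        /* 2TB..4TB */
        } else {
            return 8192 * SZ_1G;        /* 8TB..16TB */
        }
    }

i.e. the base is a pure function of the requested size, so nothing needs to
be remembered across machine versions.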

[...]
Igor Mammedov July 18, 2018, 1:05 p.m. UTC | #17
On Tue,  3 Jul 2018 09:19:49 +0200
Eric Auger <eric.auger@redhat.com> wrote:

> We define a new hotpluggable RAM region (aka. device memory).
> Its base is 2TB GPA. This obviously requires 42b IPA support
> in KVM/ARM, FW and guest kernel. At the moment the device
> memory region is max 2TB.
> 
> This is largely inspired of device memory initialization in
> pc machine code.
> 
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
> ---
>  hw/arm/virt.c         | 104 ++++++++++++++++++++++++++++++++++++--------------
>  include/hw/arm/arm.h  |   2 +
>  include/hw/arm/virt.h |   1 +
>  3 files changed, 79 insertions(+), 28 deletions(-)
> 
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index 5a4d0bf..6fefb78 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -59,6 +59,7 @@
>  #include "qapi/visitor.h"
>  #include "standard-headers/linux/input.h"
>  #include "hw/arm/smmuv3.h"
> +#include "hw/acpi/acpi.h"
>  
>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
>      static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
> @@ -94,34 +95,25 @@
>  
>  #define PLATFORM_BUS_NUM_IRQS 64
>  
> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this means
> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
> - * address space unallocated and free for future use between 256G and 512G.
> - * If we need to provide more RAM to VMs in the future then we need to:
> - *  * allocate a second bank of RAM starting at 2TB and working up
> - *  * fix the DT and ACPI table generation code in QEMU to correctly
> - *    report two split lumps of RAM to the guest
> - *  * fix KVM in the host kernel to allow guests with >40 bit address spaces
> - * (We don't want to fill all the way up to 512GB with RAM because
> - * we might want it for non-RAM purposes later. Conversely it seems
> - * reasonable to assume that anybody configuring a VM with a quarter
> - * of a terabyte of RAM will be doing it on a host with more than a
> - * terabyte of physical address space.)
> - */
> -#define RAMLIMIT_GB 255
> -#define RAMLIMIT_BYTES (RAMLIMIT_GB * 1024ULL * 1024 * 1024)
> +#define SZ_64K 0x10000
> +#define SZ_1G (1024ULL * 1024 * 1024)
>  
>  /* Addresses and sizes of our components.
> - * 0..128MB is space for a flash device so we can run bootrom code such as UEFI.
> - * 128MB..256MB is used for miscellaneous device I/O.
> - * 256MB..1GB is reserved for possible future PCI support (ie where the
> - * PCI memory window will go if we add a PCI host controller).
> - * 1GB and up is RAM (which may happily spill over into the
> - * high memory region beyond 4GB).
> - * This represents a compromise between how much RAM can be given to
> - * a 32 bit VM and leaving space for expansion and in particular for PCI.
> - * Note that devices should generally be placed at multiples of 0x10000,
> + * 0..128MB is space for a flash device so we can run bootrom code such as UEFI,
> + * 128MB..256MB is used for miscellaneous device I/O,
> + * 256MB..1GB is used for PCI host controller,
> + * 1GB..256GB is RAM (not hotpluggable),
> + * 256GB..512GB: is left for device I/O (non RAM purpose),
> + * 512GB..1TB: high mem PCI MMIO region,
> + * 2TB..4TB is used for hot-pluggable DIMM (assumes 42b GPA is supported).
> + *
> + * Note that IO devices should generally be placed at multiples of 0x10000,
>   * to accommodate guests using 64K pages.
> + *
> + * Conversely it seems reasonable to assume that anybody configuring a VM
> + * with a quarter of a terabyte of RAM will be doing it on a host with more
> + * than a terabyte of physical address space.)
> + *
>   */
>  static const MemMapEntry a15memmap[] = {
>      /* Space up to 0x8000000 is reserved for a boot ROM */
> @@ -148,12 +140,13 @@ static const MemMapEntry a15memmap[] = {
>      [VIRT_PCIE_MMIO] =          { 0x10000000, 0x2eff0000 },
>      [VIRT_PCIE_PIO] =           { 0x3eff0000, 0x00010000 },
>      [VIRT_PCIE_ECAM] =          { 0x3f000000, 0x01000000 },
> -    [VIRT_MEM] =                { 0x40000000, RAMLIMIT_BYTES },
> +    [VIRT_MEM] =                { SZ_1G , 255 * SZ_1G },
>      /* Additional 64 MB redist region (can contain up to 512 redistributors) */
>      [VIRT_GIC_REDIST2] =        { 0x4000000000ULL, 0x4000000 },
>      [VIRT_PCIE_ECAM_HIGH] =     { 0x4010000000ULL, 0x10000000 },
>      /* Second PCIe window, 512GB wide at the 512GB boundary */
> -    [VIRT_PCIE_MMIO_HIGH] =   { 0x8000000000ULL, 0x8000000000ULL },
> +    [VIRT_PCIE_MMIO_HIGH] =     { 512 * SZ_1G, 512 * SZ_1G },
> +    [VIRT_HOTPLUG_MEM] =        { 2048 * SZ_1G, 2048 * SZ_1G },
>  };
>  
>  static const int a15irqmap[] = {
> @@ -1223,6 +1216,58 @@ static void create_secure_ram(VirtMachineState *vms,
>      g_free(nodename);
>  }
>  
> +static void create_device_memory(VirtMachineState *vms, MemoryRegion *sysmem)
> +{
> +    MachineState *ms = MACHINE(vms);
> +    uint64_t device_memory_size;
> +    uint64_t align = SZ_64K;
> +
> +    /* always allocate the device memory information */
> +    ms->device_memory = g_malloc0(sizeof(*ms->device_memory));
> +
> +    if (vms->max_vm_phys_shift < 42) {
> +        /* device memory starts at 2TB whereas this VM supports less than
> +         * 2TB GPA */
> +        if (ms->maxram_size > ms->ram_size || ms->ram_slots) {
> +            MachineClass *mc = MACHINE_GET_CLASS(ms);
> +
> +            error_report("\"-memory 'slots|maxmem'\" is not supported by %s "
> +                         "since KVM does not support more than 41b IPA",
> +                         mc->name);
> +            exit(EXIT_FAILURE);
> +        }
> +        return;
> +    }
> +
> +    if (ms->ram_slots > ACPI_MAX_RAM_SLOTS) {
> +        error_report("unsupported number of memory slots: %"PRIu64,
> +                     ms->ram_slots);
> +        exit(EXIT_FAILURE);
> +    }
> +
> +    if (QEMU_ALIGN_UP(ms->maxram_size, align) != ms->maxram_size) {
> +        error_report("maximum memory size must be aligned to multiple of 0x%"
> +                     PRIx64, align);
> +            exit(EXIT_FAILURE);
> +    }
> +
> +    ms->device_memory->base = vms->memmap[VIRT_HOTPLUG_MEM].base;
> +    device_memory_size = ms->maxram_size - ms->ram_size;
> +
> +    if (device_memory_size > vms->memmap[VIRT_HOTPLUG_MEM].size) {
> +        error_report("unsupported amount of maximum memory: " RAM_ADDR_FMT,
> +                         ms->maxram_size);
> +        exit(EXIT_FAILURE);
> +    }
> +
> +    memory_region_init(&ms->device_memory->mr, OBJECT(vms),
> +                       "device-memory", device_memory_size);
> +    memory_region_add_subregion(sysmem, ms->device_memory->base,
> +                                &ms->device_memory->mr);

> +    vms->bootinfo.device_memory_start = ms->device_memory->base;
> +    vms->bootinfo.device_memory_size = device_memory_size;
why do we need to duplicate it in bootinfo?
(I'd try to avoid using bootinfo and use the original source instead
where it's needed)
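
e.g. the DT/ACPI code could read it back directly, something like
(sketch, just to illustrate the point):

    MachineState *ms = MACHINE(vms);
    hwaddr base = ms->device_memory->base;
    uint64_t size = memory_region_size(&ms->device_memory->mr);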


> +}
> +
>  static void *machvirt_dtb(const struct arm_boot_info *binfo, int *fdt_size)
>  {
>      const VirtMachineState *board = container_of(binfo, VirtMachineState,
> @@ -1430,7 +1475,8 @@ static void machvirt_init(MachineState *machine)
>      vms->smp_cpus = smp_cpus;
>  
>      if (machine->ram_size > vms->memmap[VIRT_MEM].size) {
> -        error_report("mach-virt: cannot model more than %dGB RAM", RAMLIMIT_GB);
> +        error_report("mach-virt: cannot model more than %dGB RAM",
> +                     (int)(vms->memmap[VIRT_MEM].size / SZ_1G));
>          exit(1);
>      }
>  
> @@ -1525,6 +1571,8 @@ static void machvirt_init(MachineState *machine)
>                                           machine->ram_size);
>      memory_region_add_subregion(sysmem, vms->memmap[VIRT_MEM].base, ram);
>  
> +    create_device_memory(vms, sysmem);
> +
>      create_flash(vms, sysmem, secure_sysmem ? secure_sysmem : sysmem);
>  
>      create_gic(vms, pic);
> diff --git a/include/hw/arm/arm.h b/include/hw/arm/arm.h
> index ffed392..76269e6 100644
> --- a/include/hw/arm/arm.h
> +++ b/include/hw/arm/arm.h
> @@ -116,6 +116,8 @@ struct arm_boot_info {
>      bool secure_board_setup;
>  
>      arm_endianness endianness;
> +    hwaddr device_memory_start;
> +    hwaddr device_memory_size;
>  };
>  
>  /**
> diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
> index 91f6de2..173938d 100644
> --- a/include/hw/arm/virt.h
> +++ b/include/hw/arm/virt.h
> @@ -78,6 +78,7 @@ enum {
>      VIRT_GPIO,
>      VIRT_SECURE_UART,
>      VIRT_SECURE_MEM,
> +    VIRT_HOTPLUG_MEM,
>  };
>  
>  typedef enum VirtIOMMUType {
Eric Auger Aug. 8, 2018, 9:33 a.m. UTC | #18
Hi Igor,

On 07/18/2018 03:00 PM, Igor Mammedov wrote:
[...]
>>>
>>> I think Igor wants one contiguous region for RAM, where additional
>>> space can be reserved for hotplugging.  
>> This is not compliant with 2012 ARM white paper, although I don't really
>> know if this document truly is a reference (did not get any reply).
> It's up to QEMU to pick the layout. If maxmem is small (up to 256GB) we
> could accommodate the legacy requirement and put a single device_memory
> region in the 1GB-256GB GPA gap; if it's more we can move the whole
> device_memory region to 2TB, 8TB, ... That keeps things manageable for us
> and fits the specs (if such exist).
> We should make selection of the next RAM base deterministic if possible when
> the layout changes due to maxram size or IOVA, so that we won't need compat
> knobs/checks to keep the machine migratable.
Sorry for the delay. I was out of the office those past weeks.

OK understood. Your preferred approach is to have a contiguous memory
region (initial + hotplug). So this depends on the FW capability to
support flexible RAM base. Let's see how this dependency gets resolved.

This series does not bump the non-hotpluggable memory region limit,
which is still limited to 255GB. The only way to add more memory is
through PCDIMM or NVDIMM (max 2TB atm). To do so you need to add the
,maxmem and ,slots options, which need to be on both source and dest,
right, plus the PCDIMM/NVDIMM device option lines? Also the series
checks that the destination has at least the same IPA range capability
as the source, which determines whether the requested device_memory
size can be accommodated. At the moment I fail to see what other
compat knobs I must be prepared to handle.
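
For instance (sizes purely illustrative), both sides would be started with
something like:

    -m 16G,slots=4,maxmem=1T \
    -object memory-backend-ram,id=mem1,size=64G \
    -device pc-dimm,id=dimm1,memdev=mem1

plus whatever hot-plugged DIMMs were present on the source.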

Thanks

Eric
> 
> [...]
>
Eric Auger Aug. 8, 2018, 9:33 a.m. UTC | #19
Hi Igor,
On 07/18/2018 03:05 PM, Igor Mammedov wrote:
> On Tue,  3 Jul 2018 09:19:49 +0200
> Eric Auger <eric.auger@redhat.com> wrote:
> 
>> We define a new hotpluggable RAM region (aka. device memory).
>> Its base is 2TB GPA. This obviously requires 42b IPA support
>> in KVM/ARM, FW and guest kernel. At the moment the device
>> memory region is max 2TB.
>>
>> This is largely inspired of device memory initialization in
>> pc machine code.
>>
>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>> Signed-off-by: Kwangwoo Lee <kwangwoo.lee@sk.com>
>> ---
>>  hw/arm/virt.c         | 104 ++++++++++++++++++++++++++++++++++++--------------
>>  include/hw/arm/arm.h  |   2 +
>>  include/hw/arm/virt.h |   1 +
>>  3 files changed, 79 insertions(+), 28 deletions(-)
>>
>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>> index 5a4d0bf..6fefb78 100644
>> --- a/hw/arm/virt.c
>> +++ b/hw/arm/virt.c
>> @@ -59,6 +59,7 @@
>>  #include "qapi/visitor.h"
>>  #include "standard-headers/linux/input.h"
>>  #include "hw/arm/smmuv3.h"
>> +#include "hw/acpi/acpi.h"
>>  
>>  #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
>>      static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
>> @@ -94,34 +95,25 @@
>>  
>>  #define PLATFORM_BUS_NUM_IRQS 64
>>  
>> -/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this means
>> - * RAM can go up to the 256GB mark, leaving 256GB of the physical
>> - * address space unallocated and free for future use between 256G and 512G.
>> - * If we need to provide more RAM to VMs in the future then we need to:
>> - *  * allocate a second bank of RAM starting at 2TB and working up
>> - *  * fix the DT and ACPI table generation code in QEMU to correctly
>> - *    report two split lumps of RAM to the guest
>> - *  * fix KVM in the host kernel to allow guests with >40 bit address spaces
>> - * (We don't want to fill all the way up to 512GB with RAM because
>> - * we might want it for non-RAM purposes later. Conversely it seems
>> - * reasonable to assume that anybody configuring a VM with a quarter
>> - * of a terabyte of RAM will be doing it on a host with more than a
>> - * terabyte of physical address space.)
>> - */
>> -#define RAMLIMIT_GB 255
>> -#define RAMLIMIT_BYTES (RAMLIMIT_GB * 1024ULL * 1024 * 1024)
>> +#define SZ_64K 0x10000
>> +#define SZ_1G (1024ULL * 1024 * 1024)
>>  
>>  /* Addresses and sizes of our components.
>> - * 0..128MB is space for a flash device so we can run bootrom code such as UEFI.
>> - * 128MB..256MB is used for miscellaneous device I/O.
>> - * 256MB..1GB is reserved for possible future PCI support (ie where the
>> - * PCI memory window will go if we add a PCI host controller).
>> - * 1GB and up is RAM (which may happily spill over into the
>> - * high memory region beyond 4GB).
>> - * This represents a compromise between how much RAM can be given to
>> - * a 32 bit VM and leaving space for expansion and in particular for PCI.
>> - * Note that devices should generally be placed at multiples of 0x10000,
>> + * 0..128MB is space for a flash device so we can run bootrom code such as UEFI,
>> + * 128MB..256MB is used for miscellaneous device I/O,
>> + * 256MB..1GB is used for PCI host controller,
>> + * 1GB..256GB is RAM (not hotpluggable),
>> + * 256GB..512GB: is left for device I/O (non RAM purpose),
>> + * 512GB..1TB: high mem PCI MMIO region,
>> + * 2TB..4TB is used for hot-pluggable DIMM (assumes 42b GPA is supported).
>> + *
>> + * Note that IO devices should generally be placed at multiples of 0x10000,
>>   * to accommodate guests using 64K pages.
>> + *
>> + * Conversely it seems reasonable to assume that anybody configuring a VM
>> + * with a quarter of a terabyte of RAM will be doing it on a host with more
>> + * than a terabyte of physical address space.)
>> + *
>>   */
>>  static const MemMapEntry a15memmap[] = {
>>      /* Space up to 0x8000000 is reserved for a boot ROM */
>> @@ -148,12 +140,13 @@ static const MemMapEntry a15memmap[] = {
>>      [VIRT_PCIE_MMIO] =          { 0x10000000, 0x2eff0000 },
>>      [VIRT_PCIE_PIO] =           { 0x3eff0000, 0x00010000 },
>>      [VIRT_PCIE_ECAM] =          { 0x3f000000, 0x01000000 },
>> -    [VIRT_MEM] =                { 0x40000000, RAMLIMIT_BYTES },
>> +    [VIRT_MEM] =                { SZ_1G , 255 * SZ_1G },
>>      /* Additional 64 MB redist region (can contain up to 512 redistributors) */
>>      [VIRT_GIC_REDIST2] =        { 0x4000000000ULL, 0x4000000 },
>>      [VIRT_PCIE_ECAM_HIGH] =     { 0x4010000000ULL, 0x10000000 },
>>      /* Second PCIe window, 512GB wide at the 512GB boundary */
>> -    [VIRT_PCIE_MMIO_HIGH] =   { 0x8000000000ULL, 0x8000000000ULL },
>> +    [VIRT_PCIE_MMIO_HIGH] =     { 512 * SZ_1G, 512 * SZ_1G },
>> +    [VIRT_HOTPLUG_MEM] =        { 2048 * SZ_1G, 2048 * SZ_1G },
>>  };
>>  
>>  static const int a15irqmap[] = {
>> @@ -1223,6 +1216,58 @@ static void create_secure_ram(VirtMachineState *vms,
>>      g_free(nodename);
>>  }
>>  
>> +static void create_device_memory(VirtMachineState *vms, MemoryRegion *sysmem)
>> +{
>> +    MachineState *ms = MACHINE(vms);
>> +    uint64_t device_memory_size;
>> +    uint64_t align = SZ_64K;
>> +
>> +    /* always allocate the device memory information */
>> +    ms->device_memory = g_malloc0(sizeof(*ms->device_memory));
>> +
>> +    if (vms->max_vm_phys_shift < 42) {
>> +        /* device memory starts at 2TB whereas this VM supports less than
>> +         * 2TB GPA */
>> +        if (ms->maxram_size > ms->ram_size || ms->ram_slots) {
>> +            MachineClass *mc = MACHINE_GET_CLASS(ms);
>> +
>> +            error_report("\"-memory 'slots|maxmem'\" is not supported by %s "
>> +                         "since KVM does not support more than 41b IPA",
>> +                         mc->name);
>> +            exit(EXIT_FAILURE);
>> +        }
>> +        return;
>> +    }
>> +
>> +    if (ms->ram_slots > ACPI_MAX_RAM_SLOTS) {
>> +        error_report("unsupported number of memory slots: %"PRIu64,
>> +                     ms->ram_slots);
>> +        exit(EXIT_FAILURE);
>> +    }
>> +
>> +    if (QEMU_ALIGN_UP(ms->maxram_size, align) != ms->maxram_size) {
>> +        error_report("maximum memory size must be aligned to multiple of 0x%"
>> +                     PRIx64, align);
>> +            exit(EXIT_FAILURE);
>> +    }
>> +
>> +    ms->device_memory->base = vms->memmap[VIRT_HOTPLUG_MEM].base;
>> +    device_memory_size = ms->maxram_size - ms->ram_size;
>> +
>> +    if (device_memory_size > vms->memmap[VIRT_HOTPLUG_MEM].size) {
>> +        error_report("unsupported amount of maximum memory: " RAM_ADDR_FMT,
>> +                         ms->maxram_size);
>> +        exit(EXIT_FAILURE);
>> +    }
>> +
>> +    memory_region_init(&ms->device_memory->mr, OBJECT(vms),
>> +                       "device-memory", device_memory_size);
>> +    memory_region_add_subregion(sysmem, ms->device_memory->base,
>> +                                &ms->device_memory->mr);
> 
>> +    vms->bootinfo.device_memory_start = ms->device_memory->base;
>> +    vms->bootinfo.device_memory_size = device_memory_size;
> why do we need to duplicate it in bootinfo?
> (I'd try to avoid using bootinfo and use the original source instead
> where it's needed)
agreed. Not needed.

Thanks

Eric
> 
> 
>> +}
>> +
>>  static void *machvirt_dtb(const struct arm_boot_info *binfo, int *fdt_size)
>>  {
>>      const VirtMachineState *board = container_of(binfo, VirtMachineState,
>> @@ -1430,7 +1475,8 @@ static void machvirt_init(MachineState *machine)
>>      vms->smp_cpus = smp_cpus;
>>  
>>      if (machine->ram_size > vms->memmap[VIRT_MEM].size) {
>> -        error_report("mach-virt: cannot model more than %dGB RAM", RAMLIMIT_GB);
>> +        error_report("mach-virt: cannot model more than %dGB RAM",
>> +                     (int)(vms->memmap[VIRT_MEM].size / SZ_1G));
>>          exit(1);
>>      }
>>  
>> @@ -1525,6 +1571,8 @@ static void machvirt_init(MachineState *machine)
>>                                           machine->ram_size);
>>      memory_region_add_subregion(sysmem, vms->memmap[VIRT_MEM].base, ram);
>>  
>> +    create_device_memory(vms, sysmem);
>> +
>>      create_flash(vms, sysmem, secure_sysmem ? secure_sysmem : sysmem);
>>  
>>      create_gic(vms, pic);
>> diff --git a/include/hw/arm/arm.h b/include/hw/arm/arm.h
>> index ffed392..76269e6 100644
>> --- a/include/hw/arm/arm.h
>> +++ b/include/hw/arm/arm.h
>> @@ -116,6 +116,8 @@ struct arm_boot_info {
>>      bool secure_board_setup;
>>  
>>      arm_endianness endianness;
>> +    hwaddr device_memory_start;
>> +    hwaddr device_memory_size;
>>  };
>>  
>>  /**
>> diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
>> index 91f6de2..173938d 100644
>> --- a/include/hw/arm/virt.h
>> +++ b/include/hw/arm/virt.h
>> @@ -78,6 +78,7 @@ enum {
>>      VIRT_GPIO,
>>      VIRT_SECURE_UART,
>>      VIRT_SECURE_MEM,
>> +    VIRT_HOTPLUG_MEM,
>>  };
>>  
>>  typedef enum VirtIOMMUType {
>
Igor Mammedov Aug. 9, 2018, 8:45 a.m. UTC | #20
On Wed, 8 Aug 2018 11:33:23 +0200
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Igor,
> 
> On 07/18/2018 03:00 PM, Igor Mammedov wrote:
> [...]
> >>>
> >>> I think Igor wants one contiguous region for RAM, where additional
> >>> space can be reserved for hotplugging.    
> >> This is not compliant with 2012 ARM white paper, although I don't really
> >> know if this document truly is a reference (did not get any reply).  
> > It's up to QEMU to pick the layout. If maxmem is small (up to 256GB) we
> > could accommodate the legacy requirement and put a single device_memory
> > region in the 1GB-256GB GPA gap; if it's more we can move the whole
> > device_memory region to 2TB, 8TB, ... That keeps things manageable for us
> > and fits the specs (if such exist).
> > We should make selection of the next RAM base deterministic if possible when
> > the layout changes due to maxram size or IOVA, so that we won't need compat
> > knobs/checks to keep the machine migratable.
> Sorry for the delay. I was out of the office those past weeks.
> 
> OK understood. Your preferred approach is to have a contiguous memory
> region (initial + hotplug). So this depends on the FW capability to
> support flexible RAM base. Let's see how this dependency gets resolved.
I think Drew has already had a look at the FW side of the issue and has
a prototype to work with.
Once he's back in the office he plans to work on upstreaming the EDK
and QEMU parts.
 
> This series does not bump the non-hotpluggable memory region limit,
> which is still limited to 255GB. The only way to add more memory is
> through PCDIMM or NVDIMM (max 2TB atm). To do so you need to add the
> ,maxmem and ,slots options, which need to be on both source and dest,
> right, plus the PCDIMM/NVDIMM device option lines? Also the series
> checks that the destination has at least the same IPA range capability
> as the source, which determines whether the requested device_memory
> size can be accommodated. At the moment I fail to see what other
> compat knobs I must be prepared to handle.
it looks the same to me.

We might use the presence of the slots/maxmem options as a knob to switch
to a new all-DIMM layout (initial + hotplug) with a floating RAM base.
That way guests/fw that are designed to work with a fixed RAM base will
work just fine by default, and guests/fw that need mem hotplug, large RAM
or VFIO holes will use the floating RAM base.
Does it seem reasonable?
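
Something along these lines (sketch only, names made up, not actual code
from the series):

    /* legacy fixed 1GB RAM base unless hotplug space was asked for */
    static bool virt_use_floating_ram_base(MachineState *ms)
    {
        return ms->ram_slots || ms->maxram_size > ms->ram_size;
    }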


> Thanks
> 
> Eric
> > 
> > [...]
> >
Eric Auger Aug. 9, 2018, 9:54 a.m. UTC | #21
Hi Igor,
On 08/09/2018 10:45 AM, Igor Mammedov wrote:
> On Wed, 8 Aug 2018 11:33:23 +0200
> Auger Eric <eric.auger@redhat.com> wrote:
> 
>> Hi Igor,
>>
>> On 07/18/2018 03:00 PM, Igor Mammedov wrote:
>> [...]
>>>>>
>>>>> I think Igor wants one contiguous region for RAM, where additional
>>>>> space can be reserved for hotplugging.    
>>>> This is not compliant with 2012 ARM white paper, although I don't really
>>>> know if this document truly is a reference (did not get any reply).  
>>> It's up to QEMU to pick the layout. If maxmem is small (up to 256GB) we
>>> could accommodate the legacy requirement and put a single device_memory
>>> region in the 1GB-256GB GPA gap; if it's more we can move the whole
>>> device_memory region to 2TB, 8TB, ... That keeps things manageable for us
>>> and fits the specs (if such exist).
>>> We should make selection of the next RAM base deterministic if possible when
>>> the layout changes due to maxram size or IOVA, so that we won't need compat
>>> knobs/checks to keep the machine migratable.
>> Sorry for the delay. I was out of the office those past weeks.
>>
>> OK understood. Your preferred approach is to have a contiguous memory
>> region (initial + hotplug). So this depends on the FW capability to
>> support flexible RAM base. Let's see how this dependency gets resolved.
> I think Drew has already had a look at the FW side of the issue and has
> a prototype to work with.
> Once he's back in the office he plans to work on upstreaming the EDK
> and QEMU parts.
>  
>> This series does not bump the non-hotpluggable memory region limit,
>> which is still limited to 255GB. The only way to add more memory is
>> through PCDIMM or NVDIMM (max 2TB atm). To do so you need to add the
>> ,maxmem and ,slots options, which need to be on both source and dest,
>> right, plus the PCDIMM/NVDIMM device option lines? Also the series
>> checks that the destination has at least the same IPA range capability
>> as the source, which determines whether the requested device_memory
>> size can be accommodated. At the moment I fail to see what other
>> compat knobs I must be prepared to handle.
> it looks the same to me.
> 
> We might use the presence of the slots/maxmem options as a knob to switch
> to a new all-DIMM layout (initial + hotplug) with a floating RAM base.
> That way guests/fw that are designed to work with a fixed RAM base will
> work just fine by default, and guests/fw that need mem hotplug, large RAM
> or VFIO holes will use the floating RAM base.
> Does it seem reasonable?

Yep, personally I don't have a strong opinion regarding using a single
contiguous RAM range or separate ones. I had the impression that both
were feasible, but I also understand the concern about the potential
migration compat issues you may have encountered in the past on the pc
machine. As long as Peter is OK with it we can investigate the floating
RAM base solution, and I will work closely with Drew to rebase this work.

Thanks

Eric
> 
> 
>> Thanks
>>
>> Eric
>>>
>>> [...]
>>>   
> 
>
diff mbox series

Patch

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 5a4d0bf..6fefb78 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -59,6 +59,7 @@ 
 #include "qapi/visitor.h"
 #include "standard-headers/linux/input.h"
 #include "hw/arm/smmuv3.h"
+#include "hw/acpi/acpi.h"
 
 #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
     static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
@@ -94,34 +95,25 @@ 
 
 #define PLATFORM_BUS_NUM_IRQS 64
 
-/* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this means
- * RAM can go up to the 256GB mark, leaving 256GB of the physical
- * address space unallocated and free for future use between 256G and 512G.
- * If we need to provide more RAM to VMs in the future then we need to:
- *  * allocate a second bank of RAM starting at 2TB and working up
- *  * fix the DT and ACPI table generation code in QEMU to correctly
- *    report two split lumps of RAM to the guest
- *  * fix KVM in the host kernel to allow guests with >40 bit address spaces
- * (We don't want to fill all the way up to 512GB with RAM because
- * we might want it for non-RAM purposes later. Conversely it seems
- * reasonable to assume that anybody configuring a VM with a quarter
- * of a terabyte of RAM will be doing it on a host with more than a
- * terabyte of physical address space.)
- */
-#define RAMLIMIT_GB 255
-#define RAMLIMIT_BYTES (RAMLIMIT_GB * 1024ULL * 1024 * 1024)
+#define SZ_64K 0x10000
+#define SZ_1G (1024ULL * 1024 * 1024)
 
 /* Addresses and sizes of our components.
- * 0..128MB is space for a flash device so we can run bootrom code such as UEFI.
- * 128MB..256MB is used for miscellaneous device I/O.
- * 256MB..1GB is reserved for possible future PCI support (ie where the
- * PCI memory window will go if we add a PCI host controller).
- * 1GB and up is RAM (which may happily spill over into the
- * high memory region beyond 4GB).
- * This represents a compromise between how much RAM can be given to
- * a 32 bit VM and leaving space for expansion and in particular for PCI.
- * Note that devices should generally be placed at multiples of 0x10000,
+ * 0..128MB is space for a flash device so we can run bootrom code such as UEFI,
+ * 128MB..256MB is used for miscellaneous device I/O,
+ * 256MB..1GB is used for PCI host controller,
+ * 1GB..256GB is RAM (not hotpluggable),
+ * 256GB..512GB: is left for device I/O (non RAM purpose),
+ * 512GB..1TB: high mem PCI MMIO region,
+ * 2TB..4TB is used for hot-pluggable DIMM (assumes 42b GPA is supported).
+ *
+ * Note that IO devices should generally be placed at multiples of 0x10000,
  * to accommodate guests using 64K pages.
+ *
+ * Conversely it seems reasonable to assume that anybody configuring a VM
+ * with a quarter of a terabyte of RAM will be doing it on a host with more
+ * than a terabyte of physical address space.)
+ *
  */
 static const MemMapEntry a15memmap[] = {
     /* Space up to 0x8000000 is reserved for a boot ROM */
@@ -148,12 +140,13 @@  static const MemMapEntry a15memmap[] = {
     [VIRT_PCIE_MMIO] =          { 0x10000000, 0x2eff0000 },
     [VIRT_PCIE_PIO] =           { 0x3eff0000, 0x00010000 },
     [VIRT_PCIE_ECAM] =          { 0x3f000000, 0x01000000 },
-    [VIRT_MEM] =                { 0x40000000, RAMLIMIT_BYTES },
+    [VIRT_MEM] =                { SZ_1G , 255 * SZ_1G },
     /* Additional 64 MB redist region (can contain up to 512 redistributors) */
     [VIRT_GIC_REDIST2] =        { 0x4000000000ULL, 0x4000000 },
     [VIRT_PCIE_ECAM_HIGH] =     { 0x4010000000ULL, 0x10000000 },
     /* Second PCIe window, 512GB wide at the 512GB boundary */
-    [VIRT_PCIE_MMIO_HIGH] =   { 0x8000000000ULL, 0x8000000000ULL },
+    [VIRT_PCIE_MMIO_HIGH] =     { 512 * SZ_1G, 512 * SZ_1G },
+    [VIRT_HOTPLUG_MEM] =        { 2048 * SZ_1G, 2048 * SZ_1G },
 };
 
 static const int a15irqmap[] = {
@@ -1223,6 +1216,58 @@  static void create_secure_ram(VirtMachineState *vms,
     g_free(nodename);
 }
 
+static void create_device_memory(VirtMachineState *vms, MemoryRegion *sysmem)
+{
+    MachineState *ms = MACHINE(vms);
+    uint64_t device_memory_size;
+    uint64_t align = SZ_64K;
+
+    /* always allocate the device memory information */
+    ms->device_memory = g_malloc0(sizeof(*ms->device_memory));
+
+    if (vms->max_vm_phys_shift < 42) {
+        /* device memory starts at 2TB whereas this VM supports less than
+         * 2TB GPA */
+        if (ms->maxram_size > ms->ram_size || ms->ram_slots) {
+            MachineClass *mc = MACHINE_GET_CLASS(ms);
+
+            error_report("\"-memory 'slots|maxmem'\" is not supported by %s "
+                         "since KVM does not support more than 41b IPA",
+                         mc->name);
+            exit(EXIT_FAILURE);
+        }
+        return;
+    }
+
+    if (ms->ram_slots > ACPI_MAX_RAM_SLOTS) {
+        error_report("unsupported number of memory slots: %"PRIu64,
+                     ms->ram_slots);
+        exit(EXIT_FAILURE);
+    }
+
+    if (QEMU_ALIGN_UP(ms->maxram_size, align) != ms->maxram_size) {
+        error_report("maximum memory size must be aligned to multiple of 0x%"
+                     PRIx64, align);
+            exit(EXIT_FAILURE);
+    }
+
+    ms->device_memory->base = vms->memmap[VIRT_HOTPLUG_MEM].base;
+    device_memory_size = ms->maxram_size - ms->ram_size;
+
+    if (device_memory_size > vms->memmap[VIRT_HOTPLUG_MEM].size) {
+        error_report("unsupported amount of maximum memory: " RAM_ADDR_FMT,
+                         ms->maxram_size);
+        exit(EXIT_FAILURE);
+    }
+
+    memory_region_init(&ms->device_memory->mr, OBJECT(vms),
+                       "device-memory", device_memory_size);
+    memory_region_add_subregion(sysmem, ms->device_memory->base,
+                                &ms->device_memory->mr);
+    vms->bootinfo.device_memory_start = ms->device_memory->base;
+    vms->bootinfo.device_memory_size = device_memory_size;
+}
+
 static void *machvirt_dtb(const struct arm_boot_info *binfo, int *fdt_size)
 {
     const VirtMachineState *board = container_of(binfo, VirtMachineState,
@@ -1430,7 +1475,8 @@  static void machvirt_init(MachineState *machine)
     vms->smp_cpus = smp_cpus;
 
     if (machine->ram_size > vms->memmap[VIRT_MEM].size) {
-        error_report("mach-virt: cannot model more than %dGB RAM", RAMLIMIT_GB);
+        error_report("mach-virt: cannot model more than %dGB RAM",
+                     (int)(vms->memmap[VIRT_MEM].size / SZ_1G));
         exit(1);
     }
 
@@ -1525,6 +1571,8 @@  static void machvirt_init(MachineState *machine)
                                          machine->ram_size);
     memory_region_add_subregion(sysmem, vms->memmap[VIRT_MEM].base, ram);
 
+    create_device_memory(vms, sysmem);
+
     create_flash(vms, sysmem, secure_sysmem ? secure_sysmem : sysmem);
 
     create_gic(vms, pic);
diff --git a/include/hw/arm/arm.h b/include/hw/arm/arm.h
index ffed392..76269e6 100644
--- a/include/hw/arm/arm.h
+++ b/include/hw/arm/arm.h
@@ -116,6 +116,8 @@  struct arm_boot_info {
     bool secure_board_setup;
 
     arm_endianness endianness;
+    hwaddr device_memory_start;
+    hwaddr device_memory_size;
 };
 
 /**
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index 91f6de2..173938d 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -78,6 +78,7 @@  enum {
     VIRT_GPIO,
     VIRT_SECURE_UART,
     VIRT_SECURE_MEM,
+    VIRT_HOTPLUG_MEM,
 };
 
 typedef enum VirtIOMMUType {