diff mbox

[2/5] exec: qemu_ram_alloc_device, qemu_ram_resize

Message ID 1416254843-16859-3-git-send-email-mst@redhat.com
State New
Headers show

Commit Message

Michael S. Tsirkin Nov. 17, 2014, 8:08 p.m. UTC
Add API to manage on-device RAM.
This looks just like regular RAM from migration POV,
but has two special properties internally:

    - it is never exposed to guest
    - block is sized on migration, making it easier to extend
      without breaking migration compatibility or wasting
      virtual memory
    - callers must specify an upper bound on size

Device is notified on resize, so it can adjust if necessary.

qemu_ram_alloc_device allocates this memory, qemu_ram_resize resizes it.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/exec/cpu-all.h  |   8 +++-
 include/exec/ram_addr.h |   7 +++
 exec.c                  | 113 +++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 115 insertions(+), 13 deletions(-)

Comments

Michael S. Tsirkin Nov. 17, 2014, 8:15 p.m. UTC | #1
On Mon, Nov 17, 2014 at 10:08:53PM +0200, Michael S. Tsirkin wrote:
> Add API to manage on-device RAM.
> This looks just like regular RAM from migration POV,
> but has two special properties internally:
> 
>     - it is never exposed to guest
>     - block is sized on migration, making it easier to extend
>       without breaking migration compatibility or wasting
>       virtual memory
>     - callers must specify an upper bound on size
> 
> Device is notified on resize, so it can adjust if necessary.
> 
> qemu_ram_alloc_device allocates this memory, qemu_ram_resize resizes it.
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

Minor clarification: the need to supply max size helps
simplify code, but it's also a security feature:
the next patch uses that to validate incoming stream,
preventing DOS attacks by making qemu allocate
huge amounts of RAM.

> ---
>  include/exec/cpu-all.h  |   8 +++-
>  include/exec/ram_addr.h |   7 +++
>  exec.c                  | 113 +++++++++++++++++++++++++++++++++++++++++++-----
>  3 files changed, 115 insertions(+), 13 deletions(-)
> 
> diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
> index 62f5581..26eb9b2 100644
> --- a/include/exec/cpu-all.h
> +++ b/include/exec/cpu-all.h
> @@ -299,11 +299,15 @@ CPUArchState *cpu_copy(CPUArchState *env);
>  
>  /* memory API */
>  
> -typedef struct RAMBlock {
> +typedef struct RAMBlock RAMBlock;
> +
> +struct RAMBlock {
>      struct MemoryRegion *mr;
>      uint8_t *host;
>      ram_addr_t offset;
>      ram_addr_t length;
> +    ram_addr_t max_length;
> +    void (*resized)(const char*, uint64_t length, void *host);
>      uint32_t flags;
>      char idstr[256];
>      /* Reads can take either the iothread or the ramlist lock.
> @@ -311,7 +315,7 @@ typedef struct RAMBlock {
>       */
>      QTAILQ_ENTRY(RAMBlock) next;
>      int fd;
> -} RAMBlock;
> +};
>  
>  static inline void *ramblock_ptr(RAMBlock *block, ram_addr_t offset)
>  {
> diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
> index d7e5238..72ab12b 100644
> --- a/include/exec/ram_addr.h
> +++ b/include/exec/ram_addr.h
> @@ -28,12 +28,19 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
>  ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
>                                     MemoryRegion *mr, Error **errp);
>  ram_addr_t qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp);
> +ram_addr_t qemu_ram_alloc_device(ram_addr_t size, ram_addr_t max_size,
> +                                 void (*resized)(const char*,
> +                                                 uint64_t length,
> +                                                 void *host),
> +                                 MemoryRegion *mr, Error **errp);
>  int qemu_get_ram_fd(ram_addr_t addr);
>  void *qemu_get_ram_block_host_ptr(ram_addr_t addr);
>  void *qemu_get_ram_ptr(ram_addr_t addr);
>  void qemu_ram_free(ram_addr_t addr);
>  void qemu_ram_free_from_ptr(ram_addr_t addr);
>  
> +int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp);
> +
>  static inline bool cpu_physical_memory_get_dirty(ram_addr_t start,
>                                                   ram_addr_t length,
>                                                   unsigned client)
> diff --git a/exec.c b/exec.c
> index 9648669..a177816 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -75,6 +75,11 @@ static MemoryRegion io_mem_unassigned;
>  /* RAM is mmap-ed with MAP_SHARED */
>  #define RAM_SHARED     (1 << 1)
>  
> +/* On-device RAM allocated with g_malloc: supports realloc,
> + * not accessible to vcpu on kvm.
> + */
> +#define RAM_DEVICE     (1 << 2)
> +
>  #endif
>  
>  struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
> @@ -1186,7 +1191,7 @@ static ram_addr_t find_ram_offset(ram_addr_t size)
>      QTAILQ_FOREACH(block, &ram_list.blocks, next) {
>          ram_addr_t end, next = RAM_ADDR_MAX;
>  
> -        end = block->offset + block->length;
> +        end = block->offset + block->max_length;
>  
>          QTAILQ_FOREACH(next_block, &ram_list.blocks, next) {
>              if (next_block->offset >= end) {
> @@ -1214,7 +1219,7 @@ ram_addr_t last_ram_offset(void)
>      ram_addr_t last = 0;
>  
>      QTAILQ_FOREACH(block, &ram_list.blocks, next)
> -        last = MAX(last, block->offset + block->length);
> +        last = MAX(last, block->offset + block->max_length);
>  
>      return last;
>  }
> @@ -1296,6 +1301,50 @@ static int memory_try_enable_merging(void *addr, size_t len)
>      return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
>  }
>  
> +int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp)
> +{
> +    RAMBlock *block = find_ram_block(base);
> +
> +    assert(block);
> +
> +    if (block->length == newsize) {
> +        return 0;
> +    }
> +
> +    if (!(block->flags & RAM_DEVICE)) {
> +        error_setg_errno(errp, EINVAL,
> +                         "Length mismatch: %s: 0x" RAM_ADDR_FMT
> +                         " in != 0x" RAM_ADDR_FMT, block->idstr,
> +                         newsize, block->length);
> +        return -EINVAL;
> +    }
> +
> +    if (block->max_length < newsize) {
> +        error_setg_errno(errp, EINVAL,
> +                         "Length too large: %s: 0x" RAM_ADDR_FMT
> +                         " > 0x" RAM_ADDR_FMT, block->idstr,
> +                         newsize, block->max_length);
> +        return -EINVAL;
> +    }
> +
> +    block->host = g_realloc(block->host, newsize);
> +    if (!block->host) {
> +        error_setg_errno(errp, errno,
> +                         "cannot allocate guest memory '%s'",
> +                         memory_region_name(block->mr));
> +        return -ENOMEM;
> +    }
> +
> +    cpu_physical_memory_clear_dirty_range_nocode(block->offset, block->length);
> +    block->length = newsize;
> +    memset(block->host, 0, block->length);
> +    cpu_physical_memory_set_dirty_range_nocode(block->offset, block->length);
> +    if (block->resized) {
> +        block->resized(block->idstr, newsize, block->host);
> +    }
> +    return 0;
> +}
> +
>  static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
>  {
>      RAMBlock *block;
> @@ -1308,7 +1357,16 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
>      new_block->offset = find_ram_offset(new_block->length);
>  
>      if (!new_block->host) {
> -        if (xen_enabled()) {
> +        if (new_block->flags & RAM_DEVICE) {
> +            new_block->host = g_malloc0(new_block->length);
> +            if (!new_block->host) {
> +                error_setg_errno(errp, errno,
> +                                 "cannot allocate guest memory '%s'",
> +                                 memory_region_name(new_block->mr));
> +                qemu_mutex_unlock_ramlist();
> +                return -1;
> +            }
> +        } else if (xen_enabled()) {
>              xen_ram_alloc(new_block->offset, new_block->length, new_block->mr);
>          } else {
>              new_block->host = phys_mem_alloc(new_block->length,
> @@ -1352,12 +1410,14 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
>      }
>      cpu_physical_memory_set_dirty_range(new_block->offset, new_block->length);
>  
> -    qemu_ram_setup_dump(new_block->host, new_block->length);
> -    qemu_madvise(new_block->host, new_block->length, QEMU_MADV_HUGEPAGE);
> -    qemu_madvise(new_block->host, new_block->length, QEMU_MADV_DONTFORK);
> +    if (!(new_block->flags & RAM_DEVICE)) {
> +        qemu_ram_setup_dump(new_block->host, new_block->length);
> +        qemu_madvise(new_block->host, new_block->length, QEMU_MADV_HUGEPAGE);
> +        qemu_madvise(new_block->host, new_block->length, QEMU_MADV_DONTFORK);
>  
> -    if (kvm_enabled()) {
> -        kvm_setup_guest_memory(new_block->host, new_block->length);
> +        if (kvm_enabled()) {
> +            kvm_setup_guest_memory(new_block->host, new_block->length);
> +        }
>      }
>  
>      return new_block->offset;
> @@ -1392,6 +1452,7 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
>      new_block = g_malloc0(sizeof(*new_block));
>      new_block->mr = mr;
>      new_block->length = size;
> +    new_block->max_length = size;
>      new_block->flags = share ? RAM_SHARED : 0;
>      new_block->host = file_ram_alloc(new_block, size,
>                                       mem_path, errp);
> @@ -1410,7 +1471,12 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
>  }
>  #endif
>  
> -ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
> +static
> +ram_addr_t qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
> +                                   void (*resized)(const char*,
> +                                                   uint64_t length,
> +                                                   void *host),
> +                                   void *host, bool device,
>                                     MemoryRegion *mr, Error **errp)
>  {
>      RAMBlock *new_block;
> @@ -1418,14 +1484,21 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
>      Error *local_err = NULL;
>  
>      size = TARGET_PAGE_ALIGN(size);
> +    max_size = TARGET_PAGE_ALIGN(max_size);
>      new_block = g_malloc0(sizeof(*new_block));
>      new_block->mr = mr;
> +    new_block->resized = resized;
>      new_block->length = size;
> +    new_block->max_length = max_size;
> +    assert(max_size >= size);
>      new_block->fd = -1;
>      new_block->host = host;
>      if (host) {
>          new_block->flags |= RAM_PREALLOC;
>      }
> +    if (device) {
> +        new_block->flags |= RAM_DEVICE;
> +    }
>      addr = ram_block_add(new_block, &local_err);
>      if (local_err) {
>          g_free(new_block);
> @@ -1435,9 +1508,24 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
>      return addr;
>  }
>  
> +ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
> +                                   MemoryRegion *mr, Error **errp)
> +{
> +    return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
> +}
> +
>  ram_addr_t qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
>  {
> -    return qemu_ram_alloc_from_ptr(size, NULL, mr, errp);
> +    return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
> +}
> +
> +ram_addr_t qemu_ram_alloc_device(ram_addr_t size, ram_addr_t maxsz,
> +                                   void (*resized)(const char*,
> +                                                   uint64_t length,
> +                                                   void *host),
> +                                   MemoryRegion *mr, Error **errp)
> +{
> +    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
>  }
>  
>  void qemu_ram_free_from_ptr(ram_addr_t addr)
> @@ -1471,6 +1559,8 @@ void qemu_ram_free(ram_addr_t addr)
>              ram_list.version++;
>              if (block->flags & RAM_PREALLOC) {
>                  ;
> +            } else if (block->flags & RAM_DEVICE) {
> +                g_free(block->host);
>              } else if (xen_enabled()) {
>                  xen_invalidate_map_cache_entry(block->host);
>  #ifndef _WIN32
> @@ -1501,7 +1591,8 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
>          offset = addr - block->offset;
>          if (offset < block->length) {
>              vaddr = ramblock_ptr(block, offset);
> -            if (block->flags & RAM_PREALLOC) {
> +            if (block->flags & RAM_PREALLOC ||
> +                block->flags & RAM_DEVICE) {
>                  ;
>              } else if (xen_enabled()) {
>                  abort();
> -- 
> MST
>
Paolo Bonzini Nov. 18, 2014, 6:03 a.m. UTC | #2
On 17/11/2014 21:08, Michael S. Tsirkin wrote:
> Add API to manage on-device RAM.
> This looks just like regular RAM from migration POV,
> but has two special properties internally:
> 
>     - block is sized on migration, making it easier to extend
>       without breaking migration compatibility or wasting
>       virtual memory
>     - callers must specify an upper bound on size

Why should on-device RAM have this property, or why should this property
be interesting for on-device RAM (as opposed to generic "we are using
MemoryRegions internally and we want them resized")?

I admit the patches look clean, but I would prefer to have some changes
to the API and I dislike introducing a worse API just because we are so
close to release.  For example, the resized callback should probably
receive a MemoryRegion, not a host/length pair, or even better there
could be a NotifierList per RAMBlock.

Also, I am afraid that this design could make it easier to introduce
backwards-incompatible changes.  I very much prefer to have
user-controlled ACPI information (coming from the command-line)
byte-for-byte identical for a given machine type.  Patches for that have
been on the list for almost two months, and it's not nice.

Paolo
Michael S. Tsirkin Nov. 18, 2014, 7:49 a.m. UTC | #3
On Tue, Nov 18, 2014 at 07:03:58AM +0100, Paolo Bonzini wrote:
> 
> 
> On 17/11/2014 21:08, Michael S. Tsirkin wrote:
> > Add API to manage on-device RAM.
> > This looks just like regular RAM from migration POV,
> > but has two special properties internally:
> > 
> >     - block is sized on migration, making it easier to extend
> >       without breaking migration compatibility or wasting
> >       virtual memory
> >     - callers must specify an upper bound on size
> 
> Why should on-device RAM have this property, or why should this property
> be interesting for on-device RAM (as opposed to generic "we are using
> MemoryRegions internally and we want them resized")?

I guess it's just a question of terminology.
internally == on device

> I admit the patches look clean, but I would prefer to have some changes
> to the API and I dislike introducing a worse API just because we are so
> close to release.

Well, please list the issues - maybe they are easy to resolve
even close to release.

>  For example, the resized callback should probably
> receive a MemoryRegion, not a host/length pair, or even better there
> could be a NotifierList per RAMBlock.

I will do that quickly, that's easy.
Any more changes?

> Also, I am afraid that this design could make it easier to introduce
> backwards-incompatible changes.


Well the point is exactly to make it easy to make *compatible*
changes.

As I mentioned in the cover letter, it's not just ACPI.
For example, we now change boot index dynamically.
People using large fw cfg blobs, e.g. -initrd, would benefit from
ability to change the blob dynamically.
There could be other examples.

>  I very much prefer to have
> user-controlled ACPI information (coming from the command-line)
> byte-for-byte identical for a given machine type.  Patches for that have
> been on the list for almost two months, and it's not nice.
> 
> Paolo

I guess we just disagree on whether these patches will effectively achieve
this goal.  For example, some people want to rewrite iasl bits,
generating everything in C. This will affect static bits too.
I don't want to make every single change in code conditional
on a machine type.
Juan Quintela Nov. 19, 2014, 9:19 a.m. UTC | #4
"Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Tue, Nov 18, 2014 at 07:03:58AM +0100, Paolo Bonzini wrote:
>> 
>> 
>> On 17/11/2014 21:08, Michael S. Tsirkin wrote:
>> > Add API to manage on-device RAM.
>> > This looks just like regular RAM from migration POV,
>> > but has two special properties internally:
>> > 
>> >     - block is sized on migration, making it easier to extend
>> >       without breaking migration compatibility or wasting
>> >       virtual memory
>> >     - callers must specify an upper bound on size
>> 


>> Also, I am afraid that this design could make it easier to introduce
>> backwards-incompatible changes.
>
>
> Well the point is exactly to make it easy to make *compatible*
> changes.
>
> As I mentioned in the cover letter, it's not just ACPI.
> For example, we now change boot index dynamically.
> People using large fw cfg blobs, e.g. -initrd, would benefit from
> ability to change the blob dynamically.
> There could be other examples.

changing the size of the initrd, on the fly and wanting to migrate?  Is
that a real use case?  One that we should really care?

>>  I very much prefer to have
>> user-controlled ACPI information (coming from the command-line)
>> byte-for-byte identical for a given machine type.  Patches for that have
>> been on the list for almost two months, and it's not nice.
>> 
>> Paolo
>
> I guess we just disagree on whether these patches will effectively achieve
> this goal.  For example, some people want to rewrite iasl bits,
> generating everything in C. This will affect static bits too.
> I don't want to make every single change in code conditional
> on a machine type.

You can't migrate with a different BIOS on destination, period.  That is
what is making this whole issue complicated.  We have two clear options:

a- require BIOS & memory regions to be exactly the same in both sides.
   No need to add compat machinery.
b- trying to accomodate any potential change that could appear and use
   the same BIOS.

IMHO, b) is just asking for trouble.  Being able to go from random
changes to random changes look strange.

Just think about it for a second.  We are sending more data for some
regions that it was allocated.  And we just grow the regions and expect
that everything is going to be ok.  It is me, or this goes against every
security discipline that I can think of?

Later, Juan.
Michael S. Tsirkin Nov. 19, 2014, 9:33 a.m. UTC | #5
On Wed, Nov 19, 2014 at 10:19:22AM +0100, Juan Quintela wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Tue, Nov 18, 2014 at 07:03:58AM +0100, Paolo Bonzini wrote:
> >> 
> >> 
> >> On 17/11/2014 21:08, Michael S. Tsirkin wrote:
> >> > Add API to manage on-device RAM.
> >> > This looks just like regular RAM from migration POV,
> >> > but has two special properties internally:
> >> > 
> >> >     - block is sized on migration, making it easier to extend
> >> >       without breaking migration compatibility or wasting
> >> >       virtual memory
> >> >     - callers must specify an upper bound on size
> >> 
> 
> 
> >> Also, I am afraid that this design could make it easier to introduce
> >> backwards-incompatible changes.
> >
> >
> > Well the point is exactly to make it easy to make *compatible*
> > changes.
> >
> > As I mentioned in the cover letter, it's not just ACPI.
> > For example, we now change boot index dynamically.
> > People using large fw cfg blobs, e.g. -initrd, would benefit from
> > ability to change the blob dynamically.
> > There could be other examples.
> 
> changing the size of the initrd, on the fly and wanting to migrate?  Is
> that a real use case?  One that we should really care?

I'm not sure.

At the moment one can swap kernels by doing halt in guest and
restarting with the new one.

If we wanted to allow reboot in guest to bring a new kernel instead,
that would be one way to implement it.

I was merely pointing out that the capability might find other uses.


> >>  I very much prefer to have
> >> user-controlled ACPI information (coming from the command-line)
> >> byte-for-byte identical for a given machine type.  Patches for that have
> >> been on the list for almost two months, and it's not nice.
> >> 
> >> Paolo
> >
> > I guess we just disagree on whether these patches will effectively achieve
> > this goal.  For example, some people want to rewrite iasl bits,
> > generating everything in C. This will affect static bits too.
> > I don't want to make every single change in code conditional
> > on a machine type.
> 
> You can't migrate with a different BIOS on destination, period.

This claim is very wrong.
This would make is impossible to change BIOS bus without breaking
migration.  Look at history of qemu, we change BIOS every release.


>  That is
> what is making this whole issue complicated.  We have two clear options:
> 
> a- require BIOS & memory regions to be exactly the same in both sides.
>    No need to add compat machinery.
> b- trying to accomodate any potential change that could appear and use
>    the same BIOS.
> 
> IMHO, b) is just asking for trouble.  Being able to go from random
> changes to random changes look strange.

Yes, it is hard to support.
But it's a required feature, and in fact, it's an existing one.

> Just think about it for a second.  We are sending more data for some
> regions that it was allocated.  And we just grow the regions and expect
> that everything is going to be ok.  It is me, or this goes against every
> security discipline that I can think of?
> 
> Later, Juan.

We have many devices that just get N from stream, do malloc(N), then read
data from stream into it.
You think it's unsafe? Go ahead and fix them all.

However, my patch does address your concern: callers specify the upper
limit on the region size.
Trying to migrate in a 1Gbyte region
Juan Quintela Nov. 19, 2014, 10:11 a.m. UTC | #6
"Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Wed, Nov 19, 2014 at 10:19:22AM +0100, Juan Quintela wrote:


>> >>  I very much prefer to have
>> >> user-controlled ACPI information (coming from the command-line)
>> >> byte-for-byte identical for a given machine type.  Patches for that have
>> >> been on the list for almost two months, and it's not nice.
>> >> 
>> >> Paolo
>> >
>> > I guess we just disagree on whether these patches will effectively achieve
>> > this goal.  For example, some people want to rewrite iasl bits,
>> > generating everything in C. This will affect static bits too.
>> > I don't want to make every single change in code conditional
>> > on a machine type.
>> 
>> You can't migrate with a different BIOS on destination, period.
>
> This claim is very wrong.
> This would make is impossible to change BIOS bus without breaking
> migration.  Look at history of qemu, we change BIOS every release.

And we should do:

qemu -M pc-2.0 -L BIOS-2.0.bin

And that way for every version and every bios.  If they are the same,
a symlink does.  If they are not, different file.  And we would not have
this problems.

I fully agree that we have problems with BIOS every relase.  What we
don't agree is _what_ is the best way to fix the issue.


>> IMHO, b) is just asking for trouble.  Being able to go from random
>> changes to random changes look strange.
>
> Yes, it is hard to support.
> But it's a required feature, and in fact, it's an existing one.

>> Just think about it for a second.  We are sending more data for some
>> regions that it was allocated.  And we just grow the regions and expect
>> that everything is going to be ok.  It is me, or this goes against every
>> security discipline that I can think of?
>> 
>> Later, Juan.
>
> We have many devices that just get N from stream, do malloc(N), then read
> data from stream into it.
> You think it's unsafe? Go ahead and fix them all.

I am sure it is unsafe.  Fixing them requires incompatible changes, that
is the whole point :-(

> However, my patch does address your concern: callers specify the upper
> limit on the region size.
> Trying to migrate in a 1Gbyte region

Yes and no.  You are assuming that a guest launched with a device ram
size of 256KB receives a 512KB section and it knows what to do with it.

It knows what to do with the 256KB section, with the 512KB answer is
always a "perhaps".  It depends of what is on the extra space.

My solution would be that RAM/ROM sizes are always the same for
migration, so destination would always understand it.

It just forbids migrating between different machine types.  And I think
that is good, not bad.

Later, Juan.
Markus Armbruster Nov. 19, 2014, 10:16 a.m. UTC | #7
"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, Nov 19, 2014 at 10:19:22AM +0100, Juan Quintela wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>> > On Tue, Nov 18, 2014 at 07:03:58AM +0100, Paolo Bonzini wrote:
>> >> 
>> >> 
>> >> On 17/11/2014 21:08, Michael S. Tsirkin wrote:
>> >> > Add API to manage on-device RAM.
>> >> > This looks just like regular RAM from migration POV,
>> >> > but has two special properties internally:
>> >> > 
>> >> >     - block is sized on migration, making it easier to extend
>> >> >       without breaking migration compatibility or wasting
>> >> >       virtual memory
>> >> >     - callers must specify an upper bound on size
>> >> 
>> 
>> 
>> >> Also, I am afraid that this design could make it easier to introduce
>> >> backwards-incompatible changes.
>> >
>> >
>> > Well the point is exactly to make it easy to make *compatible*
>> > changes.
>> >
>> > As I mentioned in the cover letter, it's not just ACPI.
>> > For example, we now change boot index dynamically.
>> > People using large fw cfg blobs, e.g. -initrd, would benefit from
>> > ability to change the blob dynamically.
>> > There could be other examples.
>> 
>> changing the size of the initrd, on the fly and wanting to migrate?  Is
>> that a real use case?  One that we should really care?
>
> I'm not sure.
>
> At the moment one can swap kernels by doing halt in guest and
> restarting with the new one.
>
> If we wanted to allow reboot in guest to bring a new kernel instead,
> that would be one way to implement it.
>
> I was merely pointing out that the capability might find other uses.
>
>
>> >>  I very much prefer to have
>> >> user-controlled ACPI information (coming from the command-line)
>> >> byte-for-byte identical for a given machine type.  Patches for that have
>> >> been on the list for almost two months, and it's not nice.
>> >> 
>> >> Paolo
>> >
>> > I guess we just disagree on whether these patches will effectively achieve
>> > this goal.  For example, some people want to rewrite iasl bits,
>> > generating everything in C. This will affect static bits too.
>> > I don't want to make every single change in code conditional
>> > on a machine type.
>> 
>> You can't migrate with a different BIOS on destination, period.
>
> This claim is very wrong.
> This would make is impossible to change BIOS bus without breaking
> migration.  Look at history of qemu, we change BIOS every release.

Since migration doesn't transport configuration, we require a compatibly
configured target, and that includes identical memory sizes.  RAM size
is explicit and the user's problem.  ROM size is generally implicit, and
we use machine type compatibility machinery to keep it fixed.  BIOS
changes can break migration only when we screw up or forget the
compatibility machinery.  Same as for lots of other stuff.  No big deal,
really, just a consequence of not migrating configuration.

>>  That is
>> what is making this whole issue complicated.  We have two clear options:
>> 
>> a- require BIOS & memory regions to be exactly the same in both sides.
>>    No need to add compat machinery.
>> b- trying to accomodate any potential change that could appear and use
>>    the same BIOS.
>> 
>> IMHO, b) is just asking for trouble.  Being able to go from random
>> changes to random changes look strange.
>
> Yes, it is hard to support.
> But it's a required feature, and in fact, it's an existing one.
>
>> Just think about it for a second.  We are sending more data for some
>> regions that it was allocated.  And we just grow the regions and expect
>> that everything is going to be ok.  It is me, or this goes against every
>> security discipline that I can think of?
>> 
>> Later, Juan.
>
> We have many devices that just get N from stream, do malloc(N), then read
> data from stream into it.
> You think it's unsafe? Go ahead and fix them all.
>
> However, my patch does address your concern: callers specify the upper
> limit on the region size.
> Trying to migrate in a 1Gbyte region

Are you proposing to make incoming migration adjust some or all memory
sizes on the target from "whatever was configured during startup" to
"whatever is configured on the source"?  Possibly with some limitations,
such as "can only adjust downwards"?
Michael S. Tsirkin Nov. 19, 2014, 10:21 a.m. UTC | #8
On Wed, Nov 19, 2014 at 11:11:26AM +0100, Juan Quintela wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Wed, Nov 19, 2014 at 10:19:22AM +0100, Juan Quintela wrote:
> 
> 
> >> >>  I very much prefer to have
> >> >> user-controlled ACPI information (coming from the command-line)
> >> >> byte-for-byte identical for a given machine type.  Patches for that have
> >> >> been on the list for almost two months, and it's not nice.
> >> >> 
> >> >> Paolo
> >> >
> >> > I guess we just disagree on whether these patches will effectively achieve
> >> > this goal.  For example, some people want to rewrite iasl bits,
> >> > generating everything in C. This will affect static bits too.
> >> > I don't want to make every single change in code conditional
> >> > on a machine type.
> >> 
> >> You can't migrate with a different BIOS on destination, period.
> >
> > This claim is very wrong.
> > This would make is impossible to change BIOS bus without breaking
> > migration.  Look at history of qemu, we change BIOS every release.
> 
> And we should do:
> 
> qemu -M pc-2.0 -L BIOS-2.0.bin
> And that way for every version and every bios.  If they are the same,
> a symlink does.  If they are not, different file.  And we would not have
> this problems.

You want to keep old bios around forever, and never fix
bugs in it?  I disagree, quite strongly.


> 
> I fully agree that we have problems with BIOS every relase.  What we
> don't agree is _what_ is the best way to fix the issue.
> 


You don't seem to want to fix them. Your solution is just to keep
bugs around forver.



> >> IMHO, b) is just asking for trouble.  Being able to go from random
> >> changes to random changes look strange.
> >
> > Yes, it is hard to support.
> > But it's a required feature, and in fact, it's an existing one.
> 
> >> Just think about it for a second.  We are sending more data for some
> >> regions that it was allocated.  And we just grow the regions and expect
> >> that everything is going to be ok.  It is me, or this goes against every
> >> security discipline that I can think of?
> >> 
> >> Later, Juan.
> >
> > We have many devices that just get N from stream, do malloc(N), then read
> > data from stream into it.
> > You think it's unsafe? Go ahead and fix them all.
> 
> I am sure it is unsafe.  Fixing them requires incompatible changes, that
> is the whole point :-(

I don't see the point, sorry.  Want to elaborate?

> > However, my patch does address your concern: callers specify the upper
> > limit on the region size.
> > Trying to migrate in a 1Gbyte region
> 
> Yes and no.  You are assuming that a guest launched with a device ram
> size of 256KB receives a 512KB section and it knows what to do with it.
> 
> It knows what to do with the 256KB section, with the 512KB answer is
> always a "perhaps".  It depends of what is on the extra space.
> 
> My solution would be that RAM/ROM sizes are always the same for
> migration, so destination would always understand it.
> 
> It just forbids migrating between different machine types.  And I think
> that is good, not bad.
> 
> Later, Juan.

You basically ask firmware to be perfect, or want us to carry old bugs
around forever.  It makes things simple for migration code, at huge cost
elsewhere, and pain for users.

I don't want to maintain old bugs in ACPI, and same applies to
other firmware.

This is really the whole point of the patchset.
Keep bugs in device ram around until next reboot.
At that point users get new device with non buggy
behaviour.
Michael S. Tsirkin Nov. 19, 2014, 10:30 a.m. UTC | #9
On Wed, Nov 19, 2014 at 11:16:57AM +0100, Markus Armbruster wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Wed, Nov 19, 2014 at 10:19:22AM +0100, Juan Quintela wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >> > On Tue, Nov 18, 2014 at 07:03:58AM +0100, Paolo Bonzini wrote:
> >> >> 
> >> >> 
> >> >> On 17/11/2014 21:08, Michael S. Tsirkin wrote:
> >> >> > Add API to manage on-device RAM.
> >> >> > This looks just like regular RAM from migration POV,
> >> >> > but has two special properties internally:
> >> >> > 
> >> >> >     - block is sized on migration, making it easier to extend
> >> >> >       without breaking migration compatibility or wasting
> >> >> >       virtual memory
> >> >> >     - callers must specify an upper bound on size
> >> >> 
> >> 
> >> 
> >> >> Also, I am afraid that this design could make it easier to introduce
> >> >> backwards-incompatible changes.
> >> >
> >> >
> >> > Well the point is exactly to make it easy to make *compatible*
> >> > changes.
> >> >
> >> > As I mentioned in the cover letter, it's not just ACPI.
> >> > For example, we now change boot index dynamically.
> >> > People using large fw cfg blobs, e.g. -initrd, would benefit from
> >> > ability to change the blob dynamically.
> >> > There could be other examples.
> >> 
> >> changing the size of the initrd, on the fly and wanting to migrate?  Is
> >> that a real use case?  One that we should really care?
> >
> > I'm not sure.
> >
> > At the moment one can swap kernels by doing halt in guest and
> > restarting with the new one.
> >
> > If we wanted to allow reboot in guest to bring a new kernel instead,
> > that would be one way to implement it.
> >
> > I was merely pointing out that the capability might find other uses.
> >
> >
> >> >>  I very much prefer to have
> >> >> user-controlled ACPI information (coming from the command-line)
> >> >> byte-for-byte identical for a given machine type.  Patches for that have
> >> >> been on the list for almost two months, and it's not nice.
> >> >> 
> >> >> Paolo
> >> >
> >> > I guess we just disagree on whether these patches will effectively achieve
> >> > this goal.  For example, some people want to rewrite iasl bits,
> >> > generating everything in C. This will affect static bits too.
> >> > I don't want to make every single change in code conditional
> >> > on a machine type.
> >> 
> >> You can't migrate with a different BIOS on destination, period.
> >
> > This claim is very wrong.
> > This would make is impossible to change BIOS bus without breaking
> > migration.  Look at history of qemu, we change BIOS every release.
> 
> Since migration doesn't transport configuration, we require a compatibly
> configured target, and that includes identical memory sizes.  RAM size
> is explicit and the user's problem.  ROM size is generally implicit, and
> we use machine type compatibility machinery to keep it fixed.  BIOS
> changes can break migration only when we screw up or forget the
> compatibility machinery.  Same as for lots of other stuff.  No big deal,
> really, just a consequence of not migrating configuration.

You don't get to maintain it, so it's no big deal for you.

I see pain every single release and code is becoming spaghetty-like very
quickly.  We thought it would work. It does not.  We do need a solution.

And the pain is completely self-inflicted: we already migrate
all necessary information!
It's just a question of adjusting our datastructures to it.



> >>  That is
> >> what is making this whole issue complicated.  We have two clear options:
> >> 
> >> a- require BIOS & memory regions to be exactly the same in both sides.
> >>    No need to add compat machinery.
> >> b- trying to accomodate any potential change that could appear and use
> >>    the same BIOS.
> >> 
> >> IMHO, b) is just asking for trouble.  Being able to go from random
> >> changes to random changes look strange.
> >
> > Yes, it is hard to support.
> > But it's a required feature, and in fact, it's an existing one.
> >
> >> Just think about it for a second.  We are sending more data for some
> >> regions that it was allocated.  And we just grow the regions and expect
> >> that everything is going to be ok.  It is me, or this goes against every
> >> security discipline that I can think of?
> >> 
> >> Later, Juan.
> >
> > We have many devices that just get N from stream, do malloc(N), then read
> > data from stream into it.
> > You think it's unsafe? Go ahead and fix them all.
> >
> > However, my patch does address your concern: callers specify the upper
> > limit on the region size.
> > Trying to migrate in a 1Gbyte region
> 
> Are you proposing to make incoming migration adjust some or all memory
> sizes on the target from "whatever was configured during startup" to
> "whatever is configured on the source"?

Yes.

At the moment, I only propose this for internal on-device RAM,
for the simple reason that users don't know or care about it.
So migrating it just removes maintainance pain.

It wouldn't be hard to extend it for user-specified RAM,
but I don't know whether that is useful.

> Possibly with some limitations,
> such as "can only adjust downwards"?

Yes.

"Can adjust downwards" is too limiting, since especially downstreams
want two-way migration to work.
So I just have devices specify an upper limit.
Juan Quintela Nov. 19, 2014, 10:45 a.m. UTC | #10
"Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Wed, Nov 19, 2014 at 11:11:26AM +0100, Juan Quintela wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>> > On Wed, Nov 19, 2014 at 10:19:22AM +0100, Juan Quintela wrote:


>> qemu -M pc-2.0 -L BIOS-2.0.bin
>> And that way for every version and every bios.  If they are the same,
>> a symlink does.  If they are not, different file.  And we would not have
>> this problems.
>
> You want to keep old bios around forever, and never fix
> bugs in it?  I disagree, quite strongly.

One thing is fix bugs, and a different one is completely change the way
the tables/data are generated.

About keeping old bios forever, no.  Only while the machine types that
depend on that BIOS are supported.  At the very point that they get not
supported, we can drop it.
>
>> 
>> I fully agree that we have problems with BIOS every relase.  What we
>> don't agree is _what_ is the best way to fix the issue.
>> 
>
>
> You don't seem to want to fix them. Your solution is just to keep
> bugs around forver.

That is unfair.  It is the same that if I answer that your solution is
just paper over the bugs that we have already foound and hope that there
are no more bugs.  Do you think that describes your position?  Mine is
also not described but your statement.

>> > We have many devices that just get N from stream, do malloc(N), then read
>> > data from stream into it.
>> > You think it's unsafe? Go ahead and fix them all.
>> 
>> I am sure it is unsafe.  Fixing them requires incompatible changes, that
>> is the whole point :-(
>
> I don't see the point, sorry.  Want to elaborate?

In general, I don't have specific examples:
- when we have a string buffer, we sent/receive it with:

        VMSTATE_BUFFER(queue.data, PS2State),

We are sending whatever the size of queue.data is on source to whatever
the queue.data is on destination.  We are hopping that they are the
same.

In this case, if sizes are different, we got just a migration stream
that is out of sync.  And if attacker modifies stream,  bad things could happen.
In the case of buffers/arrays (so far it appears that everyplace that
recives a size from the network, it checks that it is small enough).

>
>> > However, my patch does address your concern: callers specify the upper
>> > limit on the region size.
>> > Trying to migrate in a 1Gbyte region
>> 
>> Yes and no.  You are assuming that a guest launched with a device ram
>> size of 256KB receives a 512KB section and it knows what to do with it.
>> 
>> It knows what to do with the 256KB section, with the 512KB answer is
>> always a "perhaps".  It depends of what is on the extra space.
>> 
>> My solution would be that RAM/ROM sizes are always the same for
>> migration, so destination would always understand it.
>> 
>> It just forbids migrating between different machine types.  And I think
>> that is good, not bad.
>> 
>> Later, Juan.
>
> You basically ask firmware to be perfect, or want us to carry old bugs
> around forever.  It makes things simple for migration code, at huge cost
> elsewhere, and pain for users.
>
> I don't want to maintain old bugs in ACPI, and same applies to
> other firmware.
>
> This is really the whole point of the patchset.
> Keep bugs in device ram around until next reboot.
> At that point users get new device with non buggy
> behaviour.

False.

qemu-2.2 -M pc-2.0 -> qemu-2.0 -M pc-2.0

You migrate that way, and you go from "new-good" BIOS to "bad-old" BIOS.

I think the question here is, when we do this migration:

qemu-2.0 -M pc-2.0 -> qemu-2.2 -M pc-2.0

what BIOS should the second qemu use?  the one that existed on qemu-2.0
time or the one that existed on qemu-2.2 time?  You can allow for
bugfixes, but not for big changes like moving where the acpi tables were
generated, etc, etc.

I really think that we should use the BIOS from qemu-2.0 era.  And my
understanding is that you defend that we should use the qemu-2.2 era
BIOS.

I think that is the whole point of the discussion.  Have I at least
framed correctly what the discussion is about?

Later, Juan.
Juan Quintela Nov. 19, 2014, 10:50 a.m. UTC | #11
"Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Wed, Nov 19, 2014 at 11:16:57AM +0100, Markus Armbruster wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:


>> Since migration doesn't transport configuration, we require a compatibly
>> configured target, and that includes identical memory sizes.  RAM size
>> is explicit and the user's problem.  ROM size is generally implicit, and
>> we use machine type compatibility machinery to keep it fixed.  BIOS
>> changes can break migration only when we screw up or forget the
>> compatibility machinery.  Same as for lots of other stuff.  No big deal,
>> really, just a consequence of not migrating configuration.
>
> You don't get to maintain it, so it's no big deal for you.
>
> I see pain every single release and code is becoming spaghetty-like very
> quickly.  We thought it would work. It does not.  We do need a solution.
>
> And the pain is completely self-inflicted: we already migrate
> all necessary information!
> It's just a question of adjusting our datastructures to it.

migration from version foo to version bar.

You have two options here:

- You make source (foo) send the data on the format/sizes that destination
  (bar) wants.
- You make destination (bar) handle whatever source (foo) sends.

You need to put the "spagueti code" in foo or bar.  It needs to be in
one of the two places, because if that code was not needed, we would not
be discussion here,  see?

So, what we are discussing is where is better to put this code.  Emit
the code that destination expects, or make destination handle whatever
is sent.  Amound of mangling need to be basically the same.

Later, Juan.
Michael S. Tsirkin Nov. 19, 2014, 1:28 p.m. UTC | #12
On Wed, Nov 19, 2014 at 11:45:15AM +0100, Juan Quintela wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Wed, Nov 19, 2014 at 11:11:26AM +0100, Juan Quintela wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >> > On Wed, Nov 19, 2014 at 10:19:22AM +0100, Juan Quintela wrote:
> 
> 
> >> qemu -M pc-2.0 -L BIOS-2.0.bin
> >> And that way for every version and every bios.  If they are the same,
> >> a symlink does.  If they are not, different file.  And we would not have
> >> this problems.
> >
> > You want to keep old bios around forever, and never fix
> > bugs in it?  I disagree, quite strongly.
> 
> One thing is fix bugs, and a different one is completely change the way
> the tables/data are generated.

I want the ability to do both without keeping a ton of old code around.

> About keeping old bios forever, no.  Only while the machine types that
> depend on that BIOS are supported.  At the very point that they get not
> supported, we can drop it.

Still too much.
This is unsupportable and is not what we did historically.


> >
> >> 
> >> I fully agree that we have problems with BIOS every relase.  What we
> >> don't agree is _what_ is the best way to fix the issue.
> >> 
> >
> >
> > You don't seem to want to fix them. Your solution is just to keep
> > bugs around forver.
> 
> That is unfair.  It is the same that if I answer that your solution is
> just paper over the bugs that we have already foound and hope that there
> are no more bugs.  Do you think that describes your position?  Mine is
> also not described but your statement.

Then I don't understand. How do you suggest fixing BIOS bugs without
changing BIOS?
People have legitimate reasons to run compat machine types, and
they also need bugs fixed.

> >> > We have many devices that just get N from stream, do malloc(N), then read
> >> > data from stream into it.
> >> > You think it's unsafe? Go ahead and fix them all.
> >> 
> >> I am sure it is unsafe.  Fixing them requires incompatible changes, that
> >> is the whole point :-(
> >
> > I don't see the point, sorry.  Want to elaborate?
> 
> In general, I don't have specific examples:
> - when we have a string buffer, we sent/receive it with:
> 
>         VMSTATE_BUFFER(queue.data, PS2State),
> 
> We are sending whatever the size of queue.data is on source to whatever
> the queue.data is on destination.  We are hopping that they are the
> same.
> 
> In this case, if sizes are different, we got just a migration stream
> that is out of sync.  And if attacker modifies stream,  bad things could happen.
> In the case of buffers/arrays (so far it appears that everyplace that
> recives a size from the network, it checks that it is small enough).
> 
> >
> >> > However, my patch does address your concern: callers specify the upper
> >> > limit on the region size.
> >> > Trying to migrate in a 1Gbyte region
> >> 
> >> Yes and no.  You are assuming that a guest launched with a device ram
> >> size of 256KB receives a 512KB section and it knows what to do with it.
> >> 
> >> It knows what to do with the 256KB section, with the 512KB answer is
> >> always a "perhaps".  It depends of what is on the extra space.
> >> 
> >> My solution would be that RAM/ROM sizes are always the same for
> >> migration, so destination would always understand it.
> >> 
> >> It just forbids migrating between different machine types.  And I think
> >> that is good, not bad.
> >> 
> >> Later, Juan.
> >
> > You basically ask firmware to be perfect, or want us to carry old bugs
> > around forever.  It makes things simple for migration code, at huge cost
> > elsewhere, and pain for users.
> >
> > I don't want to maintain old bugs in ACPI, and same applies to
> > other firmware.
> >
> > This is really the whole point of the patchset.
> > Keep bugs in device ram around until next reboot.
> > At that point users get new device with non buggy
> > behaviour.
> 
> False.
> 
> qemu-2.2 -M pc-2.0 -> qemu-2.0 -M pc-2.0
> 
> You migrate that way, and you go from "new-good" BIOS to "bad-old" BIOS.

So? What is the point?

> I think the question here is, when we do this migration:
> 
> qemu-2.0 -M pc-2.0 -> qemu-2.2 -M pc-2.0
> 
> what BIOS should the second qemu use?  the one that existed on qemu-2.0
> time or the one that existed on qemu-2.2 time?  You can allow for
> bugfixes, but not for big changes like moving where the acpi tables were
> generated, etc, etc.

If you just ship old BIOS, you do not allow for bugfixes.

> I really think that we should use the BIOS from qemu-2.0 era.  And my
> understanding is that you defend that we should use the qemu-2.2 era
> BIOS.

Not only that.  We already do. And we don't intend to change that for 2.2.

> I think that is the whole point of the discussion.  Have I at least
> framed correctly what the discussion is about?
> 
> Later, Juan.

I think so.

Basically you want to freeze all firmware at time of release and never change it
for that machine type.
Correct?

This would be a regression, this is not how we did things in the past.

Real hardware lets users update firmware and so should virtual hardware.
Michael S. Tsirkin Nov. 19, 2014, 1:36 p.m. UTC | #13
On Wed, Nov 19, 2014 at 11:50:28AM +0100, Juan Quintela wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Wed, Nov 19, 2014 at 11:16:57AM +0100, Markus Armbruster wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> 
> >> Since migration doesn't transport configuration, we require a compatibly
> >> configured target, and that includes identical memory sizes.  RAM size
> >> is explicit and the user's problem.  ROM size is generally implicit, and
> >> we use machine type compatibility machinery to keep it fixed.  BIOS
> >> changes can break migration only when we screw up or forget the
> >> compatibility machinery.  Same as for lots of other stuff.  No big deal,
> >> really, just a consequence of not migrating configuration.
> >
> > You don't get to maintain it, so it's no big deal for you.
> >
> > I see pain every single release and code is becoming spaghetty-like very
> > quickly.  We thought it would work. It does not.  We do need a solution.
> >
> > And the pain is completely self-inflicted: we already migrate
> > all necessary information!
> > It's just a question of adjusting our datastructures to it.
> 
> migration from version foo to version bar.
> 
> You have two options here:
> 
> - You make source (foo) send the data on the format/sizes that destination
>   (bar) wants.
> - You make destination (bar) handle whatever source (foo) sends.
> 
> You need to put the "spagueti code" in foo or bar.  It needs to be in
> one of the two places, because if that code was not needed, we would not
> be discussion here,  see?
> 
> So, what we are discussing is where is better to put this code.  Emit
> the code that destination expects, or make destination handle whatever
> is sent.  Amound of mangling need to be basically the same.
> 
> Later, Juan.

This is not what the patch does at all.  There is no special-casing
depending on machine type anywhere. Please review the code and respond
to actual patches.
Paolo Bonzini Nov. 19, 2014, 1:44 p.m. UTC | #14
On 19/11/2014 14:28, Michael S. Tsirkin wrote:
> > 
> > qemu-2.0 -M pc-2.0 -> qemu-2.2 -M pc-2.0
> > 
> > what BIOS should the second qemu use?  the one that existed on qemu-2.0
> > time or the one that existed on qemu-2.2 time?  You can allow for
> > bugfixes, but not for big changes like moving where the acpi tables were
> > generated, etc, etc.
> 
> > I really think that we should use the BIOS from qemu-2.0 era.  And my
> > understanding is that you defend that we should use the qemu-2.2 era
> > BIOS.
> 
> Not only that.  We already do. And we don't intend to change that for 2.2.

Am I missing a part of the discussion?  When we migrate, the second QEMU
uses the BIOS from the first.

So:

qemu-2.0 -M pc-2.0 -> qemu-2.2 -M pc-2.0

   uses 2.0 BIOS

qemu-2.2 -M pc-2.0 -> qemu-2.0 -M pc-2.0

   uses 2.2 BIOS

Both should work, in general.  BIOS is rarely the reason for
incompatibilities.  However, breakage can happen, for example I know
that RHEL7 SeaBIOS does not work on RHEL6.  RHEL6 SeaBIOS works on
RHEL7, but it needs a couple workarounds.

Shipping a separate BIOS for different machine types is unrealistic and
pointless.  It would also be a good terrain for bug reports, unless you
also do things like "forbid creating -device megasas-gen2 on 2.1 because
it was introduced in 2.2".  Remember that libvirt keeps the same machine
type for the whole life of a virtual machine definition, even if other
parts of the hardware (e.g. disks or NICs) change.

Paolo
Juan Quintela Nov. 19, 2014, 1:49 p.m. UTC | #15
"Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Wed, Nov 19, 2014 at 11:45:15AM +0100, Juan Quintela wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>> > On Wed, Nov 19, 2014 at 11:11:26AM +0100, Juan Quintela wrote:
>> >> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>> >> > On Wed, Nov 19, 2014 at 10:19:22AM +0100, Juan Quintela wrote:
>> 
>> 
>> >> qemu -M pc-2.0 -L BIOS-2.0.bin
>> >> And that way for every version and every bios.  If they are the same,
>> >> a symlink does.  If they are not, different file.  And we would not have
>> >> this problems.
>> >
>> > You want to keep old bios around forever, and never fix
>> > bugs in it?  I disagree, quite strongly.
>> 
>> One thing is fix bugs, and a different one is completely change the way
>> the tables/data are generated.
>
> I want the ability to do both without keeping a ton of old code around.

You want.

>> About keeping old bios forever, no.  Only while the machine types that
>> depend on that BIOS are supported.  At the very point that they get not
>> supported, we can drop it.
>
> Still too much.
> This is unsupportable and is not what we did historically.

And we have failed spectacularly.  If what you do don't work
.... changing what you do?


>> That is unfair.  It is the same that if I answer that your solution is
>> just paper over the bugs that we have already foound and hope that there
>> are no more bugs.  Do you think that describes your position?  Mine is
>> also not described but your statement.
>
> Then I don't understand. How do you suggest fixing BIOS bugs without
> changing BIOS?
> People have legitimate reasons to run compat machine types, and
> they also need bugs fixed.

if the change in the BIOS is big enough, they need to change also
machine type.  Is an easy as that.  You should not put a big change  in
BIOS without previous warning to the user.  This is independent of
migration.

Extreme example: You used to have BIOS and now move to UEFI. And don't
want to ship the old BIOS?  You consider that right?  if no, then we are
discussing about where is the limit, not if there is a limit.

>> >
>> > This is really the whole point of the patchset.
>> > Keep bugs in device ram around until next reboot.
>> > At that point users get new device with non buggy
                                             ^^^^^^^^^
>> > behaviour.
     ^^^^^^^^^
>> 
>> False.
>> 
>> qemu-2.2 -M pc-2.0 -> qemu-2.0 -M pc-2.0
>> 
>> You migrate that way, and you go from "new-good" BIOS to "bad-old" BIOS.
>
> So? What is the point?

You got new BIOS, and you got that they don't get the new non-buggy
behaviour.  They are running the old BIOS.  So, your solution don't fix
all problems.

>> I think the question here is, when we do this migration:
>> 
>> qemu-2.0 -M pc-2.0 -> qemu-2.2 -M pc-2.0
>> 
>> what BIOS should the second qemu use?  the one that existed on qemu-2.0
>> time or the one that existed on qemu-2.2 time?  You can allow for
>> bugfixes, but not for big changes like moving where the acpi tables were
>> generated, etc, etc.
>
> If you just ship old BIOS, you do not allow for bugfixes.

We have qemu-2.0
And now we have qemu-2.0.1 and qemu-2.1
You know that some changes are not valid for qemu-2.0.1, right?  Some
for BIOS.

>> I really think that we should use the BIOS from qemu-2.0 era.  And my
>> understanding is that you defend that we should use the qemu-2.2 era
>> BIOS.
>
> Not only that.  We already do. And we don't intend to change that for 2.2.
>
>> I think that is the whole point of the discussion.  Have I at least
>> framed correctly what the discussion is about?
>> 
>> Later, Juan.
>
> I think so.
>
> Basically you want to freeze all firmware at time of release and never change it
> for that machine type.
> Correct?

Bug fixes are ok.  Big changes no.  Think of what is permisible for
2.0.1 and not.  For instance, no new features allowed.

> This would be a regression, this is not how we did things in the past.

Back to square one.  We are failing with compatibility in the past.
Time to try new approachs?

> Real hardware lets users update firmware and so should virtual hardware.

But you can hibernate your laptop, update the firmware, and reboot?
Where the change can be anyting, like moving from traditional BIOS to
UEFI?

NO.  And for good reason.  If you are able to upgrade the BIOS while
hibernated, would it try to resume or just normal reboot?  Same for S3.

Later, Juan.
Paolo Bonzini Nov. 19, 2014, 1:51 p.m. UTC | #16
On 19/11/2014 14:49, Juan Quintela wrote:
>> > Real hardware lets users update firmware and so should virtual hardware.
> But you can hibernate your laptop, update the firmware, and reboot?
> Where the change can be anyting, like moving from traditional BIOS to
> UEFI?

Wait wait wait.  I totally cannot follow.  What would be the equivalent
in QEMU?

Paolo
Juan Quintela Nov. 19, 2014, 1:51 p.m. UTC | #17
"Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Wed, Nov 19, 2014 at 11:50:28AM +0100, Juan Quintela wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>> > On Wed, Nov 19, 2014 at 11:16:57AM +0100, Markus Armbruster wrote:
>> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>> 
>> >> Since migration doesn't transport configuration, we require a compatibly
>> >> configured target, and that includes identical memory sizes.  RAM size
>> >> is explicit and the user's problem.  ROM size is generally implicit, and
>> >> we use machine type compatibility machinery to keep it fixed.  BIOS
>> >> changes can break migration only when we screw up or forget the
>> >> compatibility machinery.  Same as for lots of other stuff.  No big deal,
>> >> really, just a consequence of not migrating configuration.
>> >
>> > You don't get to maintain it, so it's no big deal for you.
>> >
>> > I see pain every single release and code is becoming spaghetty-like very
>> > quickly.  We thought it would work. It does not.  We do need a solution.
>> >
>> > And the pain is completely self-inflicted: we already migrate
>> > all necessary information!
>> > It's just a question of adjusting our datastructures to it.
>> 
>> migration from version foo to version bar.
>> 
>> You have two options here:
>> 
>> - You make source (foo) send the data on the format/sizes that destination
>>   (bar) wants.
>> - You make destination (bar) handle whatever source (foo) sends.
>> 
>> You need to put the "spagueti code" in foo or bar.  It needs to be in
>> one of the two places, because if that code was not needed, we would not
>> be discussion here,  see?
>> 
>> So, what we are discussing is where is better to put this code.  Emit
>> the code that destination expects, or make destination handle whatever
>> is sent.  Amound of mangling need to be basically the same.
>> 
>> Later, Juan.
>
> This is not what the patch does at all.  There is no special-casing
> depending on machine type anywhere. Please review the code and respond
> to actual patches.

The code allows increasing of the ram regions if they are bigger on
source.  Without further explanation.  Without knowing _why_ they are
bigger on the other side.  In general it would not work, even if it
works on one particular case.  If they are bigger, it is because device
code use that for something.  not necesarely something that can be
ignored.

Later, Juan.
Juan Quintela Nov. 19, 2014, 1:57 p.m. UTC | #18
Paolo Bonzini <pbonzini@redhat.com> wrote:

> Am I missing a part of the discussion?  When we migrate, the second QEMU
> uses the BIOS from the first.
>
> So:
>
> qemu-2.0 -M pc-2.0 -> qemu-2.2 -M pc-2.0
>
>    uses 2.0 BIOS
>
> qemu-2.2 -M pc-2.0 -> qemu-2.0 -M pc-2.0
>
>    uses 2.2 BIOS
>
> Both should work, in general.  BIOS is rarely the reason for
> incompatibilities.  However, breakage can happen, for example I know
> that RHEL7 SeaBIOS does not work on RHEL6.  RHEL6 SeaBIOS works on
> RHEL7, but it needs a couple workarounds.
>
> Shipping a separate BIOS for different machine types is unrealistic and
> pointless.  It would also be a good terrain for bug reports, unless you
> also do things like "forbid creating -device megasas-gen2 on 2.1 because
> it was introduced in 2.2".

And I agree with that.  If it got introduced on 2.2, it should not be
allowed on pc-2.1.  It just makes things more complicated.  We don't
have infrastructure to enforce that.  And I am claining that is the
problem.  We are just papering over this problem each time that it
happens.  I honestely think that the only way to really fix
compatibility is enforcing that machine types are stable.  right now
they are now, and we ended nothing it.

> Remember that libvirt keeps the same machine
> type for the whole life of a virtual machine definition, even if other
> parts of the hardware (e.g. disks or NICs) change.

There is a way to "upgrade" the machine type of a specific machine.  If
you want to update it, just do it.  If you want "it is not broking, so
not mess with it", it means just that, not changing anything that we can
avoid to change.

Later, Juan.
Peter Maydell Nov. 19, 2014, 1:58 p.m. UTC | #19
On 17 November 2014 20:08, Michael S. Tsirkin <mst@redhat.com> wrote:
> Add API to manage on-device RAM.
> This looks just like regular RAM from migration POV,
> but has two special properties internally:
>
>     - it is never exposed to guest
>     - block is sized on migration, making it easier to extend
>       without breaking migration compatibility or wasting
>       virtual memory
>     - callers must specify an upper bound on size

> +/* On-device RAM allocated with g_malloc: supports realloc,
> + * not accessible to vcpu on kvm.
> + */
> +#define RAM_DEVICE     (1 << 2)

Does this comment mean "KVM guests cannot access this
memory, so it's a board bug to attempt to map it into
guest address space"?. If so, what breaks? Can we have
an assert or something to catch usage errors if it is
mapped? Would it be possible to drop the restriction?

I'm not convinced about the naming either -- isn't this
for BIOSes rather than generic on-device scratch RAM
(which you'd model either with a plain RAM memoryregion
or with a locally allocated block of memory or array,
depending on the device semantics)?

thanks
-- PMM
Juan Quintela Nov. 19, 2014, 2:03 p.m. UTC | #20
Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 19/11/2014 14:49, Juan Quintela wrote:
>>> > Real hardware lets users update firmware and so should virtual hardware.
>> But you can hibernate your laptop, update the firmware, and reboot?
>> Where the change can be anyting, like moving from traditional BIOS to
>> UEFI?
>
> Wait wait wait.  I totally cannot follow.  What would be the equivalent
> in QEMU?

qemu-2.0 -M pc-2.0

migrate to disk/s3/s4

upgrade qemu

qemu-2.2 -M pc-2.0

try interesting variation of s3/s4/migration to disk.  Migration to disk
should work (we migrate BIOS ROM blocks, enphasis on ROM), s3 perhaps
(machine needs to be saved to disk), s4 ..... depends how it ends being
done.

Later, Juan.
Juan Quintela Nov. 19, 2014, 2:07 p.m. UTC | #21
Peter Maydell <peter.maydell@linaro.org> wrote:
> On 17 November 2014 20:08, Michael S. Tsirkin <mst@redhat.com> wrote:
>> Add API to manage on-device RAM.
>> This looks just like regular RAM from migration POV,
>> but has two special properties internally:
>>
>>     - it is never exposed to guest
>>     - block is sized on migration, making it easier to extend
>>       without breaking migration compatibility or wasting
>>       virtual memory
>>     - callers must specify an upper bound on size
>
>> +/* On-device RAM allocated with g_malloc: supports realloc,
>> + * not accessible to vcpu on kvm.
>> + */
>> +#define RAM_DEVICE     (1 << 2)
>
> Does this comment mean "KVM guests cannot access this
> memory, so it's a board bug to attempt to map it into
> guest address space"?. If so, what breaks? Can we have
> an assert or something to catch usage errors if it is
> mapped? Would it be possible to drop the restriction?
>
> I'm not convinced about the naming either -- isn't this
> for BIOSes rather than generic on-device scratch RAM
> (which you'd model either with a plain RAM memoryregion
> or with a locally allocated block of memory or array,
> depending on the device semantics)?

My understanding is that it is a "trick".  We have internal memory for a
device that is needed for the emulation, but not showed to the guest.
And it is big enough that we want to save it during the "live" stage of
migration, so we mark it as RAM.  if it is somekind of cash, we can just
enlarge it on destination, and it don't matter.  If this has anything
different on the other part of the RAM, we are on trouble.

Have I understood it correctly?

If so, it appears that all the cases (or the ones that mst cares about)
don't care about this size.

Later, Juan.
Peter Maydell Nov. 19, 2014, 2:10 p.m. UTC | #22
On 19 November 2014 14:07, Juan Quintela <quintela@redhat.com> wrote:
> My understanding is that it is a "trick".  We have internal memory for a
> device that is needed for the emulation, but not showed to the guest.
> And it is big enough that we want to save it during the "live" stage of
> migration, so we mark it as RAM.  if it is somekind of cash, we can just
> enlarge it on destination, and it don't matter.  If this has anything
> different on the other part of the RAM, we are on trouble.

Would it be feasible to just have the migration code provide
an API for registering things to be migrated in the live
migration stage, rather than creating memory regions which
you can't actually use for most of the purposes the memory
region API exists for?

-- PMM
Paolo Bonzini Nov. 19, 2014, 2:11 p.m. UTC | #23
On 19/11/2014 15:03, Juan Quintela wrote:
> Paolo Bonzini <pbonzini@redhat.com> wrote:
>> On 19/11/2014 14:49, Juan Quintela wrote:
>>>>> Real hardware lets users update firmware and so should virtual hardware.
>>> But you can hibernate your laptop, update the firmware, and reboot?
>>> Where the change can be anyting, like moving from traditional BIOS to
>>> UEFI?
>>
>> Wait wait wait.  I totally cannot follow.  What would be the equivalent
>> in QEMU?
> 
> qemu-2.0 -M pc-2.0
> 
> migrate to disk/s3/s4
> 
> upgrade qemu
> 
> qemu-2.2 -M pc-2.0
> 
> try interesting variation of s3/s4/migration to disk.  Migration to disk
> should work (we migrate BIOS ROM blocks, enphasis on ROM), s3 perhaps
> (machine needs to be saved to disk), s4 ..... depends how it ends being
> done.

Ok, got it.  S3 + migrate to disk should work.

S4 probably would work, but I think it would work on a real system too
as long as you update software and not hardware (e.g. changing the
motherboard would change the MAC address of the on-board NIC, for example).

Consider the similar case on real hardware:

boot
update microcode RPM
s4
turn on

CPU microcode is installed early by the kernel, before looking for a
hibernation image to resume from, so the CPU microcode after resume from
S4 is different from the microcode at the time you suspended to disk.
This probably would work.

Paolo
Dr. David Alan Gilbert Nov. 19, 2014, 2:13 p.m. UTC | #24
Since we've wondered off the actual ACPI table stuff into general
ROM sizing, I'd like to propose some concrete fixes:

  1) We explicitly name the bios file in a .romfile attribute for
     all ROMs.
  2) The code that uses .romfile has an expansion for $MACHINETYPE
  3) We actually symlink all of those together, anyone who wants/has
     to deal with different versions can downstream.
  4) The machine types contain size attributes for the ROMs that
     are generoously larger than the ROMs anyone currently uses.

I think 1..3 should deal with those of us who have to deal with different
ROM versions on different machine types.
4 might be good enough for the ACPI tables if you can bound it.

Dave
     
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert Nov. 19, 2014, 2:16 p.m. UTC | #25
* Paolo Bonzini (pbonzini@redhat.com) wrote:
> 
> 
> On 19/11/2014 15:03, Juan Quintela wrote:
> > Paolo Bonzini <pbonzini@redhat.com> wrote:
> >> On 19/11/2014 14:49, Juan Quintela wrote:
> >>>>> Real hardware lets users update firmware and so should virtual hardware.
> >>> But you can hibernate your laptop, update the firmware, and reboot?
> >>> Where the change can be anyting, like moving from traditional BIOS to
> >>> UEFI?
> >>
> >> Wait wait wait.  I totally cannot follow.  What would be the equivalent
> >> in QEMU?
> > 
> > qemu-2.0 -M pc-2.0
> > 
> > migrate to disk/s3/s4
> > 
> > upgrade qemu
> > 
> > qemu-2.2 -M pc-2.0
> > 
> > try interesting variation of s3/s4/migration to disk.  Migration to disk
> > should work (we migrate BIOS ROM blocks, enphasis on ROM), s3 perhaps
> > (machine needs to be saved to disk), s4 ..... depends how it ends being
> > done.
> 
> Ok, got it.  S3 + migrate to disk should work.
> 
> S4 probably would work, but I think it would work on a real system too
> as long as you update software and not hardware (e.g. changing the
> motherboard would change the MAC address of the on-board NIC, for example).
> 
> Consider the similar case on real hardware:
> 
> boot
> update microcode RPM
> s4
> turn on
> 
> CPU microcode is installed early by the kernel, before looking for a
> hibernation image to resume from, so the CPU microcode after resume from
> S4 is different from the microcode at the time you suspended to disk.
> This probably would work.

You mean, unless for example, someone had disabled a CPU feature in the
new microcode?

Dave

> 
> Paolo
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Juan Quintela Nov. 19, 2014, 2:18 p.m. UTC | #26
Peter Maydell <peter.maydell@linaro.org> wrote:
> On 19 November 2014 14:07, Juan Quintela <quintela@redhat.com> wrote:
>> My understanding is that it is a "trick".  We have internal memory for a
>> device that is needed for the emulation, but not showed to the guest.
>> And it is big enough that we want to save it during the "live" stage of
>> migration, so we mark it as RAM.  if it is somekind of cash, we can just
>> enlarge it on destination, and it don't matter.  If this has anything
>> different on the other part of the RAM, we are on trouble.
>
> Would it be feasible to just have the migration code provide
> an API for registering things to be migrated in the live
> migration stage, rather than creating memory regions which
> you can't actually use for most of the purposes the memory
> region API exists for?

If somebody told me what they need, we can do it.

Stefan, you needed something like that for data-plane?  Or that memory
is mapped on the guest?

Later, Juan.
Paolo Bonzini Nov. 19, 2014, 2:19 p.m. UTC | #27
On 19/11/2014 15:10, Peter Maydell wrote:
> On 19 November 2014 14:07, Juan Quintela <quintela@redhat.com> wrote:
> > My understanding is that it is a "trick".  We have internal memory for a
> > device that is needed for the emulation, but not showed to the guest.
> > And it is big enough that we want to save it during the "live" stage of
> > migration, so we mark it as RAM.  if it is somekind of cash, we can just
> > enlarge it on destination, and it don't matter.  If this has anything
> > different on the other part of the RAM, we are on trouble.
>
> Would it be feasible to just have the migration code provide
> an API for registering things to be migrated in the live
> migration stage, rather than creating memory regions which
> you can't actually use for most of the purposes the memory
> region API exists for?

It does already, for example PPC uses it for its IOMMU tables.

But in any case this is really just memory that is auto-resized on
migration.  And it can work even if it is mapped in memory, as long as
your resize callback (or some post_load code executed while migrating
devices) does something useful to update the memory map.  Let's call it
like what it is.

Basically it is an "I know how to deal with the source's configuration
whatever it is, so I'm not bothering to reconstruct it equivalently on
the destination" flag.

The fine print is that it will create more differences between a given
machine type on different versions of QEMU.  It can have ripple effects
on the memory map and it can make existing VMs a bit more likely to
break when updating QEMU.  This is why I do not particularly like it,
and I posted different patches to do the same thing.  I understand that
it is a simple fix that will mostly work and probably avoids work more
than causing it.

And BTW, those patches are still unreviewed,.  Juan, "do as you
preach"---review patches that are closer to what you preach.  And
Michael, you too---review patches in addition to asking people to review
yours, so that we can compare the approaches.

Paolo
Juan Quintela Nov. 19, 2014, 2:20 p.m. UTC | #28
Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 19/11/2014 15:03, Juan Quintela wrote:
>> Paolo Bonzini <pbonzini@redhat.com> wrote:
>>> On 19/11/2014 14:49, Juan Quintela wrote:
>>>>>> Real hardware lets users update firmware and so should virtual hardware.
>>>> But you can hibernate your laptop, update the firmware, and reboot?
>>>> Where the change can be anyting, like moving from traditional BIOS to
>>>> UEFI?
>>>
>>> Wait wait wait.  I totally cannot follow.  What would be the equivalent
>>> in QEMU?
>> 
>> qemu-2.0 -M pc-2.0
>> 
>> migrate to disk/s3/s4
>> 
>> upgrade qemu
>> 
>> qemu-2.2 -M pc-2.0
>> 
>> try interesting variation of s3/s4/migration to disk.  Migration to disk
>> should work (we migrate BIOS ROM blocks, enphasis on ROM), s3 perhaps
>> (machine needs to be saved to disk), s4 ..... depends how it ends being
>> done.
>
> Ok, got it.  S3 + migrate to disk should work.
>
> S4 probably would work, but I think it would work on a real system too
> as long as you update software and not hardware (e.g. changing the
> motherboard would change the MAC address of the on-board NIC, for example).
>
> Consider the similar case on real hardware:
>
> boot
> update microcode RPM
> s4
> turn on
>
> CPU microcode is installed early by the kernel, before looking for a
> hibernation image to resume from, so the CPU microcode after resume from
> S4 is different from the microcode at the time you suspended to disk.
> This probably would work.

I am not an expert of cpu microcode, but I would assume that changes
there tend to be minimal, no?  And anyways, I wouldn't expect to
introduce/remove features like NX (i.e. visible by the guest) on a
microcode update?

Later, Juan.

> Paolo
Paolo Bonzini Nov. 19, 2014, 2:20 p.m. UTC | #29
On 19/11/2014 14:57, Juan Quintela wrote:
> > Shipping a separate BIOS for different machine types is unrealistic and
> > pointless.  It would also be a good terrain for bug reports, unless you
> > also do things like "forbid creating -device megasas-gen2 on 2.1 because
> > it was introduced in 2.2".
>
> And I agree with that.  If it got introduced on 2.2, it should not be
> allowed on pc-2.1.  It just makes things more complicated.  We don't
> have infrastructure to enforce that.  And I am claining that is the
> problem.  We are just papering over this problem each time that it
> happens.  I honestely think that the only way to really fix
> compatibility is enforcing that machine types are stable.  right now
> they are now, and we ended nothing it.

Weird, I have bought this USB device last month and I plugged it into a
two-year-old laptop.

    QEMU version = when did I last update firmware / buy hardware
    Machine type = when did I buy the computer

I honestly think that you are talking out of design dogma, without
really thinking through the consequences of the design.

Paolo
Peter Maydell Nov. 19, 2014, 2:21 p.m. UTC | #30
On 19 November 2014 14:19, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 19/11/2014 15:10, Peter Maydell wrote:
>> Would it be feasible to just have the migration code provide
>> an API for registering things to be migrated in the live
>> migration stage, rather than creating memory regions which
>> you can't actually use for most of the purposes the memory
>> region API exists for?
>
> It does already, for example PPC uses it for its IOMMU tables.
>
> But in any case this is really just memory that is auto-resized on
> migration.  And it can work even if it is mapped in memory, as long as
> your resize callback (or some post_load code executed while migrating
> devices) does something useful to update the memory map.  Let's call it
> like what it is.

...so why all the "this isn't going to work in KVM" warnings
in the patchset?

thanks
-- PMM
Paolo Bonzini Nov. 19, 2014, 2:22 p.m. UTC | #31
On 19/11/2014 15:13, Dr. David Alan Gilbert wrote:
> Since we've wondered off the actual ACPI table stuff into general
> ROM sizing, I'd like to propose some concrete fixes:
> 
>   1) We explicitly name the bios file in a .romfile attribute for
>      all ROMs.
>   2) The code that uses .romfile has an expansion for $MACHINETYPE
>   3) We actually symlink all of those together, anyone who wants/has
>      to deal with different versions can downstream.
>   4) The machine types contain size attributes for the ROMs that
>      are generoously larger than the ROMs anyone currently uses.
> 
> I think 1..3 should deal with those of us who have to deal with different
> ROM versions on different machine types.

It should, but it's a solution in search of a problem.

> 4 might be good enough for the ACPI tables if you can bound it.

Already doing that (rounding to 128k, warning if >64k), but it is not a
definitive solution.

We also do (4) for ROMs, since VGA BIOSes use only 36k out of 64k and
iPXE ROMs use only ~200k out of 256k.

Paolo
Juan Quintela Nov. 19, 2014, 2:22 p.m. UTC | #32
"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
> Since we've wondered off the actual ACPI table stuff into general
> ROM sizing, I'd like to propose some concrete fixes:
>
>   1) We explicitly name the bios file in a .romfile attribute for
>      all ROMs.
>   2) The code that uses .romfile has an expansion for $MACHINETYPE
>   3) We actually symlink all of those together, anyone who wants/has
>      to deal with different versions can downstream.
>   4) The machine types contain size attributes for the ROMs that
>      are generoously larger than the ROMs anyone currently uses.
>
> I think 1..3 should deal with those of us who have to deal with different
> ROM versions on different machine types.
> 4 might be good enough for the ACPI tables if you can bound it.
>
> Dave

I would agree with something like that.

Thanks, Juan.
Dr. David Alan Gilbert Nov. 19, 2014, 2:26 p.m. UTC | #33
* Paolo Bonzini (pbonzini@redhat.com) wrote:
> 
> 
> On 19/11/2014 15:13, Dr. David Alan Gilbert wrote:
> > Since we've wondered off the actual ACPI table stuff into general
> > ROM sizing, I'd like to propose some concrete fixes:
> > 
> >   1) We explicitly name the bios file in a .romfile attribute for
> >      all ROMs.
> >   2) The code that uses .romfile has an expansion for $MACHINETYPE
> >   3) We actually symlink all of those together, anyone who wants/has
> >      to deal with different versions can downstream.
> >   4) The machine types contain size attributes for the ROMs that
> >      are generoously larger than the ROMs anyone currently uses.
> > 
> > I think 1..3 should deal with those of us who have to deal with different
> > ROM versions on different machine types.
> 
> It should, but it's a solution in search of a problem.

Well we already do something close to 1 & 2 downstream but more ad-hoc;
it's just a generalisation (and 4 from padding the size of our images).
So we already had that problem.

> 
> > 4 might be good enough for the ACPI tables if you can bound it.
> 
> Already doing that (rounding to 128k, warning if >64k), but it is not a
> definitive solution.
> 
> We also do (4) for ROMs, since VGA BIOSes use only 36k out of 64k and
> iPXE ROMs use only ~200k out of 256k.
> 
> Paolo
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Paolo Bonzini Nov. 19, 2014, 2:28 p.m. UTC | #34
On 19/11/2014 15:26, Dr. David Alan Gilbert wrote:
> * Paolo Bonzini (pbonzini@redhat.com) wrote:
>>
>>
>> On 19/11/2014 15:13, Dr. David Alan Gilbert wrote:
>>> Since we've wondered off the actual ACPI table stuff into general
>>> ROM sizing, I'd like to propose some concrete fixes:
>>>
>>>   1) We explicitly name the bios file in a .romfile attribute for
>>>      all ROMs.
>>>   2) The code that uses .romfile has an expansion for $MACHINETYPE
>>>   3) We actually symlink all of those together, anyone who wants/has
>>>      to deal with different versions can downstream.
>>>   4) The machine types contain size attributes for the ROMs that
>>>      are generoously larger than the ROMs anyone currently uses.
>>>
>>> I think 1..3 should deal with those of us who have to deal with different
>>> ROM versions on different machine types.
>>
>> It should, but it's a solution in search of a problem.
> 
> Well we already do something close to 1 & 2 downstream but more ad-hoc;
> it's just a generalisation (and 4 from padding the size of our images).
> So we already had that problem.

Upstream too.  See pxe-* vs. efi-* NIC option ROMs.  The latter includes
both PXE firmware for BIOS and EFI drivers.  We keep two copies because
they have different sizes.  Having explicit expansions for $MACHINETYPE
would be hugely overkill, in my opinion.

Paolo

>>
>>> 4 might be good enough for the ACPI tables if you can bound it.
>>
>> Already doing that (rounding to 128k, warning if >64k), but it is not a
>> definitive solution.
>>
>> We also do (4) for ROMs, since VGA BIOSes use only 36k out of 64k and
>> iPXE ROMs use only ~200k out of 256k.
>>
>> Paolo
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
Paolo Bonzini Nov. 19, 2014, 2:28 p.m. UTC | #35
On 19/11/2014 15:16, Dr. David Alan Gilbert wrote:
>> > Consider the similar case on real hardware:
>> > 
>> > boot
>> > update microcode RPM
>> > s4
>> > turn on
>> > 
>> > CPU microcode is installed early by the kernel, before looking for a
>> > hibernation image to resume from, so the CPU microcode after resume from
>> > S4 is different from the microcode at the time you suspended to disk.
>> > This probably would work.
> You mean, unless for example, someone had disabled a CPU feature in the
> new microcode?

A random example, right? :)

I think it reinforces my point---just like it would work almost always
on real hardware, but can fails if the stars align right, it should work
almost always on QEMU.  It doesn't have to be _perfect_.  Bugs are
always possible, of course, but things can be tested.

Interested people/downstreams (hint, hint!) can try doing S4 on a
release and restarting on the next, with and without machine type
changes.  If it breaks, it can be fixed or just documented.

Paolo
Paolo Bonzini Nov. 19, 2014, 2:30 p.m. UTC | #36
On 19/11/2014 15:21, Peter Maydell wrote:
> > But in any case this is really just memory that is auto-resized on
> > migration.  And it can work even if it is mapped in memory, as long as
> > your resize callback (or some post_load code executed while migrating
> > devices) does something useful to update the memory map.  Let's call it
> > like what it is.
> ...so why all the "this isn't going to work in KVM" warnings
> in the patchset?

Just a limitation of the patches, and one thing I was going to ask to
change for v2. :)

Paolo
Dr. David Alan Gilbert Nov. 19, 2014, 2:59 p.m. UTC | #37
* Paolo Bonzini (pbonzini@redhat.com) wrote:
> 
> 
> On 19/11/2014 15:26, Dr. David Alan Gilbert wrote:
> > * Paolo Bonzini (pbonzini@redhat.com) wrote:
> >>
> >>
> >> On 19/11/2014 15:13, Dr. David Alan Gilbert wrote:
> >>> Since we've wondered off the actual ACPI table stuff into general
> >>> ROM sizing, I'd like to propose some concrete fixes:
> >>>
> >>>   1) We explicitly name the bios file in a .romfile attribute for
> >>>      all ROMs.
> >>>   2) The code that uses .romfile has an expansion for $MACHINETYPE
> >>>   3) We actually symlink all of those together, anyone who wants/has
> >>>      to deal with different versions can downstream.
> >>>   4) The machine types contain size attributes for the ROMs that
> >>>      are generoously larger than the ROMs anyone currently uses.
> >>>
> >>> I think 1..3 should deal with those of us who have to deal with different
> >>> ROM versions on different machine types.
> >>
> >> It should, but it's a solution in search of a problem.
> > 
> > Well we already do something close to 1 & 2 downstream but more ad-hoc;
> > it's just a generalisation (and 4 from padding the size of our images).
> > So we already had that problem.
> 
> Upstream too.  See pxe-* vs. efi-* NIC option ROMs.  The latter includes
> both PXE firmware for BIOS and EFI drivers.  We keep two copies because
> they have different sizes.  Having explicit expansions for $MACHINETYPE
> would be hugely overkill, in my opinion.

Yes it is, but it's simple and feels easy to understand.

Dave

> 
> Paolo
> 
> >>
> >>> 4 might be good enough for the ACPI tables if you can bound it.
> >>
> >> Already doing that (rounding to 128k, warning if >64k), but it is not a
> >> definitive solution.
> >>
> >> We also do (4) for ROMs, since VGA BIOSes use only 36k out of 64k and
> >> iPXE ROMs use only ~200k out of 256k.
> >>
> >> Paolo
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Michael S. Tsirkin Nov. 19, 2014, 3:04 p.m. UTC | #38
On Wed, Nov 19, 2014 at 01:58:54PM +0000, Peter Maydell wrote:
> On 17 November 2014 20:08, Michael S. Tsirkin <mst@redhat.com> wrote:
> > Add API to manage on-device RAM.
> > This looks just like regular RAM from migration POV,
> > but has two special properties internally:
> >
> >     - it is never exposed to guest
> >     - block is sized on migration, making it easier to extend
> >       without breaking migration compatibility or wasting
> >       virtual memory
> >     - callers must specify an upper bound on size
> 
> > +/* On-device RAM allocated with g_malloc: supports realloc,
> > + * not accessible to vcpu on kvm.
> > + */
> > +#define RAM_DEVICE     (1 << 2)
> 
> Does this comment mean "KVM guests cannot access this
> memory, so it's a board bug to attempt to map it into
> guest address space"?.

Yes.

> If so, what breaks?

When memory is reallocated, we don't update KVM.

> Can we have
> an assert or something to catch usage errors if it is
> mapped? Would it be possible to drop the restriction?

Yes, it's definitely possible.  It's a bit tricky: we need to tweak a
bunch of page flags on realloc, for example DONT_FORK:
clear them on old pages copy and set on new ones.
So I simply didn't want to bother.



> I'm not convinced about the naming either -- isn't this
> for BIOSes rather than generic on-device scratch RAM
> (which you'd model either with a plain RAM memoryregion
> or with a locally allocated block of memory or array,
> depending on the device semantics)?
> 
> thanks
> -- PMM

Hmm I don't exactly see the difference.  This RAM is internal to FW CFG
device.  Other devices might want to use this too if they keep guest
code in their internal RAM.


I'm fine with changing the name though - what's your
suggestion?
Michael S. Tsirkin Nov. 19, 2014, 3:12 p.m. UTC | #39
On Wed, Nov 19, 2014 at 02:10:36PM +0000, Peter Maydell wrote:
> On 19 November 2014 14:07, Juan Quintela <quintela@redhat.com> wrote:
> > My understanding is that it is a "trick".  We have internal memory for a
> > device that is needed for the emulation, but not showed to the guest.
> > And it is big enough that we want to save it during the "live" stage of
> > migration, so we mark it as RAM.  if it is somekind of cash, we can just
> > enlarge it on destination, and it don't matter.  If this has anything
> > different on the other part of the RAM, we are on trouble.
> 
> Would it be feasible to just have the migration code provide
> an API for registering things to be migrated in the live
> migration stage, rather than creating memory regions which
> you can't actually use for most of the purposes the memory
> region API exists for?
> 
> -- PMM

We could, it's just much more work, touching a lot more
places.
Michael S. Tsirkin Nov. 19, 2014, 3:16 p.m. UTC | #40
On Wed, Nov 19, 2014 at 03:30:59PM +0100, Paolo Bonzini wrote:
> 
> 
> On 19/11/2014 15:21, Peter Maydell wrote:
> > > But in any case this is really just memory that is auto-resized on
> > > migration.  And it can work even if it is mapped in memory, as long as
> > > your resize callback (or some post_load code executed while migrating
> > > devices) does something useful to update the memory map.  Let's call it
> > > like what it is.
> > ...so why all the "this isn't going to work in KVM" warnings
> > in the patchset?
> 
> Just a limitation of the patches,

Yes, it's a shortcut I took.
It's absolutely fixable.

> and one thing I was going to ask to
> change for v2. :)
> 
> Paolo

I felt it's not necessary but yes, sure.
Michael S. Tsirkin Nov. 19, 2014, 3:38 p.m. UTC | #41
On Wed, Nov 19, 2014 at 02:59:12PM +0000, Dr. David Alan Gilbert wrote:
> * Paolo Bonzini (pbonzini@redhat.com) wrote:
> > 
> > 
> > On 19/11/2014 15:26, Dr. David Alan Gilbert wrote:
> > > * Paolo Bonzini (pbonzini@redhat.com) wrote:
> > >>
> > >>
> > >> On 19/11/2014 15:13, Dr. David Alan Gilbert wrote:
> > >>> Since we've wondered off the actual ACPI table stuff into general
> > >>> ROM sizing, I'd like to propose some concrete fixes:
> > >>>
> > >>>   1) We explicitly name the bios file in a .romfile attribute for
> > >>>      all ROMs.
> > >>>   2) The code that uses .romfile has an expansion for $MACHINETYPE
> > >>>   3) We actually symlink all of those together, anyone who wants/has
> > >>>      to deal with different versions can downstream.
> > >>>   4) The machine types contain size attributes for the ROMs that
> > >>>      are generoously larger than the ROMs anyone currently uses.
> > >>>
> > >>> I think 1..3 should deal with those of us who have to deal with different
> > >>> ROM versions on different machine types.
> > >>
> > >> It should, but it's a solution in search of a problem.
> > > 
> > > Well we already do something close to 1 & 2 downstream but more ad-hoc;
> > > it's just a generalisation (and 4 from padding the size of our images).
> > > So we already had that problem.
> > 
> > Upstream too.  See pxe-* vs. efi-* NIC option ROMs.  The latter includes
> > both PXE firmware for BIOS and EFI drivers.  We keep two copies because
> > they have different sizes.  Having explicit expansions for $MACHINETYPE
> > would be hugely overkill, in my opinion.
> 
> Yes it is, but it's simple and feels easy to understand.
> 
> Dave

I feel it's an implementation detail that really shouldn't
be pushed out to users.


> > 
> > Paolo
> > 
> > >>
> > >>> 4 might be good enough for the ACPI tables if you can bound it.
> > >>
> > >> Already doing that (rounding to 128k, warning if >64k), but it is not a
> > >> definitive solution.
> > >>
> > >> We also do (4) for ROMs, since VGA BIOSes use only 36k out of 64k and
> > >> iPXE ROMs use only ~200k out of 256k.
> > >>
> > >> Paolo
> > > --
> > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > > 
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Michael S. Tsirkin Nov. 19, 2014, 3:43 p.m. UTC | #42
On Wed, Nov 19, 2014 at 03:20:07PM +0100, Juan Quintela wrote:
> Paolo Bonzini <pbonzini@redhat.com> wrote:
> > On 19/11/2014 15:03, Juan Quintela wrote:
> >> Paolo Bonzini <pbonzini@redhat.com> wrote:
> >>> On 19/11/2014 14:49, Juan Quintela wrote:
> >>>>>> Real hardware lets users update firmware and so should virtual hardware.
> >>>> But you can hibernate your laptop, update the firmware, and reboot?
> >>>> Where the change can be anyting, like moving from traditional BIOS to
> >>>> UEFI?
> >>>
> >>> Wait wait wait.  I totally cannot follow.  What would be the equivalent
> >>> in QEMU?
> >> 
> >> qemu-2.0 -M pc-2.0
> >> 
> >> migrate to disk/s3/s4
> >> 
> >> upgrade qemu
> >> 
> >> qemu-2.2 -M pc-2.0
> >> 
> >> try interesting variation of s3/s4/migration to disk.  Migration to disk
> >> should work (we migrate BIOS ROM blocks, enphasis on ROM), s3 perhaps
> >> (machine needs to be saved to disk), s4 ..... depends how it ends being
> >> done.
> >
> > Ok, got it.  S3 + migrate to disk should work.
> >
> > S4 probably would work, but I think it would work on a real system too
> > as long as you update software and not hardware (e.g. changing the
> > motherboard would change the MAC address of the on-board NIC, for example).
> >
> > Consider the similar case on real hardware:
> >
> > boot
> > update microcode RPM
> > s4
> > turn on
> >
> > CPU microcode is installed early by the kernel, before looking for a
> > hibernation image to resume from, so the CPU microcode after resume from
> > S4 is different from the microcode at the time you suspended to disk.
> > This probably would work.
> 
> I am not an expert of cpu microcode, but I would assume that changes
> there tend to be minimal, no?  And anyways, I wouldn't expect to
> introduce/remove features like NX (i.e. visible by the guest) on a
> microcode update?
> 
> Later, Juan.
> 
> > Paolo

Not added mostly because there's no point: CPU vendors
would much rather sell you a new CPU :)
Features could be removed because of some errata?
Michael S. Tsirkin Nov. 19, 2014, 3:46 p.m. UTC | #43
On Wed, Nov 19, 2014 at 02:51:38PM +0100, Juan Quintela wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Wed, Nov 19, 2014 at 11:50:28AM +0100, Juan Quintela wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >> > On Wed, Nov 19, 2014 at 11:16:57AM +0100, Markus Armbruster wrote:
> >> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> 
> >> 
> >> >> Since migration doesn't transport configuration, we require a compatibly
> >> >> configured target, and that includes identical memory sizes.  RAM size
> >> >> is explicit and the user's problem.  ROM size is generally implicit, and
> >> >> we use machine type compatibility machinery to keep it fixed.  BIOS
> >> >> changes can break migration only when we screw up or forget the
> >> >> compatibility machinery.  Same as for lots of other stuff.  No big deal,
> >> >> really, just a consequence of not migrating configuration.
> >> >
> >> > You don't get to maintain it, so it's no big deal for you.
> >> >
> >> > I see pain every single release and code is becoming spaghetty-like very
> >> > quickly.  We thought it would work. It does not.  We do need a solution.
> >> >
> >> > And the pain is completely self-inflicted: we already migrate
> >> > all necessary information!
> >> > It's just a question of adjusting our datastructures to it.
> >> 
> >> migration from version foo to version bar.
> >> 
> >> You have two options here:
> >> 
> >> - You make source (foo) send the data on the format/sizes that destination
> >>   (bar) wants.
> >> - You make destination (bar) handle whatever source (foo) sends.
> >> 
> >> You need to put the "spagueti code" in foo or bar.  It needs to be in
> >> one of the two places, because if that code was not needed, we would not
> >> be discussion here,  see?
> >> 
> >> So, what we are discussing is where is better to put this code.  Emit
> >> the code that destination expects, or make destination handle whatever
> >> is sent.  Amound of mangling need to be basically the same.
> >> 
> >> Later, Juan.
> >
> > This is not what the patch does at all.  There is no special-casing
> > depending on machine type anywhere. Please review the code and respond
> > to actual patches.
> 
> The code allows increasing of the ram regions if they are bigger on
> source.  Without further explanation.  Without knowing _why_ they are
> bigger on the other side.

Actually yes: devices that want this functionality need to call
the new API.
At the point where API is called, would be the best place to
put in an explanation why it should be resizeable.


> In general it would not work, even if it
> works on one particular case.  If they are bigger, it is because device
> code use that for something.  not necesarely something that can be
> ignored.
> 
> Later, Juan.

Absolutely.  And that is why callers get a callback notifying them about
resize.

See? You are arriving at my design step by step :)
Dr. David Alan Gilbert Nov. 19, 2014, 3:53 p.m. UTC | #44
* Michael S. Tsirkin (mst@redhat.com) wrote:
> On Wed, Nov 19, 2014 at 02:59:12PM +0000, Dr. David Alan Gilbert wrote:
> > * Paolo Bonzini (pbonzini@redhat.com) wrote:
> > > 
> > > 
> > > On 19/11/2014 15:26, Dr. David Alan Gilbert wrote:
> > > > * Paolo Bonzini (pbonzini@redhat.com) wrote:
> > > >>
> > > >>
> > > >> On 19/11/2014 15:13, Dr. David Alan Gilbert wrote:
> > > >>> Since we've wondered off the actual ACPI table stuff into general
> > > >>> ROM sizing, I'd like to propose some concrete fixes:
> > > >>>
> > > >>>   1) We explicitly name the bios file in a .romfile attribute for
> > > >>>      all ROMs.
> > > >>>   2) The code that uses .romfile has an expansion for $MACHINETYPE
> > > >>>   3) We actually symlink all of those together, anyone who wants/has
> > > >>>      to deal with different versions can downstream.
> > > >>>   4) The machine types contain size attributes for the ROMs that
> > > >>>      are generoously larger than the ROMs anyone currently uses.
> > > >>>
> > > >>> I think 1..3 should deal with those of us who have to deal with different
> > > >>> ROM versions on different machine types.
> > > >>
> > > >> It should, but it's a solution in search of a problem.
> > > > 
> > > > Well we already do something close to 1 & 2 downstream but more ad-hoc;
> > > > it's just a generalisation (and 4 from padding the size of our images).
> > > > So we already had that problem.
> > > 
> > > Upstream too.  See pxe-* vs. efi-* NIC option ROMs.  The latter includes
> > > both PXE firmware for BIOS and EFI drivers.  We keep two copies because
> > > they have different sizes.  Having explicit expansions for $MACHINETYPE
> > > would be hugely overkill, in my opinion.
> > 
> > Yes it is, but it's simple and feels easy to understand.
> > 
> > Dave
> 
> I feel it's an implementation detail that really shouldn't
> be pushed out to users.

Define 'users' - I don't see that it's being pushed anywhere except giving
the ROM packagers control to make sure they supply the right thing.
The end user of qemu doesn't have to worry about it.

Dave

> 
> 
> > > 
> > > Paolo
> > > 
> > > >>
> > > >>> 4 might be good enough for the ACPI tables if you can bound it.
> > > >>
> > > >> Already doing that (rounding to 128k, warning if >64k), but it is not a
> > > >> definitive solution.
> > > >>
> > > >> We also do (4) for ROMs, since VGA BIOSes use only 36k out of 64k and
> > > >> iPXE ROMs use only ~200k out of 256k.
> > > >>
> > > >> Paolo
> > > > --
> > > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > > > 
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Stefan Hajnoczi Nov. 19, 2014, 4:08 p.m. UTC | #45
On Wed, Nov 19, 2014 at 03:18:01PM +0100, Juan Quintela wrote:
> Peter Maydell <peter.maydell@linaro.org> wrote:
> > On 19 November 2014 14:07, Juan Quintela <quintela@redhat.com> wrote:
> >> My understanding is that it is a "trick".  We have internal memory for a
> >> device that is needed for the emulation, but not showed to the guest.
> >> And it is big enough that we want to save it during the "live" stage of
> >> migration, so we mark it as RAM.  if it is somekind of cash, we can just
> >> enlarge it on destination, and it don't matter.  If this has anything
> >> different on the other part of the RAM, we are on trouble.
> >
> > Would it be feasible to just have the migration code provide
> > an API for registering things to be migrated in the live
> > migration stage, rather than creating memory regions which
> > you can't actually use for most of the purposes the memory
> > region API exists for?
> 
> If somebody told me what they need, we can do it.
> 
> Stefan, you needed something like that for data-plane?  Or that memory
> is mapped on the guest?

No, dataplane has no special requirements here.

I am working on making the dirty memory bitmap atomic so that dataplane
threads can dirty memory at the same time as other threads.  But that's
a different topic from what you are discussing here.

Stefan
Kevin O'Connor Nov. 19, 2014, 4:27 p.m. UTC | #46
On Wed, Nov 19, 2014 at 02:44:32PM +0100, Paolo Bonzini wrote:
> So:
> 
> qemu-2.0 -M pc-2.0 -> qemu-2.2 -M pc-2.0
> 
>    uses 2.0 BIOS
> 
> qemu-2.2 -M pc-2.0 -> qemu-2.0 -M pc-2.0
> 
>    uses 2.2 BIOS
> 
> Both should work, in general.  BIOS is rarely the reason for
> incompatibilities.  However, breakage can happen, for example I know
> that RHEL7 SeaBIOS does not work on RHEL6.  RHEL6 SeaBIOS works on
> RHEL7, but it needs a couple workarounds.

Do you know why that is?  We do try to make SeaBIOS backwards
compatible with older versions of QEMU.

-Kevin
Juan Quintela Nov. 19, 2014, 4:39 p.m. UTC | #47
Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 19/11/2014 14:57, Juan Quintela wrote:
>> > Shipping a separate BIOS for different machine types is unrealistic and
>> > pointless.  It would also be a good terrain for bug reports, unless you
>> > also do things like "forbid creating -device megasas-gen2 on 2.1 because
>> > it was introduced in 2.2".
>>
>> And I agree with that.  If it got introduced on 2.2, it should not be
>> allowed on pc-2.1.  It just makes things more complicated.  We don't
>> have infrastructure to enforce that.  And I am claining that is the
>> problem.  We are just papering over this problem each time that it
>> happens.  I honestely think that the only way to really fix
>> compatibility is enforcing that machine types are stable.  right now
>> they are now, and we ended nothing it.
>
> Weird, I have bought this USB device last month and I plugged it into a
> two-year-old laptop.
>
>     QEMU version = when did I last update firmware / buy hardware
>     Machine type = when did I buy the computer
>
> I honestly think that you are talking out of design dogma, without
> really thinking through the consequences of the design.

It is not the same, and you know it.
It is the equivalent: I have this aging pc with PCI and I have bought
this PCI-EXpress card, if I use enough force, perhaps it could work.

Enough force here would mean put some soldering here and there, new
chips, blah, blah, blah.  While the machine is running.  When you have a
different solution.  Powerdown machine.  Change machine type.  Boot it
again.  magic!!!!

It is not that I am not giving you one option to fix the problem, it is
a different solution.  Mine don't require changing anything, just forbid
something that now it is allowed, and that we have found difficult to
support.  I would jsut remove the claim that we support it.  I honestly
think that it is a good tradeof, and the only that we can guarantee.

Later, Juan.
Juan Quintela Nov. 19, 2014, 4:45 p.m. UTC | #48
"Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Wed, Nov 19, 2014 at 02:51:38PM +0100, Juan Quintela wrote:

> Actually yes: devices that want this functionality need to call
> the new API.
> At the point where API is called, would be the best place to
> put in an explanation why it should be resizeable.
>
>
>> In general it would not work, even if it
>> works on one particular case.  If they are bigger, it is because device
>> code use that for something.  not necesarely something that can be
>> ignored.
>> 
>> Later, Juan.
>
> Absolutely.  And that is why callers get a callback notifying them about
> resize.
>
> See? You are arriving at my design step by step :)

Then why we ever wonder about assingning the space on the 1st place?
Just got it from the migration stream?

Later, Juan.
Juan Quintela Nov. 19, 2014, 4:50 p.m. UTC | #49
Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 19/11/2014 15:10, Peter Maydell wrote:

> It does already, for example PPC uses it for its IOMMU tables.
>
> But in any case this is really just memory that is auto-resized on
> migration.  And it can work even if it is mapped in memory, as long as
> your resize callback (or some post_load code executed while migrating
> devices) does something useful to update the memory map.  Let's call it
> like what it is.
>
> Basically it is an "I know how to deal with the source's configuration
> whatever it is, so I'm not bothering to reconstruct it equivalently on
> the destination" flag.
>
> The fine print is that it will create more differences between a given
> machine type on different versions of QEMU.  It can have ripple effects
> on the memory map and it can make existing VMs a bit more likely to
> break when updating QEMU.  This is why I do not particularly like it,
> and I posted different patches to do the same thing.  I understand that
> it is a simple fix that will mostly work and probably avoids work more
> than causing it.
>
> And BTW, those patches are still unreviewed,.  Juan, "do as you
> preach"---review patches that are closer to what you preach.  And
> Michael, you too---review patches in addition to asking people to review
> yours, so that we can compare the approaches.

I will review them, for the migration bits.  But my ACPI knowledge is
basically: when I see ACPI, I run in the other direction O:-) (*)

Later, Juan.

(*) I think that this is all that is needed to know about ACPI O:-)
Paolo Bonzini Nov. 19, 2014, 4:56 p.m. UTC | #50
On 19/11/2014 17:39, Juan Quintela wrote:
> Paolo Bonzini <pbonzini@redhat.com> wrote:
>> On 19/11/2014 14:57, Juan Quintela wrote:
>>>> Shipping a separate BIOS for different machine types is unrealistic and
>>>> pointless.  It would also be a good terrain for bug reports, unless you
>>>> also do things like "forbid creating -device megasas-gen2 on 2.1 because
>>>> it was introduced in 2.2".
>>>
>>> And I agree with that.  If it got introduced on 2.2, it should not be
>>> allowed on pc-2.1.  It just makes things more complicated.
>>
>> Weird, I have bought this USB device last month and I plugged it into a
>> two-year-old laptop.
>>
>>     QEMU version = when did I last update firmware / buy hardware
>>     Machine type = when did I buy the computer
> 
> It is not the same, and you know it.
> It is the equivalent: I have this aging pc with PCI and I have bought
> this PCI-EXpress card, if I use enough force, perhaps it could work.

Hmm, no.  My example is megasas-gen2.  You can certainly say "I have
bought this PCI HBA last month and I plugged it into a two-year-old
desktop".

It's irrelevant that we model most PCIe devices we emulate as PCI, just
because our main machine type is PCI.  It's irrelevant because it's the
same for a pc-2.2 machine type, it doesn't depend on the machine type.

There's nothing in an external device that affects the stability of
machine types.

> Enough force here would mean put some soldering here and there, new
> chips, blah, blah, blah.  While the machine is running.

So you've moved the goal post to hotplug after migration from 2.1 to
2.2?  No problem, I can certainly hotplug a new PCI HBA into a
two-year-old running server, if the server supports hotplug.

> It is not that I am not giving you one option to fix the problem, it is
> a different solution.  Mine don't require changing anything, just forbid
> something that now it is allowed, and that we have found difficult to
> support.

Sorry, I think this is not true.  It's hard, yes.  But life is hard.
There's nothing in this that we have found difficult to support.  It's
just code that someone has to write.

Paolo
Paolo Bonzini Nov. 19, 2014, 5:01 p.m. UTC | #51
On 19/11/2014 17:27, Kevin O'Connor wrote:
> On Wed, Nov 19, 2014 at 02:44:32PM +0100, Paolo Bonzini wrote:
>> So:
>>
>> qemu-2.0 -M pc-2.0 -> qemu-2.2 -M pc-2.0
>>
>>    uses 2.0 BIOS
>>
>> qemu-2.2 -M pc-2.0 -> qemu-2.0 -M pc-2.0
>>
>>    uses 2.2 BIOS
>>
>> Both should work, in general.  BIOS is rarely the reason for
>> incompatibilities.  However, breakage can happen, for example I know
>> that RHEL7 SeaBIOS does not work on RHEL6.  RHEL6 SeaBIOS works on
>> RHEL7, but it needs a couple workarounds.
> 
> Do you know why that is?  We do try to make SeaBIOS backwards
> compatible with older versions of QEMU.

For RHEL6-on-RHEL7 SeaBIOS was violating the virtio-scsi spec, but QEMU
happened to support the broken case until recently.  When I was told
that the broken case stopped working, I added a workaround.

I don't know why RHEL7 SeaBIOS does not work on RHEL6.  But note that
it's a really old version (0.12).  I guess one could try compiling
various QEMU versions and see where it broke, and then either debug or
bisect further.  RHEL7 uses 1.5.3.

Paolo
Paolo Bonzini Nov. 19, 2014, 6:28 p.m. UTC | #52
On 19/11/2014 17:45, Juan Quintela wrote:
>> > Absolutely.  And that is why callers get a callback notifying them about
>> > resize.
>> >
>> > See? You are arriving at my design step by step :)
> Then why we ever wonder about assingning the space on the 1st place?
> Just got it from the migration stream?

Just because then we have fewer "if (!incoming_migration())"s.

Paolo
Gerd Hoffmann Nov. 20, 2014, 8:12 a.m. UTC | #53
Hi,

> I don't know why RHEL7 SeaBIOS does not work on RHEL6.  But note that
> it's a really old version (0.12).

Hmm, works for me on a quick smoke test.  Do you remember what exactly
broke and which version it was?  Maybe the 1.7.2 -> 1.7.5 update fixed
it?

Or was it live-migration by chance?  rhel6 qemu doesn't emulate pam
registers correctly, and seabios has a workaround for that.  So
live-migrating between versions with and without correct pam emulation
creates some *ahem* very interesting corner cases ...

cheers,
  Gerd
Paolo Bonzini Nov. 20, 2014, 10 a.m. UTC | #54
On 20/11/2014 09:12, Gerd Hoffmann wrote:
>   Hi,
> 
>> I don't know why RHEL7 SeaBIOS does not work on RHEL6.  But note that
>> it's a really old version (0.12).
> 
> Hmm, works for me on a quick smoke test.  Do you remember what exactly
> broke and which version it was?  Maybe the 1.7.2 -> 1.7.5 update fixed
> it?

Possibly.  I tested 1.7.2 about a month ago and it failed.

> Or was it live-migration by chance?  rhel6 qemu doesn't emulate pam
> registers correctly, and seabios has a workaround for that.  So
> live-migrating between versions with and without correct pam emulation
> creates some *ahem* very interesting corner cases ...

No, it was not live migration.  Laszlo found it when he was trying to
identify a live migration bug as a QEMU bug or SeaBIOS bug; but he
didn't even get to the point of doing live migration. :)

Paolo
Markus Armbruster Nov. 20, 2014, 1:35 p.m. UTC | #55
"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, Nov 19, 2014 at 11:16:57AM +0100, Markus Armbruster wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>> > On Wed, Nov 19, 2014 at 10:19:22AM +0100, Juan Quintela wrote:
>> >> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>> >> > On Tue, Nov 18, 2014 at 07:03:58AM +0100, Paolo Bonzini wrote:
>> >> >> 
>> >> >> 
>> >> >> On 17/11/2014 21:08, Michael S. Tsirkin wrote:
>> >> >> > Add API to manage on-device RAM.
>> >> >> > This looks just like regular RAM from migration POV,
>> >> >> > but has two special properties internally:
>> >> >> > 
>> >> >> >     - block is sized on migration, making it easier to extend
>> >> >> >       without breaking migration compatibility or wasting
>> >> >> >       virtual memory
>> >> >> >     - callers must specify an upper bound on size
>> >> >> 
>> >> 
>> >> 
>> >> >> Also, I am afraid that this design could make it easier to introduce
>> >> >> backwards-incompatible changes.
>> >> >
>> >> >
>> >> > Well the point is exactly to make it easy to make *compatible*
>> >> > changes.
>> >> >
>> >> > As I mentioned in the cover letter, it's not just ACPI.
>> >> > For example, we now change boot index dynamically.
>> >> > People using large fw cfg blobs, e.g. -initrd, would benefit from
>> >> > ability to change the blob dynamically.
>> >> > There could be other examples.
>> >> 
>> >> changing the size of the initrd, on the fly and wanting to migrate?  Is
>> >> that a real use case?  One that we should really care?
>> >
>> > I'm not sure.
>> >
>> > At the moment one can swap kernels by doing halt in guest and
>> > restarting with the new one.
>> >
>> > If we wanted to allow reboot in guest to bring a new kernel instead,
>> > that would be one way to implement it.
>> >
>> > I was merely pointing out that the capability might find other uses.
>> >
>> >
>> >> >>  I very much prefer to have
>> >> >> user-controlled ACPI information (coming from the command-line)
>> >> >> byte-for-byte identical for a given machine type.  Patches for that have
>> >> >> been on the list for almost two months, and it's not nice.
>> >> >> 
>> >> >> Paolo
>> >> >
>> >> > I guess we just disagree on whether these patches will
>> >> > effectively achieve
>> >> > this goal.  For example, some people want to rewrite iasl bits,
>> >> > generating everything in C. This will affect static bits too.
>> >> > I don't want to make every single change in code conditional
>> >> > on a machine type.
>> >> 
>> >> You can't migrate with a different BIOS on destination, period.
>> >
>> > This claim is very wrong.
>> > This would make is impossible to change BIOS bus without breaking
>> > migration.  Look at history of qemu, we change BIOS every release.
>> 
>> Since migration doesn't transport configuration, we require a compatibly
>> configured target, and that includes identical memory sizes.  RAM size
>> is explicit and the user's problem.  ROM size is generally implicit, and
>> we use machine type compatibility machinery to keep it fixed.  BIOS
>> changes can break migration only when we screw up or forget the
>> compatibility machinery.  Same as for lots of other stuff.  No big deal,
>> really, just a consequence of not migrating configuration.
>
> You don't get to maintain it, so it's no big deal for you.
>
> I see pain every single release and code is becoming spaghetty-like very
> quickly.  We thought it would work. It does not.  We do need a solution.
>
> And the pain is completely self-inflicted: we already migrate
> all necessary information!
> It's just a question of adjusting our datastructures to it.
>
>
>
>> >>  That is
>> >> what is making this whole issue complicated.  We have two clear options:
>> >> 
>> >> a- require BIOS & memory regions to be exactly the same in both sides.
>> >>    No need to add compat machinery.
>> >> b- trying to accomodate any potential change that could appear and use
>> >>    the same BIOS.
>> >> 
>> >> IMHO, b) is just asking for trouble.  Being able to go from random
>> >> changes to random changes look strange.
>> >
>> > Yes, it is hard to support.
>> > But it's a required feature, and in fact, it's an existing one.
>> >
>> >> Just think about it for a second.  We are sending more data for some
>> >> regions that it was allocated.  And we just grow the regions and expect
>> >> that everything is going to be ok.  It is me, or this goes against every
>> >> security discipline that I can think of?
>> >> 
>> >> Later, Juan.
>> >
>> > We have many devices that just get N from stream, do malloc(N), then read
>> > data from stream into it.
>> > You think it's unsafe? Go ahead and fix them all.
>> >
>> > However, my patch does address your concern: callers specify the upper
>> > limit on the region size.
>> > Trying to migrate in a 1Gbyte region
>> 
>> Are you proposing to make incoming migration adjust some or all memory
>> sizes on the target from "whatever was configured during startup" to
>> "whatever is configured on the source"?
>
> Yes.
>
> At the moment, I only propose this for internal on-device RAM,
> for the simple reason that users don't know or care about it.
> So migrating it just removes maintainance pain.
>
> It wouldn't be hard to extend it for user-specified RAM,
> but I don't know whether that is useful.
>
>> Possibly with some limitations,
>> such as "can only adjust downwards"?
>
> Yes.
>
> "Can adjust downwards" is too limiting, since especially downstreams
> want two-way migration to work.
> So I just have devices specify an upper limit.

Okay, I now have a better understanding of what you're trying to do.

The general nice-to-have feature is "I don't want to repeat the source
configuration on the target manually, I want the target get it from the
source".

Since that's a tough nut to crack, you're proposing to do a special
case: get a few selected memory sizes "that users don't know or care
about" from the source.  We get their contents anyway, so why not the
size.  Implementation detail: involves resizing memory on the target.

The problem I have with this: in the physical world, memory sizes don't
change.  You can reflash your BIOS, but to get a bigger flash chip, you
have to replace the motherboard.  Translated to virtual, this means the
size of the flash chip is tied to the machine type.

Likewise for other devices containing memory.

Furthermore, I'm struggling to see why coping memory sizes tied to
machine type is hard.  Pick a sane flash size, leaving ample space for
bug fixes.  Next machine type can pick a bigger size if it really needs
so much more space for new features.  If we pick our sizes stingily, we
may end up with a number of sizes tied to machine types.  Not exactly
pretty, but it's what we've been doing for a while, and it works.  If
you want fewer sizes, start picking sizes more generously.

What am I missing here that can justify the complexity of partially
overriding target configuration in the migration stream plus
infrastructure for resizing memory?
Michael S. Tsirkin Nov. 20, 2014, 2:04 p.m. UTC | #56
On Thu, Nov 20, 2014 at 02:35:14PM +0100, Markus Armbruster wrote:
> What am I missing here that can justify the complexity of partially
> overriding target configuration in the migration stream plus
> infrastructure for resizing memory?

The justification is that sizing it properly is an unsolved problem.

The difference with real hardware is that size of the flash depends
dynamically on the machine configuration.  And it's drastic: you can
have from 1 to 256 CPUs, 0 to 256 PCI bridges on each root, etc.

And I do believe the infrastructure will be handy for other
things.  For example, boot order ROM is now dynamic too,
with enough bootable devices it will start overflowing a page
and then we will have the same problem.

And the patchset is all of 150 lines with comments, the point
is that everything follows the same path: it's
enough to test one cross-version migration, e.g. 2.1->2.3 or whatever,
to make sure resizing works properly. Unlike extra modes which need
testing of each machine type with each guest.
Paolo Bonzini Nov. 24, 2014, 1:48 p.m. UTC | #57
On 20/11/2014 15:04, Michael S. Tsirkin wrote:
> On Thu, Nov 20, 2014 at 02:35:14PM +0100, Markus Armbruster wrote:
>> What am I missing here that can justify the complexity of partially
>> overriding target configuration in the migration stream plus
>> infrastructure for resizing memory?
> 
> The justification is that sizing it properly is an unsolved problem.

Not really: sizing it properly is a tedious thing to do, but it is not a
problem at all.  We _almost_ get it right, only the DSDT can change from
one version to the next right now.  And patches exist to pad the DSDT
adequately; those patches are simpler than these ones.

I am okay with considering it "too tedious to spend more time on it",
especially because the size of the ACPI tables already changed
arbitrarily from one version to the other when it was computed in the
firmware.  On the other hand, one could also say that being able to size
tables properly is an _advantage_ of doing tables in QEMU.

I was expecting little opposition and thus thinking it was not
worthwhile to discuss the approach to take.  Apparently there _is_
opposition, so I think we should reconsider my patches.

Paolo
diff mbox

Patch

diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index 62f5581..26eb9b2 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -299,11 +299,15 @@  CPUArchState *cpu_copy(CPUArchState *env);
 
 /* memory API */
 
-typedef struct RAMBlock {
+typedef struct RAMBlock RAMBlock;
+
+struct RAMBlock {
     struct MemoryRegion *mr;
     uint8_t *host;
     ram_addr_t offset;
     ram_addr_t length;
+    ram_addr_t max_length;
+    void (*resized)(const char*, uint64_t length, void *host);
     uint32_t flags;
     char idstr[256];
     /* Reads can take either the iothread or the ramlist lock.
@@ -311,7 +315,7 @@  typedef struct RAMBlock {
      */
     QTAILQ_ENTRY(RAMBlock) next;
     int fd;
-} RAMBlock;
+};
 
 static inline void *ramblock_ptr(RAMBlock *block, ram_addr_t offset)
 {
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index d7e5238..72ab12b 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -28,12 +28,19 @@  ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
 ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                    MemoryRegion *mr, Error **errp);
 ram_addr_t qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp);
+ram_addr_t qemu_ram_alloc_device(ram_addr_t size, ram_addr_t max_size,
+                                 void (*resized)(const char*,
+                                                 uint64_t length,
+                                                 void *host),
+                                 MemoryRegion *mr, Error **errp);
 int qemu_get_ram_fd(ram_addr_t addr);
 void *qemu_get_ram_block_host_ptr(ram_addr_t addr);
 void *qemu_get_ram_ptr(ram_addr_t addr);
 void qemu_ram_free(ram_addr_t addr);
 void qemu_ram_free_from_ptr(ram_addr_t addr);
 
+int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp);
+
 static inline bool cpu_physical_memory_get_dirty(ram_addr_t start,
                                                  ram_addr_t length,
                                                  unsigned client)
diff --git a/exec.c b/exec.c
index 9648669..a177816 100644
--- a/exec.c
+++ b/exec.c
@@ -75,6 +75,11 @@  static MemoryRegion io_mem_unassigned;
 /* RAM is mmap-ed with MAP_SHARED */
 #define RAM_SHARED     (1 << 1)
 
+/* On-device RAM allocated with g_malloc: supports realloc,
+ * not accessible to vcpu on kvm.
+ */
+#define RAM_DEVICE     (1 << 2)
+
 #endif
 
 struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
@@ -1186,7 +1191,7 @@  static ram_addr_t find_ram_offset(ram_addr_t size)
     QTAILQ_FOREACH(block, &ram_list.blocks, next) {
         ram_addr_t end, next = RAM_ADDR_MAX;
 
-        end = block->offset + block->length;
+        end = block->offset + block->max_length;
 
         QTAILQ_FOREACH(next_block, &ram_list.blocks, next) {
             if (next_block->offset >= end) {
@@ -1214,7 +1219,7 @@  ram_addr_t last_ram_offset(void)
     ram_addr_t last = 0;
 
     QTAILQ_FOREACH(block, &ram_list.blocks, next)
-        last = MAX(last, block->offset + block->length);
+        last = MAX(last, block->offset + block->max_length);
 
     return last;
 }
@@ -1296,6 +1301,50 @@  static int memory_try_enable_merging(void *addr, size_t len)
     return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
 }
 
+int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp)
+{
+    RAMBlock *block = find_ram_block(base);
+
+    assert(block);
+
+    if (block->length == newsize) {
+        return 0;
+    }
+
+    if (!(block->flags & RAM_DEVICE)) {
+        error_setg_errno(errp, EINVAL,
+                         "Length mismatch: %s: 0x" RAM_ADDR_FMT
+                         " in != 0x" RAM_ADDR_FMT, block->idstr,
+                         newsize, block->length);
+        return -EINVAL;
+    }
+
+    if (block->max_length < newsize) {
+        error_setg_errno(errp, EINVAL,
+                         "Length too large: %s: 0x" RAM_ADDR_FMT
+                         " > 0x" RAM_ADDR_FMT, block->idstr,
+                         newsize, block->max_length);
+        return -EINVAL;
+    }
+
+    block->host = g_realloc(block->host, newsize);
+    if (!block->host) {
+        error_setg_errno(errp, errno,
+                         "cannot allocate guest memory '%s'",
+                         memory_region_name(block->mr));
+        return -ENOMEM;
+    }
+
+    cpu_physical_memory_clear_dirty_range_nocode(block->offset, block->length);
+    block->length = newsize;
+    memset(block->host, 0, block->length);
+    cpu_physical_memory_set_dirty_range_nocode(block->offset, block->length);
+    if (block->resized) {
+        block->resized(block->idstr, newsize, block->host);
+    }
+    return 0;
+}
+
 static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
 {
     RAMBlock *block;
@@ -1308,7 +1357,16 @@  static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
     new_block->offset = find_ram_offset(new_block->length);
 
     if (!new_block->host) {
-        if (xen_enabled()) {
+        if (new_block->flags & RAM_DEVICE) {
+            new_block->host = g_malloc0(new_block->length);
+            if (!new_block->host) {
+                error_setg_errno(errp, errno,
+                                 "cannot allocate guest memory '%s'",
+                                 memory_region_name(new_block->mr));
+                qemu_mutex_unlock_ramlist();
+                return -1;
+            }
+        } else if (xen_enabled()) {
             xen_ram_alloc(new_block->offset, new_block->length, new_block->mr);
         } else {
             new_block->host = phys_mem_alloc(new_block->length,
@@ -1352,12 +1410,14 @@  static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
     }
     cpu_physical_memory_set_dirty_range(new_block->offset, new_block->length);
 
-    qemu_ram_setup_dump(new_block->host, new_block->length);
-    qemu_madvise(new_block->host, new_block->length, QEMU_MADV_HUGEPAGE);
-    qemu_madvise(new_block->host, new_block->length, QEMU_MADV_DONTFORK);
+    if (!(new_block->flags & RAM_DEVICE)) {
+        qemu_ram_setup_dump(new_block->host, new_block->length);
+        qemu_madvise(new_block->host, new_block->length, QEMU_MADV_HUGEPAGE);
+        qemu_madvise(new_block->host, new_block->length, QEMU_MADV_DONTFORK);
 
-    if (kvm_enabled()) {
-        kvm_setup_guest_memory(new_block->host, new_block->length);
+        if (kvm_enabled()) {
+            kvm_setup_guest_memory(new_block->host, new_block->length);
+        }
     }
 
     return new_block->offset;
@@ -1392,6 +1452,7 @@  ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
     new_block = g_malloc0(sizeof(*new_block));
     new_block->mr = mr;
     new_block->length = size;
+    new_block->max_length = size;
     new_block->flags = share ? RAM_SHARED : 0;
     new_block->host = file_ram_alloc(new_block, size,
                                      mem_path, errp);
@@ -1410,7 +1471,12 @@  ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
 }
 #endif
 
-ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
+static
+ram_addr_t qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
+                                   void (*resized)(const char*,
+                                                   uint64_t length,
+                                                   void *host),
+                                   void *host, bool device,
                                    MemoryRegion *mr, Error **errp)
 {
     RAMBlock *new_block;
@@ -1418,14 +1484,21 @@  ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
     Error *local_err = NULL;
 
     size = TARGET_PAGE_ALIGN(size);
+    max_size = TARGET_PAGE_ALIGN(max_size);
     new_block = g_malloc0(sizeof(*new_block));
     new_block->mr = mr;
+    new_block->resized = resized;
     new_block->length = size;
+    new_block->max_length = max_size;
+    assert(max_size >= size);
     new_block->fd = -1;
     new_block->host = host;
     if (host) {
         new_block->flags |= RAM_PREALLOC;
     }
+    if (device) {
+        new_block->flags |= RAM_DEVICE;
+    }
     addr = ram_block_add(new_block, &local_err);
     if (local_err) {
         g_free(new_block);
@@ -1435,9 +1508,24 @@  ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
     return addr;
 }
 
+ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
+                                   MemoryRegion *mr, Error **errp)
+{
+    return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
+}
+
 ram_addr_t qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
 {
-    return qemu_ram_alloc_from_ptr(size, NULL, mr, errp);
+    return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
+}
+
+ram_addr_t qemu_ram_alloc_device(ram_addr_t size, ram_addr_t maxsz,
+                                   void (*resized)(const char*,
+                                                   uint64_t length,
+                                                   void *host),
+                                   MemoryRegion *mr, Error **errp)
+{
+    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
 }
 
 void qemu_ram_free_from_ptr(ram_addr_t addr)
@@ -1471,6 +1559,8 @@  void qemu_ram_free(ram_addr_t addr)
             ram_list.version++;
             if (block->flags & RAM_PREALLOC) {
                 ;
+            } else if (block->flags & RAM_DEVICE) {
+                g_free(block->host);
             } else if (xen_enabled()) {
                 xen_invalidate_map_cache_entry(block->host);
 #ifndef _WIN32
@@ -1501,7 +1591,8 @@  void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
         offset = addr - block->offset;
         if (offset < block->length) {
             vaddr = ramblock_ptr(block, offset);
-            if (block->flags & RAM_PREALLOC) {
+            if (block->flags & RAM_PREALLOC ||
+                block->flags & RAM_DEVICE) {
                 ;
             } else if (xen_enabled()) {
                 abort();