
[13/18] arch_init: adjust ram_save_setup() for migrate_is_localhost

Message ID 1377069536-12658-14-git-send-email-lilei@linux.vnet.ibm.com
State New

Commit Message

Lei Li Aug. 21, 2013, 7:18 a.m. UTC
Send all the RAM blocks via the save_page hook, which will copy each
RAM page and MADV_DONTNEED the page just copied.

Signed-off-by: Lei Li <lilei@linux.vnet.ibm.com>
---
 arch_init.c |   19 +++++++++++++------
 1 files changed, 13 insertions(+), 6 deletions(-)
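
For readers new to the technique, here is a minimal sketch of the
page-flipping idea behind the save_page hook.  It is illustrative only, not
the actual patch: the function name, signature, and destination buffer are
assumptions; only the copy-then-MADV_DONTNEED sequence comes from the commit
message above.

    #include <string.h>
    #include <sys/mman.h>

    /* Hypothetical localhost save_page hook: copy the page out, then tell
     * the kernel the source copy is no longer needed.  A later guest access
     * to the page faults in a fresh zero page. */
    static int localhost_save_page(void *dst, void *host_addr, size_t page_size)
    {
        memcpy(dst, host_addr, page_size);
        if (madvise(host_addr, page_size, MADV_DONTNEED) < 0) {
            return -1;   /* keep the page mapped on failure */
        }
        return 0;
    }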

Comments

Paolo Bonzini Aug. 21, 2013, 10:48 a.m. UTC | #1
On 21/08/2013 09:18, Lei Li wrote:
> Send all the RAM blocks via the save_page hook, which will copy each
> RAM page and MADV_DONTNEED the page just copied.

You should implement this entirely in the hook.

It will be a little less efficient because of the dirty bitmap overhead,
but you should aim at having *zero* changes in arch_init.c and migration.c.

Paolo

> 
> Signed-off-by: Lei Li <lilei@linux.vnet.ibm.com>
> ---
>  arch_init.c |   19 +++++++++++++------
>  1 files changed, 13 insertions(+), 6 deletions(-)
> 
> diff --git a/arch_init.c b/arch_init.c
> index 434a4ca..cbbb4db 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -474,7 +474,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
>              /* In doubt sent page as normal */
>              bytes_sent = -1;
>              ret = ram_control_save_page(f, block->offset,
> -                               offset, TARGET_PAGE_SIZE, &bytes_sent);
> +                                        offset, TARGET_PAGE_SIZE, &bytes_sent);
>  
>              if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
>                  if (ret != RAM_SAVE_CONTROL_DELAYED) {
> @@ -613,11 +613,13 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>      RAMBlock *block;
>      int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
>  
> -    migration_bitmap = bitmap_new(ram_pages);
> -    bitmap_set(migration_bitmap, 0, ram_pages);
> -    migration_dirty_pages = ram_pages;
> -    mig_throttle_on = false;
> -    dirty_rate_high_cnt = 0;
> +    if (!migrate_is_localhost()) {
> +        migration_bitmap = bitmap_new(ram_pages);
> +        bitmap_set(migration_bitmap, 0, ram_pages);
> +        migration_dirty_pages = ram_pages;
> +        mig_throttle_on = false;
> +        dirty_rate_high_cnt = 0;
> +    }
>  
>      if (migrate_use_xbzrle()) {
>          XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
> @@ -641,6 +643,11 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>      migration_bitmap_sync();
>      qemu_mutex_unlock_iothread();
>  
> +    if (migrate_is_localhost()) {
> +        ram_save_blocks(f);
> +        return 0;
> +    }
> +
>      qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
>  
>      QTAILQ_FOREACH(block, &ram_list.blocks, next) {
>
Lei Li Aug. 23, 2013, 6:25 a.m. UTC | #2
On 08/21/2013 06:48 PM, Paolo Bonzini wrote:
> On 21/08/2013 09:18, Lei Li wrote:
>> Send all the RAM blocks via the save_page hook, which will copy each
>> RAM page and MADV_DONTNEED the page just copied.
> You should implement this entirely in the hook.
>
> It will be a little less efficient because of the dirty bitmap overhead,
> but you should aim at having *zero* changes in arch_init.c and migration.c.

Yes, the reason I modified migration_thread() to use a new process that sends
all the RAM pages in an adjusted qemu_savevm_state_begin stage and sends the
device states in the qemu_savevm_device_state stage for localhost migration is
to avoid the bitmap overhead, which would make things a little less efficient,
just as you mentioned above.

Performance assurance is very important to this feature; our goal is 100ms
of downtime for a 1TB guest.

>
> Paolo
>
>> Signed-off-by: Lei Li <lilei@linux.vnet.ibm.com>
>> ---
>>   arch_init.c |   19 +++++++++++++------
>>   1 files changed, 13 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch_init.c b/arch_init.c
>> index 434a4ca..cbbb4db 100644
>> --- a/arch_init.c
>> +++ b/arch_init.c
>> @@ -474,7 +474,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
>>               /* In doubt sent page as normal */
>>               bytes_sent = -1;
>>               ret = ram_control_save_page(f, block->offset,
>> -                               offset, TARGET_PAGE_SIZE, &bytes_sent);
>> +                                        offset, TARGET_PAGE_SIZE, &bytes_sent);
>>   
>>               if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
>>                   if (ret != RAM_SAVE_CONTROL_DELAYED) {
>> @@ -613,11 +613,13 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>>       RAMBlock *block;
>>       int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
>>   
>> -    migration_bitmap = bitmap_new(ram_pages);
>> -    bitmap_set(migration_bitmap, 0, ram_pages);
>> -    migration_dirty_pages = ram_pages;
>> -    mig_throttle_on = false;
>> -    dirty_rate_high_cnt = 0;
>> +    if (!migrate_is_localhost()) {
>> +        migration_bitmap = bitmap_new(ram_pages);
>> +        bitmap_set(migration_bitmap, 0, ram_pages);
>> +        migration_dirty_pages = ram_pages;
>> +        mig_throttle_on = false;
>> +        dirty_rate_high_cnt = 0;
>> +    }
>>   
>>       if (migrate_use_xbzrle()) {
>>           XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
>> @@ -641,6 +643,11 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>>       migration_bitmap_sync();
>>       qemu_mutex_unlock_iothread();
>>   
>> +    if (migrate_is_localhost()) {
>> +        ram_save_blocks(f);
>> +        return 0;
>> +    }
>> +
>>       qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
>>   
>>       QTAILQ_FOREACH(block, &ram_list.blocks, next) {
>>
>
Paolo Bonzini Aug. 23, 2013, 7:48 a.m. UTC | #3
On 23/08/2013 08:25, Lei Li wrote:
> On 08/21/2013 06:48 PM, Paolo Bonzini wrote:
>> On 21/08/2013 09:18, Lei Li wrote:
>>> Send all the RAM blocks via the save_page hook, which will copy each
>>> RAM page and MADV_DONTNEED the page just copied.
>> You should implement this entirely in the hook.
>>
>> It will be a little less efficient because of the dirty bitmap overhead,
>> but you should aim at having *zero* changes in arch_init.c and
>> migration.c.
> 
> Yes, the reason I modified migration_thread() to use a new process that sends
> all the RAM pages in an adjusted qemu_savevm_state_begin stage and sends the
> device states in the qemu_savevm_device_state stage for localhost migration is
> to avoid the bitmap overhead, which would make things a little less efficient,
> just as you mentioned above.
> 
> Performance assurance is very important to this feature; our goal is 100ms
> of downtime for a 1TB guest.

Do not _start_ by introducing encapsulation violations all over the place.

Juan has been working on optimizing the dirty bitmap code.  His patches
could introduce a speedup of a factor of up to 64.  Thus it is possible
that his work will help you enough that you can work with the dirty bitmap.
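
The intuition for that factor (an illustrative guess, not Juan's actual
patches): if the bitmap is scanned one unsigned long at a time, a single
compare can skip 64 clean pages at once.

    #include <stdint.h>

    /* Find the first dirty page, testing 64 pages per iteration. */
    static long first_dirty(const uint64_t *bmap, long nbits)
    {
        for (long i = 0; i < nbits / 64; i++) {
            if (bmap[i]) {   /* one compare covers 64 pages */
                return i * 64 + __builtin_ctzll(bmap[i]);
            }
        }
        return -1;   /* all clean */
    }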

Also, this feature (not looking at the dirty bitmap if the machine is
stopped) is not limited to localhost migration; add it later once the
basic vmsplice plumbing is in place.  This will also let you profile the
code and understand whether the goal is attainable.

I honestly doubt that 100ms of downtime is possible while the machine is
stopped.  A 1TB guest has 2^28 = 268*10^6 pages, which you want to
process in 100*10^6 nanoseconds.  Thus, your approach would require 0.4
nanoseconds per page, or roughly 2 clock cycles per page.  This is
impossible without _massive_ parallelization at all levels, starting
from the kernel.

As a matter of fact, 2^28 madvise system calls will take much, much
longer than 100ms.
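
Spelled out, for anyone who wants to check the numbers (a standalone
back-of-the-envelope program; the 3 GHz clock is an assumed figure):

    #include <stdio.h>

    int main(void)
    {
        double pages = (double)(1ULL << 40) / 4096; /* 1TB / 4KB = 2^28 */
        double budget_ns = 100e6;                   /* 100ms downtime */
        double ns_per_page = budget_ns / pages;     /* ~0.37 ns */

        printf("%.0f pages, %.2f ns/page, %.1f cycles/page at 3 GHz\n",
               pages, ns_per_page, ns_per_page * 3.0);
        return 0;
    }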

Have you thought of using shared memory (with -mem-path) instead of vmsplice?

Paolo
Alex Bligh Aug. 23, 2013, 7:57 a.m. UTC | #4
--On 23 August 2013 09:48:42 +0200 Paolo Bonzini <pbonzini@redhat.com> 
wrote:

> As a matter of fact, 2^28 madvise system calls will take much, much
> longer than 100ms.

Probably a stupid question, but why would you need to do one call per
page? It takes a 'size_t length' parameter.
Paolo Bonzini Aug. 23, 2013, 8:06 a.m. UTC | #5
On 23/08/2013 09:57, Alex Bligh wrote:
> 
> 
> --On 23 August 2013 09:48:42 +0200 Paolo Bonzini <pbonzini@redhat.com>
> wrote:
> 
>> As a matter of fact, 2^28 madvise system calls will take much, much
>> longer than 100ms.
> 
> Probably a stupid question, but why would you need to do one call per
> page? It takes a 'size_t length' parameter.

Right now migration is done a page at a time, and so is madvise AFAIU.
However, even with a larger length parameter I suspect it would alone
take more than 2 cycles per page.
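
For comparison, the two shapes of the call (illustrative helpers, not QEMU
code; drop_pages and drop_block are made-up names):

    #include <stddef.h>
    #include <sys/mman.h>

    /* Per-page: one syscall per 4K page; with 2^28 pages the syscall
     * overhead alone blows any 100ms budget. */
    static void drop_pages(char *host, size_t len, size_t page)
    {
        for (size_t off = 0; off < len; off += page) {
            madvise(host + off, page, MADV_DONTNEED);
        }
    }

    /* Ranged, as Alex suggests: one syscall per RAM block, though the
     * kernel still walks and frees every page inside the range. */
    static void drop_block(char *host, size_t len)
    {
        madvise(host, len, MADV_DONTNEED);
    }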

So one way to do this could be to add a flag to "migrate" that would
migrate devices only, and use shared memory in both the source and the
target.

There is still a problem, because we must make sure the destination
doesn't write to memory (e.g. by loading firmware) when initializing the
board, because that would overwrite the memory of the running instance.
But it looks more promising than page flipping.

Paolo
Lei Li Aug. 23, 2013, 9 a.m. UTC | #6
On 08/23/2013 03:48 PM, Paolo Bonzini wrote:
> On 23/08/2013 08:25, Lei Li wrote:
>> On 08/21/2013 06:48 PM, Paolo Bonzini wrote:
>>> On 21/08/2013 09:18, Lei Li wrote:
>>>> Send all the RAM blocks via the save_page hook, which will copy each
>>>> RAM page and MADV_DONTNEED the page just copied.
>>> You should implement this entirely in the hook.
>>>
>>> It will be a little less efficient because of the dirty bitmap overhead,
>>> but you should aim at having *zero* changes in arch_init.c and
>>> migration.c.
>> Yes, the reason I modified migration_thread() to use a new process that sends
>> all the RAM pages in an adjusted qemu_savevm_state_begin stage and sends the
>> device states in the qemu_savevm_device_state stage for localhost migration is
>> to avoid the bitmap overhead, which would make things a little less efficient,
>> just as you mentioned above.
>>
>> Performance assurance is very important to this feature; our goal is 100ms
>> of downtime for a 1TB guest.
> Do not _start_ by introducing encapsulation violations all over the place.
>
> Juan has been working on optimizing the dirty bitmap code.  His patches
> could introduce a speedup of a factor of up to 64.  Thus it is possible
> that his work will help you enough that you can work with the dirty bitmap.
>
> Also, this feature (not looking at the dirty bitmap if the machine is
> stopped) is not limited to localhost migration; add it later once the
> basic vmsplice plumbing is in place.  This will also let you profile the
> code and understand whether the goal is attainable.
>
> I honestly doubt that 100ms of downtime is possible while the machine is
> stopped.  A 1TB guest has 2^28 = 268*10^6 pages, which you want to
> process in 100*10^6 nanoseconds.  Thus, your approach would require 0.4
> nanoseconds per page, or roughly 2 clock cycles per page.  This is
> impossible without _massive_ parallelization at all levels, starting
> from the kernel.
>
> As a matter of fact, 2^28 madvise system calls will take much, much
> longer than 100ms.
>
> Have you thought of using shared memory (with -mem-path) instead of vmsplice?

Precisely!

Well, as Anthony mentioned in version 1[1], there has been some work on
improving vmsplice() on the kernel side by Robert Jennings[2].

And yes, shared memory is an alternative; I think the problem with shared
memory is that it can't share anonymous memory. For this maybe Anthony can
chime in, as the original idea was his.  :-)


Reference links:

[1] Anthony's comments:
   https://lists.gnu.org/archive/html/qemu-devel/2013-06/msg02577.html

[2] vmsplice support for zero-copy gifting of pages:
   http://comments.gmane.org/gmane.linux.kernel.mm/103998

>
> Paolo
>
Paolo Bonzini Aug. 23, 2013, 9:12 a.m. UTC | #7
On 23/08/2013 11:00, Lei Li wrote:
>>> Performance assurance is very important to this feature; our goal is
>>> 100ms of downtime for a 1TB guest.
>>
>> I honestly doubt that 100ms of downtime is possible while the machine is
>> stopped.  A 1TB guest has 2^28 = 268*10^6 pages, which you want to
>> process in 100*10^6 nanoseconds.  Thus, your approach would require 0.4
>> nanoseconds per page, or roughly 2 clock cycles per page.  This is
>> impossible without _massive_ parallelization at all levels, starting
>> from the kernel.
>>
>> Have you thought of using shared memory (with -mem-path) instead of
>> vmsplice?
> 
> Precisely!
> 
> Well, as Anthony mentioned in version 1[1], there has been some work on
> improving vmsplice() on the kernel side by Robert Jennings[2].

Oh, finally!

> And yes, shared memory is an alternative; I think the problem with shared
> memory is that it can't share anonymous memory. For this maybe Anthony can
> chime in, as the original idea was his.  :-)

You could perhaps switch from normal to shared memory while the VM is
running?  Either use the dirty bitmap for this, or mmap the shared
memory "in place" on top of the anonymous memory.  Since each chunk needs
to be copied while the VM is stopped, you can do it perhaps 1G at a time
and let the VM run for some time before doing the next 1G.

The nice part is that because the VM is running, you can do it as slow
as you want. :)  Only the final flush and device save affects downtime.
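
A minimal sketch of the "mmap in place" step, assuming a Linux tmpfs-backed
fd already ftruncate()d to the full RAM size (switch_chunk_to_shared is a
made-up helper with minimal error handling):

    #include <sys/mman.h>
    #include <unistd.h>

    /* Convert one chunk of anonymous guest RAM to a shared mapping.
     * Must run while the VM is stopped so the copy cannot race with
     * guest writes. */
    static int switch_chunk_to_shared(char *guest_ram, off_t offset,
                                      size_t chunk, int fd)
    {
        /* Preserve the current contents in the backing file... */
        if (pwrite(fd, guest_ram + offset, chunk, offset) != (ssize_t)chunk) {
            return -1;
        }
        /* ...then atomically replace the anonymous pages with a shared
         * mapping of the same data; MAP_FIXED unmaps the old range. */
        if (mmap(guest_ram + offset, chunk, PROT_READ | PROT_WRITE,
                 MAP_SHARED | MAP_FIXED, fd, offset) == MAP_FAILED) {
            return -1;
        }
        return 0;
    }

The destination could then map the same file MAP_SHARED, so only device
state has to cross the migration channel.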

Paolo
> 
> Reference links:
> 
> [1] Anthony's comments:
>   https://lists.gnu.org/archive/html/qemu-devel/2013-06/msg02577.html
> 
> [2] vmsplice support for zero-copy gifting of pages:
>   http://comments.gmane.org/gmane.linux.kernel.mm/103998
> 
>>
>> Paolo
>>
> 
>

Patch

diff --git a/arch_init.c b/arch_init.c
index 434a4ca..cbbb4db 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -474,7 +474,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
             /* In doubt sent page as normal */
             bytes_sent = -1;
             ret = ram_control_save_page(f, block->offset,
-                               offset, TARGET_PAGE_SIZE, &bytes_sent);
+                                        offset, TARGET_PAGE_SIZE, &bytes_sent);
 
             if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
                 if (ret != RAM_SAVE_CONTROL_DELAYED) {
@@ -613,11 +613,13 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     RAMBlock *block;
     int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
 
-    migration_bitmap = bitmap_new(ram_pages);
-    bitmap_set(migration_bitmap, 0, ram_pages);
-    migration_dirty_pages = ram_pages;
-    mig_throttle_on = false;
-    dirty_rate_high_cnt = 0;
+    if (!migrate_is_localhost()) {
+        migration_bitmap = bitmap_new(ram_pages);
+        bitmap_set(migration_bitmap, 0, ram_pages);
+        migration_dirty_pages = ram_pages;
+        mig_throttle_on = false;
+        dirty_rate_high_cnt = 0;
+    }
 
     if (migrate_use_xbzrle()) {
         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
@@ -641,6 +643,11 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     migration_bitmap_sync();
     qemu_mutex_unlock_iothread();
 
+    if (migrate_is_localhost()) {
+        ram_save_blocks(f);
+        return 0;
+    }
+
     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
 
     QTAILQ_FOREACH(block, &ram_list.blocks, next) {