Patchwork [4/4] Try not to exceed max downtime on stage3

login
register
mail settings
Submitter lirans@il.ibm.com
Date Jan. 12, 2010, 8:27 a.m.
Message ID <12632848353279-git-send-email-lirans@il.ibm.com>
Download mbox | patch
Permalink /patch/42689/
State New
Headers show

Comments

lirans@il.ibm.com - Jan. 12, 2010, 8:27 a.m.
Move to stage3 only when the remaining work can be done within max downtime.
To make sure the process converges, we limit the number of dirty iterations
to MAX_DIRTY_ITERATIONS.

Signed-off-by: Liran Schour <lirans@il.ibm.com>
---
 block-migration.c |   67 +++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 45 insertions(+), 22 deletions(-)
Pierre Riteau - Jan. 12, 2010, 9:52 a.m.
On 12 janv. 2010, at 09:27, Liran Schour wrote:

> Move to stage3 only when remaining work can be done below max downtime.
> To make sure the process will converge we will try only MAX_DIRTY_ITERATIONS.
> 
> Signed-off-by: Liran Schour <lirans@il.ibm.com>
> ---
> block-migration.c |   67 +++++++++++++++++++++++++++++++++++-----------------
> 1 files changed, 45 insertions(+), 22 deletions(-)
> 
> diff --git a/block-migration.c b/block-migration.c
> index 90c84b1..9ae04c4 100644
> --- a/block-migration.c
> +++ b/block-migration.c
> @@ -17,6 +17,7 @@
> #include "qemu-queue.h"
> #include "monitor.h"
> #include "block-migration.h"
> +#include "migration.h"
> #include <assert.h>
> 
> #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
> @@ -30,6 +31,7 @@
> #define BLOCKS_READ_CHANGE 100
> #define INITIAL_BLOCKS_READ 100
> #define MAX_DIRTY_ITERATIONS 100
> +#define DISK_RATE (30 << 20) //30 MB/sec

This number seems rather arbitrary. We should try to infer the storage performance from previous reads instead. (That could be difficult, though: for example, when we switch from bulk copy to dirty blocks only, we may switch from sequential reads to random reads.)
Also, shouldn't the migration speed limit (migrate_set_speed) be taken into account?

> //#define DEBUG_BLK_MIGRATION
> 
> @@ -135,10 +137,11 @@ static void blk_mig_read_cb(void *opaque, int ret)
>     blk->ret = ret;
> 
>     QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
> -
> +    

Please don't add trailing white space...

>     block_mig_state.submitted--;
>     block_mig_state.read_done++;
>     assert(block_mig_state.submitted >= 0);
> +    
> }

... and unnecessary new lines. Comments valid for the rest of this patch and the other ones as well.

> 
> static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
> @@ -225,7 +228,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
>     block_mig_state.prev_progress = -1;
>     block_mig_state.bulk_completed = 0;
>     block_mig_state.dirty_iterations = 0;
> -
> +    
>     for (bs = bdrv_first; bs != NULL; bs = bs->next) {
>         if (bs->type == BDRV_TYPE_HD) {
>             sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
> @@ -405,15 +408,41 @@ static void flush_blks(QEMUFile* f)
>             block_mig_state.transferred);
> }
> 
> -static int is_stage2_completed(void)
> +static int64_t get_remaining_dirty(void)
> {
> +    BlkMigDevState *bmds;
> +    int64_t dirty = 0;
> +    
> +    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
> +        dirty += bdrv_get_dirty_count(bmds->bs);
> +    }
> +    
> +    return dirty;
> +}
> 
> -    if (block_mig_state.submitted == 0 &&
> -        block_mig_state.bulk_completed == 1) {
> -        return 1;
> -    } else {
> -        return 0;
> +static int is_stage2_completed(void)
> +{
> +    int64_t remaining_dirty;
> +    
> +    if (block_mig_state.bulk_completed == 1) {
> +        if (block_mig_state.dirty_iterations++ > MAX_DIRTY_ITERATIONS) {
> +            /* finish stage2 because we have too much dirty iterations */
> +            
> +            return 1;
> +        }
> +        
> +        remaining_dirty = get_remaining_dirty();
> +        
> +        if ((remaining_dirty * BLOCK_SIZE) * 1000000000 / DISK_RATE <= 
> +            migrate_max_downtime()) {
> +            /* finish stage2 because we think that we can finish remaing work
> +               below max_downtime */
> +            
> +            return 1;
> +        }
>     }
> +    
> +    return 0;
> }
> 
> static void blk_mig_cleanup(Monitor *mon)
> @@ -438,9 +467,7 @@ static void blk_mig_cleanup(Monitor *mon)
> }
> 
> static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
> -{
> -    int dirty_iteration = 0;
> -    
> +{    
>     dprintf("Enter save live stage %d submitted %d transferred %d\n",
>             stage, block_mig_state.submitted, block_mig_state.transferred);
> 
> @@ -482,19 +509,12 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
>                     /* finish saving bulk on all devices */
>                     block_mig_state.bulk_completed = 1;
>                 }
> -            } else if (block_mig_state.dirty_iterations < MAX_DIRTY_ITERATIONS) {
> -                if (dirty_iteration == 0) {
> -                    /* increment dirty iteration only once per round */
> -                    dirty_iteration = 1;
> -                    block_mig_state.dirty_iterations++;
> -                }
> +            } else {
> +                
>                 if (blk_mig_save_dirty_block(mon, f, 1) == 0) {
>                     /* no more dirty blocks */
>                     break;
>                 }
> -            } else {
> -                /* if we got here stop the loop */
> -                break;
>             }
>         }
> 
> @@ -507,9 +527,12 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
>     }
> 
>     if (stage == 3) {
> -        /* we now for sure that save bulk is completed */
> -
> +        /* we know for sure that save bulk is completed and
> +           all async read completed */
> +        assert(block_mig_state.submitted == 0);
> +        
>         while(blk_mig_save_dirty_block(mon, f, 0) != 0);
> +        
>         blk_mig_cleanup(mon);
> 
>         /* report completion */
> -- 
> 1.5.2.4
> 
> 
>
Jan Kiszka - Jan. 12, 2010, 11:51 a.m.
Liran Schour wrote:
> Move to stage3 only when remaining work can be done below max downtime.
> To make sure the process will converge we will try only MAX_DIRTY_ITERATIONS.

OK, that now explains patch 2. But do we have such a barrier for memory
migration as well? I don't think so, and I don't think this hard-coded
limit is the right approach. Such a thing should be derived from the
bandwidth the user can control during runtime.

> 
> Signed-off-by: Liran Schour <lirans@il.ibm.com>
> ---
>  block-migration.c |   67 +++++++++++++++++++++++++++++++++++-----------------
>  1 files changed, 45 insertions(+), 22 deletions(-)
> 
> diff --git a/block-migration.c b/block-migration.c
> index 90c84b1..9ae04c4 100644
> --- a/block-migration.c
> +++ b/block-migration.c
> @@ -17,6 +17,7 @@
>  #include "qemu-queue.h"
>  #include "monitor.h"
>  #include "block-migration.h"
> +#include "migration.h"
>  #include <assert.h>
>  
>  #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
> @@ -30,6 +31,7 @@
>  #define BLOCKS_READ_CHANGE 100
>  #define INITIAL_BLOCKS_READ 100
>  #define MAX_DIRTY_ITERATIONS 100
> +#define DISK_RATE (30 << 20) //30 MB/sec

IMHO a bad idea (e.g. mine was 6 MB/s last time I tried). Measure it
during runtime just like the mem-migration does.

<skipping the rest of the patch>

Jan
lirans@il.ibm.com - Jan. 12, 2010, 11:56 a.m.
Pierre Riteau <Pierre.Riteau@irisa.fr> wrote on 12/01/2010 11:52:18:

> On 12 janv. 2010, at 09:27, Liran Schour wrote:
>
> > Move to stage3 only when remaining work can be done below max downtime.
> > To make sure the process will converge we will try only
> MAX_DIRTY_ITERATIONS.
> >
> > Signed-off-by: Liran Schour <lirans@il.ibm.com>
> > ---
> > block-migration.c |   67 ++++++++++++++++++++++++++++++++++
> +-----------------
> > 1 files changed, 45 insertions(+), 22 deletions(-)
> >
> > diff --git a/block-migration.c b/block-migration.c
> > index 90c84b1..9ae04c4 100644
> > --- a/block-migration.c
> > +++ b/block-migration.c
> > @@ -17,6 +17,7 @@
> > #include "qemu-queue.h"
> > #include "monitor.h"
> > #include "block-migration.h"
> > +#include "migration.h"
> > #include <assert.h>
> >
> > #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
> > @@ -30,6 +31,7 @@
> > #define BLOCKS_READ_CHANGE 100
> > #define INITIAL_BLOCKS_READ 100
> > #define MAX_DIRTY_ITERATIONS 100
> > +#define DISK_RATE (30 << 20) //30 MB/sec
>
> This number seems rather arbitrary. We should try to infer the
> storage performance from previous reads instead (but it could be
> difficult, for example when we switch from bulk copy to dirty blocks
> only, we may switch from sequential reads to random reads).
> Also, shouldn't the migration speed limit (migrate_set_speed) be
> taken into account?
>
It will be hard to estimate the disk rate in advance. We need to estimate
the rate of sync writes (during the final stage) while we are writing async
to disk iteratively. Does anyone have a simple solution for this?
As for the migration speed limit, we do not need to take it into account,
because in stage 3 we transfer the data without regard to the migration
speed limit.

Thanks,
- Liran
Anthony Liguori - Jan. 12, 2010, 3:07 p.m.
On 01/12/2010 05:51 AM, Jan Kiszka wrote:
> Liran Schour wrote:
>    
>> Move to stage3 only when remaining work can be done below max downtime.
>> To make sure the process will converge we will try only MAX_DIRTY_ITERATIONS.
>>      
> OK, that explains now patch 2. But do we have such barrier for memory
> migration as well?

No, we explicitly don't, because making that decision is a management
tool's job.

A management tool can force convergence by explicitly stopping the guest.

The iteration count is a bad metric because it has no real, useful meaning to a user.

Time is probably the best metric and it's easy for a management tool to 
set a timer to stop the guest if it hasn't completed by a certain time 
period.

Regards,

Anthony Liguori
lirans@il.ibm.com - Jan. 12, 2010, 3:07 p.m.
Jan Kiszka <jan.kiszka@siemens.com> wrote on 12/01/2010 13:51:09:

> Liran Schour wrote:
> > Move to stage3 only when remaining work can be done below max downtime.
> > To make sure the process will converge we will try only
> MAX_DIRTY_ITERATIONS.
>
> OK, that explains now patch 2. But do we have such barrier for memory
> migration as well? I don't thinks so, and I don't think this hard-coded
> limit is the right approach. Such thing should be derived from the
> bandwidth the user can control during runtime.

So, if I understand you correctly, you are saying that the way to ensure
convergence is for the user to control the bandwidth during runtime. In that
case we can remove MAX_DIRTY_ITERATIONS.

> >
> > Signed-off-by: Liran Schour <lirans@il.ibm.com>
> > ---
> >  block-migration.c |   67 ++++++++++++++++++++++++++++++++++
> +-----------------
> >  1 files changed, 45 insertions(+), 22 deletions(-)
> >
> > diff --git a/block-migration.c b/block-migration.c
> > index 90c84b1..9ae04c4 100644
> > --- a/block-migration.c
> > +++ b/block-migration.c
> > @@ -17,6 +17,7 @@
> >  #include "qemu-queue.h"
> >  #include "monitor.h"
> >  #include "block-migration.h"
> > +#include "migration.h"
> >  #include <assert.h>
> >
> >  #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
> > @@ -30,6 +31,7 @@
> >  #define BLOCKS_READ_CHANGE 100
> >  #define INITIAL_BLOCKS_READ 100
> >  #define MAX_DIRTY_ITERATIONS 100
> > +#define DISK_RATE (30 << 20) //30 MB/sec
>
> IMHO a bad idea (e.g. mine was 6 MB/s last time I tried). Measure it
> during runtime just like the mem-migration does.
How about measuring the time it takes to read BLOCK_SIZE and computing the
disk rate from that? I can compute the average time to read BLOCK_SIZE and
then estimate the remaining time to complete the work.

Thanks,
- Liran

Patch

diff --git a/block-migration.c b/block-migration.c
index 90c84b1..9ae04c4 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -17,6 +17,7 @@ 
 #include "qemu-queue.h"
 #include "monitor.h"
 #include "block-migration.h"
+#include "migration.h"
 #include <assert.h>
 
 #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
@@ -30,6 +31,7 @@ 
 #define BLOCKS_READ_CHANGE 100
 #define INITIAL_BLOCKS_READ 100
 #define MAX_DIRTY_ITERATIONS 100
+#define DISK_RATE (30 << 20) //30 MB/sec
 
 //#define DEBUG_BLK_MIGRATION
 
@@ -135,10 +137,11 @@  static void blk_mig_read_cb(void *opaque, int ret)
     blk->ret = ret;
 
     QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
-
+    
     block_mig_state.submitted--;
     block_mig_state.read_done++;
     assert(block_mig_state.submitted >= 0);
+    
 }
 
 static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
@@ -225,7 +228,7 @@  static void init_blk_migration(Monitor *mon, QEMUFile *f)
     block_mig_state.prev_progress = -1;
     block_mig_state.bulk_completed = 0;
     block_mig_state.dirty_iterations = 0;
-
+    
     for (bs = bdrv_first; bs != NULL; bs = bs->next) {
         if (bs->type == BDRV_TYPE_HD) {
             sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
@@ -405,15 +408,41 @@  static void flush_blks(QEMUFile* f)
             block_mig_state.transferred);
 }
 
-static int is_stage2_completed(void)
+static int64_t get_remaining_dirty(void)
 {
+    BlkMigDevState *bmds;
+    int64_t dirty = 0;
+    
+    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
+        dirty += bdrv_get_dirty_count(bmds->bs);
+    }
+    
+    return dirty;
+}
 
-    if (block_mig_state.submitted == 0 &&
-        block_mig_state.bulk_completed == 1) {
-        return 1;
-    } else {
-        return 0;
+static int is_stage2_completed(void)
+{
+    int64_t remaining_dirty;
+    
+    if (block_mig_state.bulk_completed == 1) {
+        if (block_mig_state.dirty_iterations++ > MAX_DIRTY_ITERATIONS) {
+            /* finish stage2 because we have too much dirty iterations */
+            
+            return 1;
+        }
+        
+        remaining_dirty = get_remaining_dirty();
+        
+        if ((remaining_dirty * BLOCK_SIZE) * 1000000000 / DISK_RATE <= 
+            migrate_max_downtime()) {
+            /* finish stage2 because we think that we can finish remaing work
+               below max_downtime */
+            
+            return 1;
+        }
     }
+    
+    return 0;
 }
 
 static void blk_mig_cleanup(Monitor *mon)
@@ -438,9 +467,7 @@  static void blk_mig_cleanup(Monitor *mon)
 }
 
 static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
-{
-    int dirty_iteration = 0;
-    
+{    
     dprintf("Enter save live stage %d submitted %d transferred %d\n",
             stage, block_mig_state.submitted, block_mig_state.transferred);
 
@@ -482,19 +509,12 @@  static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
                     /* finish saving bulk on all devices */
                     block_mig_state.bulk_completed = 1;
                 }
-            } else if (block_mig_state.dirty_iterations < MAX_DIRTY_ITERATIONS) {
-                if (dirty_iteration == 0) {
-                    /* increment dirty iteration only once per round */
-                    dirty_iteration = 1;
-                    block_mig_state.dirty_iterations++;
-                }
+            } else {
+                
                 if (blk_mig_save_dirty_block(mon, f, 1) == 0) {
                     /* no more dirty blocks */
                     break;
                 }
-            } else {
-                /* if we got here stop the loop */
-                break;
             }
         }
         
@@ -507,9 +527,12 @@  static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
     }
     
     if (stage == 3) {
-        /* we now for sure that save bulk is completed */
-
+        /* we know for sure that save bulk is completed and
+           all async read completed */
+        assert(block_mig_state.submitted == 0);
+        
         while(blk_mig_save_dirty_block(mon, f, 0) != 0);
+        
         blk_mig_cleanup(mon);
 
         /* report completion */