Patchwork [20/34] savevm: New save live migration method: pending

login
register
mail settings
Submitter Juan Quintela
Date Dec. 19, 2012, 12:33 p.m.
Message ID <1355920437-29882-21-git-send-email-quintela@redhat.com>
Download mbox | patch
Permalink /patch/207372/
State New
Headers show

Comments

Juan Quintela - Dec. 19, 2012, 12:33 p.m.
The code currently does (simplified for clarity):

    if (qemu_savevm_state_iterate(s->file) == 1) {
       vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
       qemu_savevm_state_complete(s->file);
    }

The problem here is that qemu_savevm_state_iterate() returns 1 when it
knows that the remaining memory to send takes less than max downtime.

But this means that we could end up spending 2x max_downtime: one
downtime in qemu_savevm_state_iterate(), and the other in
qemu_savevm_state_complete().

Changed code to:

    pending_size = qemu_savevm_state_pending(s->file, max_size);
    DPRINTF("pending size %lu max %lu\n", pending_size, max_size);
    if (pending_size >= max_size) {
        ret = qemu_savevm_state_iterate(s->file);
     } else {
        vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
        qemu_savevm_state_complete(s->file);
     }

So what we do is: at the current network speed, we calculate the maximum
number of bytes we can send: max_size.

Then we ask every save_live section how much it has pending.  If
the total is less than max_size, we move to the complete phase; otherwise
we do an iterate one.

This makes things much simpler, because now individual sections don't
have to calculate the bandwidth (it was impossible to do right from
there).

Signed-off-by: Juan Quintela <quintela@redhat.com>

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch_init.c       | 48 ++++++++++++++++++------------------------------
 block-migration.c | 49 ++++++++++---------------------------------------
 buffered_file.c   | 25 ++++++++++++++++++-------
 migration.c       | 22 +++++++++++++++-------
 migration.h       |  2 +-
 savevm.c          | 19 +++++++++++++++++++
 sysemu.h          |  1 +
 vmstate.h         |  1 +
 8 files changed, 83 insertions(+), 84 deletions(-)

Patch

diff --git a/arch_init.c b/arch_init.c
index 9cee58a..f092ea2 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -582,12 +582,9 @@  static int ram_save_setup(QEMUFile *f, void *opaque)

 static int ram_save_iterate(QEMUFile *f, void *opaque)
 {
-    uint64_t bytes_transferred_last;
-    double bwidth = 0;
     int ret;
     int i;
-    uint64_t expected_downtime;
-    MigrationState *s = migrate_get_current();
+    int64_t t0;

     qemu_mutex_lock_ramlist();

@@ -595,9 +592,7 @@  static int ram_save_iterate(QEMUFile *f, void *opaque)
         reset_ram_globals();
     }

-    bytes_transferred_last = bytes_transferred;
-    bwidth = qemu_get_clock_ns(rt_clock);
-
+    t0 = qemu_get_clock_ns(rt_clock);
     i = 0;
     while ((ret = qemu_file_rate_limit(f)) == 0) {
         int bytes_sent;
@@ -615,7 +610,7 @@  static int ram_save_iterate(QEMUFile *f, void *opaque)
            iterations
         */
         if ((i & 63) == 0) {
-            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - bwidth) / 1000000;
+            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
             if (t1 > MAX_WAIT) {
                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                         t1, i);
@@ -629,31 +624,10 @@  static int ram_save_iterate(QEMUFile *f, void *opaque)
         return ret;
     }

-    bwidth = qemu_get_clock_ns(rt_clock) - bwidth;
-    bwidth = (bytes_transferred - bytes_transferred_last) / bwidth;
-
-    /* if we haven't transferred anything this round, force
-     * expected_downtime to a very high value, but without
-     * crashing */
-    if (bwidth == 0) {
-        bwidth = 0.000001;
-    }
-
     qemu_mutex_unlock_ramlist();
     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

-    expected_downtime = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
-    DPRINTF("ram_save_live: expected(%" PRIu64 ") <= max(" PRIu64 ")?\n",
-            expected_downtime, migrate_max_downtime());
-
-    if (expected_downtime <= migrate_max_downtime()) {
-        migration_bitmap_sync();
-        expected_downtime = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
-        s->expected_downtime = expected_downtime / 1000000; /* ns -> ms */
-
-        return expected_downtime <= migrate_max_downtime();
-    }
-    return 0;
+    return i;
 }

 static int ram_save_complete(QEMUFile *f, void *opaque)
@@ -683,6 +657,19 @@  static int ram_save_complete(QEMUFile *f, void *opaque)
     return 0;
 }

+static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+    uint64_t remaining_size;
+
+    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
+
+    if (remaining_size < max_size) {
+        migration_bitmap_sync();
+        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
+    }
+    return remaining_size;
+}
+
 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
 {
     int ret, rc = 0;
@@ -869,6 +856,7 @@  SaveVMHandlers savevm_ram_handlers = {
     .save_live_setup = ram_save_setup,
     .save_live_iterate = ram_save_iterate,
     .save_live_complete = ram_save_complete,
+    .save_live_pending = ram_save_pending,
     .load_state = ram_load,
     .cancel = ram_migration_cancel,
 };
diff --git a/block-migration.c b/block-migration.c
index 71b9601..5db01fe 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -77,9 +77,7 @@  typedef struct BlkMigState {
     int64_t total_sector_sum;
     int prev_progress;
     int bulk_completed;
-    long double total_time;
     long double prev_time_offset;
-    int reads;
 } BlkMigState;

 static BlkMigState block_mig_state;
@@ -132,12 +130,6 @@  uint64_t blk_mig_bytes_total(void)
     return sum << BDRV_SECTOR_BITS;
 }

-static inline long double compute_read_bwidth(void)
-{
-    assert(block_mig_state.total_time != 0);
-    return (block_mig_state.reads / block_mig_state.total_time) * BLOCK_SIZE;
-}
-
 static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
 {
     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
@@ -191,8 +183,6 @@  static void blk_mig_read_cb(void *opaque, int ret)

     blk->ret = ret;

-    block_mig_state.reads++;
-    block_mig_state.total_time += (curr_time - block_mig_state.prev_time_offset);
     block_mig_state.prev_time_offset = curr_time;

     QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
@@ -310,8 +300,6 @@  static void init_blk_migration(QEMUFile *f)
     block_mig_state.total_sector_sum = 0;
     block_mig_state.prev_progress = -1;
     block_mig_state.bulk_completed = 0;
-    block_mig_state.total_time = 0;
-    block_mig_state.reads = 0;

     bdrv_iterate(init_blk_migration_it, NULL);
 }
@@ -493,32 +481,6 @@  static int64_t get_remaining_dirty(void)
     return dirty * BLOCK_SIZE;
 }

-static int is_stage2_completed(void)
-{
-    int64_t remaining_dirty;
-    long double bwidth;
-
-    if (block_mig_state.bulk_completed == 1) {
-
-        remaining_dirty = get_remaining_dirty();
-        if (remaining_dirty == 0) {
-            return 1;
-        }
-
-        bwidth = compute_read_bwidth();
-
-        if ((remaining_dirty / bwidth) <=
-            migrate_max_downtime()) {
-            /* finish stage2 because we think that we can finish remaining work
-               below max_downtime */
-
-            return 1;
-        }
-    }
-
-    return 0;
-}
-
 static void blk_mig_cleanup(void)
 {
     BlkMigDevState *bmds;
@@ -619,7 +581,7 @@  static int block_save_iterate(QEMUFile *f, void *opaque)

     qemu_put_be64(f, BLK_MIG_FLAG_EOS);

-    return is_stage2_completed();
+    return 0;
 }

 static int block_save_complete(QEMUFile *f, void *opaque)
@@ -659,6 +621,14 @@  static int block_save_complete(QEMUFile *f, void *opaque)
     return 0;
 }

+static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+
+    DPRINTF("Enter save live pending  %ld\n", get_remaining_dirty());
+
+    return get_remaining_dirty();
+}
+
 static int block_load(QEMUFile *f, void *opaque, int version_id)
 {
     static int banner_printed;
@@ -755,6 +725,7 @@  SaveVMHandlers savevm_block_handlers = {
     .save_live_setup = block_save_setup,
     .save_live_iterate = block_save_iterate,
     .save_live_complete = block_save_complete,
+    .save_live_pending = block_save_pending,
     .load_state = block_load,
     .cancel = block_migration_cancel,
     .is_active = block_is_active,
diff --git a/buffered_file.c b/buffered_file.c
index 11efd8f..dda9db8 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -182,13 +182,15 @@  static int64_t buffered_get_rate_limit(void *opaque)
     return s->xfer_limit;
 }

-/* 10ms  xfer_limit is the limit that we should write each 10ms */
+/* 100ms  xfer_limit is the limit that we should write each 100ms */
 #define BUFFER_DELAY 100

 static void *buffered_file_thread(void *opaque)
 {
     QEMUFileBuffered *s = opaque;
-    int64_t expire_time = qemu_get_clock_ms(rt_clock) + BUFFER_DELAY;
+    int64_t initial_time = qemu_get_clock_ms(rt_clock);
+    int64_t max_size = 0;
+    bool last_round = false;

     while (true) {
         int64_t current_time = qemu_get_clock_ms(rt_clock);
@@ -196,13 +198,22 @@  static void *buffered_file_thread(void *opaque)
         if (s->migration_state->complete) {
             break;
         }
-        if (current_time >= expire_time) {
+        if (current_time >= initial_time + BUFFER_DELAY) {
+            uint64_t transferred_bytes = s->bytes_xfer;
+            uint64_t time_spent = current_time - initial_time;
+            double bandwidth = transferred_bytes / time_spent;
+            max_size = bandwidth * migrate_max_downtime() / 1000000;
+
+            DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64
+                    " bandwidth %g max_size %" PRId64 "\n",
+                    transferred_bytes, time_spent, bandwidth, max_size);
+
             s->bytes_xfer = 0;
-            expire_time = current_time + BUFFER_DELAY;
+            initial_time = current_time;
         }
-        if (s->bytes_xfer >= s->xfer_limit) {
+        if (!last_round && (s->bytes_xfer >= s->xfer_limit)) {
             /* usleep expects microseconds */
-            g_usleep((expire_time - current_time)*1000);
+            g_usleep((initial_time + BUFFER_DELAY - current_time)*1000);
         }
         if (buffered_flush(s) < 0) {
             break;
@@ -211,7 +222,7 @@  static void *buffered_file_thread(void *opaque)
         DPRINTF("file is ready\n");
         if (s->bytes_xfer < s->xfer_limit) {
             DPRINTF("notifying client\n");
-            migrate_fd_put_ready(s->migration_state);
+            last_round = migrate_fd_put_ready(s->migration_state, max_size);
         }
     }

diff --git a/migration.c b/migration.c
index d6cb320..fa34e19 100644
--- a/migration.c
+++ b/migration.c
@@ -316,15 +316,17 @@  ssize_t migrate_fd_put_buffer(MigrationState *s, const void *data,
     return ret;
 }

-void migrate_fd_put_ready(MigrationState *s)
+bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size)
 {
     int ret;
+    uint64_t pending_size;
+    bool last_round = false;

     qemu_mutex_lock_iothread();
     if (s->state != MIG_STATE_ACTIVE) {
         DPRINTF("put_ready returning because of non-active state\n");
         qemu_mutex_unlock_iothread();
-        return;
+        return false;
     }
     if (s->first_time) {
         s->first_time = false;
@@ -334,15 +336,19 @@  void migrate_fd_put_ready(MigrationState *s)
             DPRINTF("failed, %d\n", ret);
             migrate_fd_error(s);
             qemu_mutex_unlock_iothread();
-            return;
+            return false;
         }
     }

     DPRINTF("iterate\n");
-    ret = qemu_savevm_state_iterate(s->file);
-    if (ret < 0) {
-        migrate_fd_error(s);
-    } else if (ret == 1) {
+    pending_size = qemu_savevm_state_pending(s->file, max_size);
+    DPRINTF("pending size %lu max %lu\n", pending_size, max_size);
+    if (pending_size >= max_size) {
+        ret = qemu_savevm_state_iterate(s->file);
+        if (ret < 0) {
+            migrate_fd_error(s);
+        }
+    } else {
         int old_vm_running = runstate_is_running();
         int64_t start_time, end_time;

@@ -368,9 +374,11 @@  void migrate_fd_put_ready(MigrationState *s)
                 vm_start();
             }
         }
+        last_round = true;
     }
     qemu_mutex_unlock_iothread();

+    return last_round;
 }

 static void migrate_fd_cancel(MigrationState *s)
diff --git a/migration.h b/migration.h
index 6760d7f..92f658d 100644
--- a/migration.h
+++ b/migration.h
@@ -81,7 +81,7 @@  void migrate_fd_connect(MigrationState *s);

 ssize_t migrate_fd_put_buffer(MigrationState *s, const void *data,
                               size_t size);
-void migrate_fd_put_ready(MigrationState *s);
+bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size);
 int migrate_fd_close(MigrationState *s);

 void add_migration_state_change_notifier(Notifier *notify);
diff --git a/savevm.c b/savevm.c
index c4ee899..f6e91cf 100644
--- a/savevm.c
+++ b/savevm.c
@@ -1754,6 +1754,25 @@  int qemu_savevm_state_complete(QEMUFile *f)
     return qemu_file_get_error(f);
 }

+uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size)
+{
+    SaveStateEntry *se;
+    uint64_t ret = 0;
+
+    QTAILQ_FOREACH(se, &savevm_handlers, entry) {
+        if (!se->ops || !se->ops->save_live_pending) {
+            continue;
+        }
+        if (se->ops && se->ops->is_active) {
+            if (!se->ops->is_active(se->opaque)) {
+                continue;
+            }
+        }
+        ret += se->ops->save_live_pending(f, se->opaque, max_size);
+    }
+    return ret;
+}
+
 void qemu_savevm_state_cancel(QEMUFile *f)
 {
     SaveStateEntry *se;
diff --git a/sysemu.h b/sysemu.h
index 1b6add2..7832c69 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -78,6 +78,7 @@  int qemu_savevm_state_begin(QEMUFile *f,
 int qemu_savevm_state_iterate(QEMUFile *f);
 int qemu_savevm_state_complete(QEMUFile *f);
 void qemu_savevm_state_cancel(QEMUFile *f);
+uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size);
 int qemu_loadvm_state(QEMUFile *f);

 /* SLIRP */
diff --git a/vmstate.h b/vmstate.h
index 623af0a..f27276c 100644
--- a/vmstate.h
+++ b/vmstate.h
@@ -35,6 +35,7 @@  typedef struct SaveVMHandlers {
     int (*save_live_setup)(QEMUFile *f, void *opaque);
     int (*save_live_iterate)(QEMUFile *f, void *opaque);
     int (*save_live_complete)(QEMUFile *f, void *opaque);
+    uint64_t (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size);
     void (*cancel)(void *opaque);
     LoadStateHandler *load_state;
     bool (*is_active)(void *opaque);