Patchwork [13/18] savevm: New save live migration method: pending

login
register
mail settings
Submitter Juan Quintela
Date Oct. 29, 2012, 2:11 p.m.
Message ID <1351519903-26607-14-git-send-email-quintela@redhat.com>
Download mbox | patch
Permalink /patch/195022/
State New
Headers show

Comments

Juan Quintela - Oct. 29, 2012, 2:11 p.m.
Code just now does (simplified for clarity)

    if (qemu_savevm_state_iterate(s->file) == 1) {
       vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
       qemu_savevm_state_complete(s->file);
    }

Problem here is that qemu_savevm_state_iterate() returns 1 when it
knows that remaining memory to sent takes less than max downtime.

But this means that we could end spending 2x max_downtime, one
downtime in qemu_savevm_iterate, and the other in
qemu_savevm_state_complete.

Changed code to:

    pending_size = qemu_savevm_state_pending(s->file, max_size);
    DPRINTF("pending size %lu max %lu\n", pending_size, max_size);
    if (pending_size >= max_size) {
        ret = qemu_savevm_state_iterate(s->file);
     } else {
        vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
        qemu_savevm_state_complete(s->file);
     }

So what we do is: at current network speed, we calculate the maximum
number of bytes we can sent: max_size.

Then we ask every save_live section how much they have pending.  If
they are less than max_size, we move to complete phase, otherwise we
do an iterate one.

This makes things much simpler, because now individual sections don't
have to caluclate the bandwidth (it was implossible to do right from
there).

Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 arch_init.c       | 48 ++++++++++++++++++------------------------------
 block-migration.c | 49 ++++++++++---------------------------------------
 buffered_file.c   | 23 +++++++++++++++++------
 migration.c       | 22 +++++++++++++++-------
 migration.h       |  2 +-
 savevm.c          | 19 +++++++++++++++++++
 sysemu.h          |  1 +
 vmstate.h         |  1 +
 8 files changed, 82 insertions(+), 83 deletions(-)

Patch

diff --git a/arch_init.c b/arch_init.c
index 2d29828..1eefef8 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -608,12 +608,9 @@  static int ram_save_setup(QEMUFile *f, void *opaque)

 static int ram_save_iterate(QEMUFile *f, void *opaque)
 {
-    uint64_t bytes_transferred_last;
-    double bwidth = 0;
     int ret;
     int i;
-    uint64_t expected_downtime;
-    MigrationState *s = migrate_get_current();
+    int64_t t0;

     qemu_mutex_lock_ramlist();

@@ -621,9 +618,7 @@  static int ram_save_iterate(QEMUFile *f, void *opaque)
         reset_ram_globals();
     }

-    bytes_transferred_last = bytes_transferred;
-    bwidth = qemu_get_clock_ns(rt_clock);
-
+    t0 = qemu_get_clock_ns(rt_clock);
     i = 0;
     while ((ret = qemu_file_rate_limit(f)) == 0) {
         int bytes_sent;
@@ -641,7 +636,7 @@  static int ram_save_iterate(QEMUFile *f, void *opaque)
            iterations
         */
         if ((i & 63) == 0) {
-            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - bwidth) / 1000000;
+            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
             if (t1 > MAX_WAIT) {
                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                         t1, i);
@@ -655,31 +650,10 @@  static int ram_save_iterate(QEMUFile *f, void *opaque)
         return ret;
     }

-    bwidth = qemu_get_clock_ns(rt_clock) - bwidth;
-    bwidth = (bytes_transferred - bytes_transferred_last) / bwidth;
-
-    /* if we haven't transferred anything this round, force
-     * expected_downtime to a very high value, but without
-     * crashing */
-    if (bwidth == 0) {
-        bwidth = 0.000001;
-    }
-
     qemu_mutex_unlock_ramlist();
     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

-    expected_downtime = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
-    DPRINTF("ram_save_live: expected(%" PRIu64 ") <= max(" PRIu64 ")?\n",
-            expected_downtime, migrate_max_downtime());
-
-    if (expected_downtime <= migrate_max_downtime()) {
-        migration_bitmap_sync();
-        expected_downtime = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
-        s->expected_downtime = expected_downtime / 1000000; /* ns -> ms */
-
-        return expected_downtime <= migrate_max_downtime();
-    }
-    return 0;
+    return i;
 }

 static int ram_save_complete(QEMUFile *f, void *opaque)
@@ -712,6 +686,19 @@  static int ram_save_complete(QEMUFile *f, void *opaque)
     return 0;
 }

+static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+    uint64_t remaining_size;
+
+    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
+
+    if (remaining_size < max_size) {
+        migration_bitmap_sync();
+        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
+    }
+    return remaining_size;
+}
+
 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
 {
     int ret, rc = 0;
@@ -897,6 +884,7 @@  SaveVMHandlers savevm_ram_handlers = {
     .save_live_setup = ram_save_setup,
     .save_live_iterate = ram_save_iterate,
     .save_live_complete = ram_save_complete,
+    .save_live_pending = ram_save_pending,
     .load_state = ram_load,
     .cancel = ram_migration_cancel,
 };
diff --git a/block-migration.c b/block-migration.c
index 71b9601..5db01fe 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -77,9 +77,7 @@  typedef struct BlkMigState {
     int64_t total_sector_sum;
     int prev_progress;
     int bulk_completed;
-    long double total_time;
     long double prev_time_offset;
-    int reads;
 } BlkMigState;

 static BlkMigState block_mig_state;
@@ -132,12 +130,6 @@  uint64_t blk_mig_bytes_total(void)
     return sum << BDRV_SECTOR_BITS;
 }

-static inline long double compute_read_bwidth(void)
-{
-    assert(block_mig_state.total_time != 0);
-    return (block_mig_state.reads / block_mig_state.total_time) * BLOCK_SIZE;
-}
-
 static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
 {
     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
@@ -191,8 +183,6 @@  static void blk_mig_read_cb(void *opaque, int ret)

     blk->ret = ret;

-    block_mig_state.reads++;
-    block_mig_state.total_time += (curr_time - block_mig_state.prev_time_offset);
     block_mig_state.prev_time_offset = curr_time;

     QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
@@ -310,8 +300,6 @@  static void init_blk_migration(QEMUFile *f)
     block_mig_state.total_sector_sum = 0;
     block_mig_state.prev_progress = -1;
     block_mig_state.bulk_completed = 0;
-    block_mig_state.total_time = 0;
-    block_mig_state.reads = 0;

     bdrv_iterate(init_blk_migration_it, NULL);
 }
@@ -493,32 +481,6 @@  static int64_t get_remaining_dirty(void)
     return dirty * BLOCK_SIZE;
 }

-static int is_stage2_completed(void)
-{
-    int64_t remaining_dirty;
-    long double bwidth;
-
-    if (block_mig_state.bulk_completed == 1) {
-
-        remaining_dirty = get_remaining_dirty();
-        if (remaining_dirty == 0) {
-            return 1;
-        }
-
-        bwidth = compute_read_bwidth();
-
-        if ((remaining_dirty / bwidth) <=
-            migrate_max_downtime()) {
-            /* finish stage2 because we think that we can finish remaining work
-               below max_downtime */
-
-            return 1;
-        }
-    }
-
-    return 0;
-}
-
 static void blk_mig_cleanup(void)
 {
     BlkMigDevState *bmds;
@@ -619,7 +581,7 @@  static int block_save_iterate(QEMUFile *f, void *opaque)

     qemu_put_be64(f, BLK_MIG_FLAG_EOS);

-    return is_stage2_completed();
+    return 0;
 }

 static int block_save_complete(QEMUFile *f, void *opaque)
@@ -659,6 +621,14 @@  static int block_save_complete(QEMUFile *f, void *opaque)
     return 0;
 }

+static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+
+    DPRINTF("Enter save live pending  %ld\n", get_remaining_dirty());
+
+    return get_remaining_dirty();
+}
+
 static int block_load(QEMUFile *f, void *opaque, int version_id)
 {
     static int banner_printed;
@@ -755,6 +725,7 @@  SaveVMHandlers savevm_block_handlers = {
     .save_live_setup = block_save_setup,
     .save_live_iterate = block_save_iterate,
     .save_live_complete = block_save_complete,
+    .save_live_pending = block_save_pending,
     .load_state = block_load,
     .cancel = block_migration_cancel,
     .is_active = block_is_active,
diff --git a/buffered_file.c b/buffered_file.c
index 017745d..fbd11c4 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -181,7 +181,9 @@  static int64_t buffered_get_rate_limit(void *opaque)
 static void *buffered_file_thread(void *opaque)
 {
     QEMUFileBuffered *s = opaque;
-    int64_t expire_time = qemu_get_clock_ms(rt_clock) + BUFFER_DELAY;
+    int64_t initial_time = qemu_get_clock_ms(rt_clock);
+    int64_t max_size = 0;
+    bool last_round = false;

     while (true) {
         int64_t current_time = qemu_get_clock_ms(rt_clock);
@@ -189,20 +191,29 @@  static void *buffered_file_thread(void *opaque)
         if (s->migration_state->complete) {
             break;
         }
-        if (current_time >= expire_time) {
+        if (current_time >= initial_time + BUFFER_DELAY) {
+            uint64_t transferred_bytes = s->bytes_xfer;
+            uint64_t time_spent = current_time - initial_time;
+            double bandwidth = transferred_bytes / time_spent;
+            max_size = bandwidth * migrate_max_downtime() / 1000000;
+
+            DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64
+                    " bandwidth %g max_size %" PRId64 "\n",
+                    transferred_bytes, time_spent, bandwidth, max_size);
+
             s->bytes_xfer = 0;
-            expire_time = current_time + BUFFER_DELAY;
+            initial_time = current_time;
         }
-        if (s->bytes_xfer >= s->xfer_limit) {
+        if (!last_round && (s->bytes_xfer >= s->xfer_limit)) {
             /* usleep expects microseconds */
-            g_usleep((expire_time - current_time)*1000);
+            g_usleep((initial_time + BUFFER_DELAY - current_time)*1000);
         }
         buffered_flush(s);

         DPRINTF("file is ready\n");
         if (s->bytes_xfer < s->xfer_limit) {
             DPRINTF("notifying client\n");
-            migrate_fd_put_ready(s->migration_state);
+            last_round = migrate_fd_put_ready(s->migration_state, max_size);
         }
     }

diff --git a/migration.c b/migration.c
index 9c409d7..769ae56 100644
--- a/migration.c
+++ b/migration.c
@@ -297,16 +297,18 @@  ssize_t migrate_fd_put_buffer(MigrationState *s, const void *data,
     return ret;
 }

-void migrate_fd_put_ready(MigrationState *s)
+bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size)
 {
     int ret;
     static bool first_time = true;
+    uint64_t pending_size;
+    bool last_round = false;

     qemu_mutex_lock_iothread();
     if (s->state != MIG_STATE_ACTIVE) {
         DPRINTF("put_ready returning because of non-active state\n");
         qemu_mutex_unlock_iothread();
-        return;
+        return false;
     }
     if (first_time) {
         first_time = false;
@@ -316,15 +318,19 @@  void migrate_fd_put_ready(MigrationState *s)
             DPRINTF("failed, %d\n", ret);
             migrate_fd_error(s);
             qemu_mutex_unlock_iothread();
-            return;
+            return false;
         }
     }

     DPRINTF("iterate\n");
-    ret = qemu_savevm_state_iterate(s->file);
-    if (ret < 0) {
-        migrate_fd_error(s);
-    } else if (ret == 1) {
+    pending_size = qemu_savevm_state_pending(s->file, max_size);
+    DPRINTF("pending size %lu max %lu\n", pending_size, max_size);
+    if (pending_size >= max_size) {
+        ret = qemu_savevm_state_iterate(s->file);
+        if (ret < 0) {
+            migrate_fd_error(s);
+        }
+    } else {
         int old_vm_running = runstate_is_running();
         int64_t start_time, end_time;

@@ -350,9 +356,11 @@  void migrate_fd_put_ready(MigrationState *s)
                 vm_start();
             }
         }
+        last_round = true;
     }
     qemu_mutex_unlock_iothread();

+    return last_round;
 }

 static void migrate_fd_cancel(MigrationState *s)
diff --git a/migration.h b/migration.h
index 5ff68eb..72e638b 100644
--- a/migration.h
+++ b/migration.h
@@ -80,7 +80,7 @@  void migrate_fd_connect(MigrationState *s);

 ssize_t migrate_fd_put_buffer(MigrationState *s, const void *data,
                               size_t size);
-void migrate_fd_put_ready(MigrationState *s);
+bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size);
 int migrate_fd_close(MigrationState *s);

 void add_migration_state_change_notifier(Notifier *notify);
diff --git a/savevm.c b/savevm.c
index 69f1768..a68c37b 100644
--- a/savevm.c
+++ b/savevm.c
@@ -1681,6 +1681,25 @@  int qemu_savevm_state_complete(QEMUFile *f)
     return qemu_file_get_error(f);
 }

+uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size)
+{
+    SaveStateEntry *se;
+    uint64_t ret = 0;
+
+    QTAILQ_FOREACH(se, &savevm_handlers, entry) {
+        if (!se->ops || !se->ops->save_live_pending) {
+            continue;
+        }
+        if (se->ops && se->ops->is_active) {
+            if (!se->ops->is_active(se->opaque)) {
+                continue;
+            }
+        }
+        ret += se->ops->save_live_pending(f, se->opaque, max_size);
+    }
+    return ret;
+}
+
 void qemu_savevm_state_cancel(QEMUFile *f)
 {
     SaveStateEntry *se;
diff --git a/sysemu.h b/sysemu.h
index 0c39a3a..0276205 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -83,6 +83,7 @@  int qemu_savevm_state_begin(QEMUFile *f,
 int qemu_savevm_state_iterate(QEMUFile *f);
 int qemu_savevm_state_complete(QEMUFile *f);
 void qemu_savevm_state_cancel(QEMUFile *f);
+uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size);
 int qemu_loadvm_state(QEMUFile *f);

 /* SLIRP */
diff --git a/vmstate.h b/vmstate.h
index c9c320e..241d962 100644
--- a/vmstate.h
+++ b/vmstate.h
@@ -35,6 +35,7 @@  typedef struct SaveVMHandlers {
     int (*save_live_setup)(QEMUFile *f, void *opaque);
     int (*save_live_iterate)(QEMUFile *f, void *opaque);
     int (*save_live_complete)(QEMUFile *f, void *opaque);
+    uint64_t (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size);
     void (*cancel)(void *opaque);
     LoadStateHandler *load_state;
     bool (*is_active)(void *opaque);