Patchwork [v10,14/14] rdma: add setup time accounting to QMP statistics

login
register
mail settings
Submitter mrhines@linux.vnet.ibm.com
Date June 18, 2013, 8:16 p.m.
Message ID <1371586561-22803-15-git-send-email-mrhines@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/252434/
State New
Headers show

Comments

mrhines@linux.vnet.ibm.com - June 18, 2013, 8:16 p.m.
From: "Michael R. Hines" <mrhines@us.ibm.com>

For very large virtual machines, pinning can take a long time.
While this does not affect the migration's *actual* time itself,
it is still important for the user to know what's going on and
to know what component of the total time is actual taken up by
pinning.

For example, using a 14GB virtual machine, pinning can take as
long as 5 seconds, for which the user would not otherwise know
what was happening.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Chegu Vinod <chegu_vinod@hp.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Tested-by: Chegu Vinod <chegu_vinod@hp.com>
Tested-by: Michael R. Hines <mrhines@us.ibm.com>
Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
---
 hmp.c                         |   10 ++++++--
 include/migration/migration.h |    1 +
 migration-rdma.c              |   51 +++++++++++++++++++++++------------------
 migration.c                   |   41 +++++++++++++++++++++++----------
 qapi-schema.json              |   13 ++++++++---
 5 files changed, 77 insertions(+), 39 deletions(-)

Patch

diff --git a/hmp.c b/hmp.c
index 148a3fb..2b75b28 100644
--- a/hmp.c
+++ b/hmp.c
@@ -154,8 +154,10 @@  void hmp_info_migrate(Monitor *mon, const QDict *qdict)
 
     if (info->has_status) {
         monitor_printf(mon, "Migration status: %s\n", info->status);
-        monitor_printf(mon, "total time: %" PRIu64 " milliseconds\n",
-                       info->total_time);
+        if (info->has_total_time) {
+            monitor_printf(mon, "total time: %" PRIu64 " milliseconds\n",
+                           info->total_time);
+        }
         if (info->has_expected_downtime) {
             monitor_printf(mon, "expected downtime: %" PRIu64 " milliseconds\n",
                            info->expected_downtime);
@@ -164,6 +166,10 @@  void hmp_info_migrate(Monitor *mon, const QDict *qdict)
             monitor_printf(mon, "downtime: %" PRIu64 " milliseconds\n",
                            info->downtime);
         }
+        if (info->has_setup_time) {
+            monitor_printf(mon, "setup: %" PRIu64 " milliseconds\n",
+                           info->setup_time);
+        }
     }
 
     if (info->has_ram) {
diff --git a/include/migration/migration.h b/include/migration/migration.h
index b49e68b..82e8713 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -49,6 +49,7 @@  struct MigrationState
     bool enabled_capabilities[MIGRATION_CAPABILITY_MAX];
     int64_t xbzrle_cache_size;
     double mbps;
+    int64_t setup_time; 
 };
 
 void process_incoming_migration(QEMUFile *f);
diff --git a/migration-rdma.c b/migration-rdma.c
index 853de18..165297e 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -699,12 +699,10 @@  static int qemu_rdma_alloc_qp(RDMAContext *rdma)
     return 0;
 }
 
-static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma,
-                                RDMALocalBlocks *rdma_local_ram_blocks)
+static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
 {
     int i;
-    uint64_t start = qemu_get_clock_ms(rt_clock);
-    (void)start;
+    RDMALocalBlocks *rdma_local_ram_blocks = &rdma->local_ram_blocks;
 
     for (i = 0; i < rdma_local_ram_blocks->num_blocks; i++) {
         rdma_local_ram_blocks->block[i].mr =
@@ -721,8 +719,6 @@  static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma,
         rdma->total_registrations++;
     }
 
-    DPRINTF("lock time: %" PRIu64 "\n", qemu_get_clock_ms(rt_clock) - start);
-
     if (i >= rdma_local_ram_blocks->num_blocks) {
         return 0;
     }
@@ -1262,7 +1258,8 @@  static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
  */
 static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                    uint8_t *data, RDMAControlHeader *resp,
-                                   int *resp_idx)
+                                   int *resp_idx,
+                                   int (*callback)(RDMAContext *rdma))
 {
     int ret = 0;
     int idx = 0;
@@ -1315,6 +1312,14 @@  static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
      * If we're expecting a response, block and wait for it.
      */
     if (resp) {
+        if (callback) {
+            DPRINTF("Issuing callback before receiving response...\n");
+            ret = callback(rdma);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+
         DDPRINTF("Waiting for response %s\n", control_desc[resp->type]);
         ret = qemu_rdma_exchange_get_response(rdma, resp, resp->type, idx + 1);
 
@@ -1464,7 +1469,7 @@  static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
                     chunk, sge.length, current_index, offset);
 
                 ret = qemu_rdma_exchange_send(rdma, &head,
-                                (uint8_t *) &comp, NULL, NULL);
+                                (uint8_t *) &comp, NULL, NULL, NULL);
 
                 if (ret < 0) {
                     return -EIO;
@@ -1487,7 +1492,7 @@  static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
                     chunk, sge.length, current_index, offset);
 
             ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
-                                    &resp, &reg_result_idx);
+                                    &resp, &reg_result_idx, NULL);
             if (ret < 0) {
                 return ret;
             }
@@ -2126,7 +2131,7 @@  static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
         head.len = r->len;
         head.type = RDMA_CONTROL_QEMU_FILE;
 
-        ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL);
+        ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
 
         if (ret < 0) {
             rdma->error_state = ret;
@@ -2482,7 +2487,7 @@  static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
             DPRINTF("Initial setup info requested.\n");
 
             if (rdma->pin_all) {
-                ret = qemu_rdma_reg_whole_ram_blocks(rdma, &rdma->local_ram_blocks);
+                ret = qemu_rdma_reg_whole_ram_blocks(rdma);
                 if (ret) {
                     fprintf(stderr, "rdma migration: error dest "
                                     "registering ram blocks!\n");
@@ -2617,7 +2622,17 @@  static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
         head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
         DPRINTF("Sending registration setup for ram blocks...\n");
 
-        ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp, &reg_result_idx);
+        /*
+         * Make sure that we parallelize the pinning on both sides.
+         * For very large guests, doing this serially takes a really
+         * long time, so we have to 'interleave' the pinning locally
+         * with the control messages by performing the pinning on this
+         * side before we receive the control response from the other
+         * side that the pinning has completed.
+         */
+        ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp, 
+                    &reg_result_idx, rdma->pin_all ?
+                    qemu_rdma_reg_whole_ram_blocks : NULL);
         if (ret < 0) {
             ERROR(errp, "receiving remote info!\n");
             return ret;
@@ -2633,14 +2648,7 @@  static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
             return ret;
         }
 
-        if (rdma->pin_all) {
-            ret = qemu_rdma_reg_whole_ram_blocks(rdma, &rdma->local_ram_blocks);
-            if (ret) {
-                fprintf(stderr, "rdma migration: error source "
-                                "registering ram blocks!\n");
-                return ret;
-            }
-        } else {
+        if (!rdma->pin_all) {
             int x = 0;
             for (x = 0; x < rdma->local_ram_blocks.num_blocks; x++) {
                 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[x]);
@@ -2653,7 +2661,7 @@  static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
     DDDPRINTF("Sending registration finish %" PRIu64 "...\n", flags);
 
     head.type = RDMA_CONTROL_REGISTER_FINISHED;
-    ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL);
+    ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
 
     if (ret < 0) {
         goto err;
@@ -2802,7 +2810,6 @@  void rdma_start_outgoing_migration(void *opaque,
     DPRINTF("qemu_rdma_source_connect success\n");
 
     s->file = qemu_fopen_rdma(rdma, "wb");
-    s->total_time = qemu_get_clock_ms(rt_clock);
     migrate_fd_connect(s);
     return;
 err:
diff --git a/migration.c b/migration.c
index 62c6b85..bb412df 100644
--- a/migration.c
+++ b/migration.c
@@ -36,7 +36,8 @@ 
 #endif
 
 enum {
-    MIG_STATE_ERROR,
+    MIG_STATE_ERROR = -1,
+    MIG_STATE_NONE,
     MIG_STATE_SETUP,
     MIG_STATE_CANCELLED,
     MIG_STATE_ACTIVE,
@@ -63,7 +64,7 @@  static NotifierList migration_state_notifiers =
 MigrationState *migrate_get_current(void)
 {
     static MigrationState current_migration = {
-        .state = MIG_STATE_SETUP,
+        .state = MIG_STATE_NONE,
         .bandwidth_limit = MAX_THROTTLE,
         .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE,
         .mbps = -1,
@@ -184,9 +185,14 @@  MigrationInfo *qmp_query_migrate(Error **errp)
     MigrationState *s = migrate_get_current();
 
     switch (s->state) {
-    case MIG_STATE_SETUP:
+    case MIG_STATE_NONE:
         /* no migration has happened ever */
         break;
+    case MIG_STATE_SETUP:
+        info->has_status = true;
+        info->status = g_strdup("setup");
+        info->has_total_time = false;
+        break;
     case MIG_STATE_ACTIVE:
         info->has_status = true;
         info->status = g_strdup("active");
@@ -195,6 +201,8 @@  MigrationInfo *qmp_query_migrate(Error **errp)
             - s->total_time;
         info->has_expected_downtime = true;
         info->expected_downtime = s->expected_downtime;
+        info->has_setup_time = true;
+        info->setup_time = s->setup_time;
 
         info->has_ram = true;
         info->ram = g_malloc0(sizeof(*info->ram));
@@ -226,6 +234,8 @@  MigrationInfo *qmp_query_migrate(Error **errp)
         info->total_time = s->total_time;
         info->has_downtime = true;
         info->downtime = s->downtime;
+        info->has_setup_time = true;
+        info->setup_time = s->setup_time;
 
         info->has_ram = true;
         info->ram = g_malloc0(sizeof(*info->ram));
@@ -257,7 +267,7 @@  void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
     MigrationState *s = migrate_get_current();
     MigrationCapabilityStatusList *cap;
 
-    if (s->state == MIG_STATE_ACTIVE) {
+    if (s->state == MIG_STATE_ACTIVE || s->state == MIG_STATE_SETUP) {
         error_set(errp, QERR_MIGRATION_ACTIVE);
         return;
     }
@@ -295,9 +305,9 @@  static void migrate_fd_cleanup(void *opaque)
     notifier_list_notify(&migration_state_notifiers, s);
 }
 
-static void migrate_finish_set_state(MigrationState *s, int new_state)
+static void migrate_set_state(MigrationState *s, int old_state, int new_state)
 {
-    if (__sync_val_compare_and_swap(&s->state, MIG_STATE_ACTIVE,
+    if (__sync_val_compare_and_swap(&s->state, old_state,
                                     new_state) == new_state) {
         trace_migrate_set_state(new_state);
     }
@@ -316,7 +326,8 @@  static void migrate_fd_cancel(MigrationState *s)
 {
     DPRINTF("cancelling migration\n");
 
-    migrate_finish_set_state(s, MIG_STATE_CANCELLED);
+    migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_CANCELLED);
+    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_CANCELLED);
 }
 
 void add_migration_state_change_notifier(Notifier *notify)
@@ -393,7 +404,7 @@  void qmp_migrate(const char *uri, bool has_blk, bool blk,
     params.blk = blk;
     params.shared = inc;
 
-    if (s->state == MIG_STATE_ACTIVE) {
+    if (s->state == MIG_STATE_ACTIVE || s->state == MIG_STATE_SETUP) {
         error_set(errp, QERR_MIGRATION_ACTIVE);
         return;
     }
@@ -517,6 +528,7 @@  static void *migration_thread(void *opaque)
 {
     MigrationState *s = opaque;
     int64_t initial_time = qemu_get_clock_ms(rt_clock);
+    int64_t setup_start = qemu_get_clock_ms(host_clock);
     int64_t initial_bytes = 0;
     int64_t max_size = 0;
     int64_t start_time = initial_time;
@@ -525,6 +537,11 @@  static void *migration_thread(void *opaque)
     DPRINTF("beginning savevm\n");
     qemu_savevm_state_begin(s->file, &s->params);
 
+    s->setup_time = qemu_get_clock_ms(host_clock) - setup_start;
+    migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ACTIVE);
+
+    DPRINTF("setup complete\n");
+
     while (s->state == MIG_STATE_ACTIVE) {
         int64_t current_time;
         uint64_t pending_size;
@@ -546,14 +563,14 @@  static void *migration_thread(void *opaque)
                 qemu_savevm_state_complete(s->file);
                 qemu_mutex_unlock_iothread();
                 if (!qemu_file_get_error(s->file)) {
-                    migrate_finish_set_state(s, MIG_STATE_COMPLETED);
+                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_COMPLETED);
                     break;
                 }
             }
         }
 
         if (qemu_file_get_error(s->file)) {
-            migrate_finish_set_state(s, MIG_STATE_ERROR);
+            migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
             break;
         }
         current_time = qemu_get_clock_ms(rt_clock);
@@ -604,8 +621,8 @@  static void *migration_thread(void *opaque)
 
 void migrate_fd_connect(MigrationState *s)
 {
-    s->state = MIG_STATE_ACTIVE;
-    trace_migrate_set_state(MIG_STATE_ACTIVE);
+    s->state = MIG_STATE_SETUP;
+    trace_migrate_set_state(MIG_STATE_SETUP);
 
     /* This is a best 1st approximation. ns to ms */
     s->expected_downtime = max_downtime/1000000;
diff --git a/qapi-schema.json b/qapi-schema.json
index 4e9cd6d..69faaad 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -578,6 +578,12 @@ 
 #        expected downtime in milliseconds for the guest in last walk
 #        of the dirty bitmap. (since 1.3)
 #
+# @setup-time: #optional amount of setup time spent _before_ the iterations 
+#        begin but _after_ the QMP command is issued. This is designed to
+#        provide an accounting of any activities (such as RDMA pinning) which
+#        may be expensive, but do not actually occur during the iterative
+#        migration rounds themselves. (since 1.6)
+#
 # Since: 0.14.0
 ##
 { 'type': 'MigrationInfo',
@@ -586,7 +592,8 @@ 
            '*xbzrle-cache': 'XBZRLECacheStats',
            '*total-time': 'int',
            '*expected-downtime': 'int',
-           '*downtime': 'int'} }
+           '*downtime': 'int',
+           '*setup-time' : 'int'} }
 
 ##
 # @query-migrate