From patchwork Mon Feb 11 22:49:57 2013
Subject: [Qemu-devel] [RFC PATCH RDMA support v2: 6/6] send memory over RDMA as blocks are iterated
From: "Michael R. Hines" <mrhines@linux.vnet.ibm.com>
To: qemu-devel@nongnu.org
Cc: aliguori@us.ibm.com, abali@us.ibm.com, "Michael R. Hines" <mrhines@linux.vnet.ibm.com>, gokul@us.ibm.com
Date: Mon, 11 Feb 2013 17:49:57 -0500
Message-Id: <1360622997-26904-6-git-send-email-mrhines@linux.vnet.ibm.com>
In-Reply-To: <1360622997-26904-1-git-send-email-mrhines@linux.vnet.ibm.com>
References: <1360622997-26904-1-git-send-email-mrhines@linux.vnet.ibm.com>
X-Mailer: git-send-email 1.7.10.4

Signed-off-by: Michael R. Hines <mrhines@linux.vnet.ibm.com>
---
 arch_init.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 savevm.c    | 59 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 139 insertions(+), 4 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index dada6de..76092cc 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -42,6 +42,7 @@
 #include "migration/migration.h"
 #include "exec/gdbstub.h"
 #include "hw/smbios.h"
+#include "qemu/rdma.h"
 #include "exec/address-spaces.h"
 #include "hw/pcspk.h"
 #include "migration/page_cache.h"
@@ -170,6 +171,15 @@ static int is_dup_page(uint8_t *page)
     VECTYPE val = SPLAT(page);
     int i;
 
+    /*
+     * RFC RDMA: The empirical cost of searching for zero pages here,
+     *           plus the cost of communicating with the other side,
+     *           seems to take significantly more time than simply
+     *           dumping the page into remote memory.
+     */
+    if (migrate_rdma_enabled())
+        return 0;
+
     for (i = 0; i < TARGET_PAGE_SIZE / sizeof(VECTYPE); i++) {
         if (!ALL_EQ(val, p[i])) {
             return 0;
@@ -282,6 +292,44 @@ static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
     return size;
 }
 
+static size_t save_rdma_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
+                             int cont)
+{
+    size_t bytes_sent = 0;
+    ram_addr_t current_addr;
+
+    acct_info.norm_pages++;
+
+    /*
+     * use RDMA to write the page directly into remote memory
+     */
+    current_addr = block->offset + offset;
+    if (rdma_write(&rdma_mdata, current_addr,
+                   TARGET_PAGE_SIZE)) {
+        fprintf(stderr, "rdma migration: write error!\n");
+        qemu_file_set_error(f, -EIO);
+        return 0;
+    }
+
+    /*
+     * drain any write completions that have accumulated so far
+     */
+    while (1) {
+        int ret = rdma_poll(&rdma_mdata);
+        if (ret == RDMA_WRID_NONE) {
+            break;
+        }
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: polling error!\n");
+            qemu_file_set_error(f, -EIO);
+            return 0;
+        }
+    }
+
+    bytes_sent += TARGET_PAGE_SIZE;
+    return bytes_sent;
+}
+
 #define ENCODING_FLAG_XBZRLE 0x1
 
 static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
@@ -474,6 +522,8 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
             if (!last_stage) {
                 p = get_cached_data(XBZRLE.cache, current_addr);
             }
+        } else if (migrate_rdma_enabled()) {
+            bytes_sent = save_rdma_page(f, block, offset, cont);
         }
 
         /* XBZRLE overflow or normal page */
@@ -601,12 +651,14 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     return 0;
 }
 
+static int tprate = 1000;
+
 static int ram_save_iterate(QEMUFile *f, void *opaque)
 {
     int ret;
     int i;
-    int64_t t0;
-    int total_sent = 0;
+    int64_t t0, tp0;
+    int total_sent = 0, last_total_sent = 0;
 
     qemu_mutex_lock_ramlist();
 
@@ -625,23 +677,49 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
             break;
         }
         total_sent += bytes_sent;
+        last_total_sent += bytes_sent;
         acct_info.iterations++;
         /* we want to check in the 1st loop, just in case it was the 1st time
            and we had to sync the dirty bitmap.
            qemu_get_clock_ns() is a bit expensive, so we only check each some
            iterations
         */
+
+        /*
+         * RFC RDMA: Can we have something like this to periodically print
+         *           out throughput? This is just a rough sketch that
+         *           partially worked for me. I assume there is a better way
+         *           that everyone would prefer. Perhaps we could add a QMP
+         *           command that toggles a "periodic printing" option,
+         *           allowing more details to be printed on stdout?
+         */
         if ((i & 63) == 0) {
-            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
+            uint64_t curr = qemu_get_clock_ns(rt_clock);
+            uint64_t t1 = (curr - t0) / 1000000;
+            double tp;
             if (t1 > MAX_WAIT) {
                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                         t1, i);
                 break;
             }
+
+            if ((i % tprate) == 0) {
+                uint64_t tp1 = (curr - tp0) / 1000000;
+                tp = ((double) last_total_sent * 8.0 /
+                      ((double) tp1 / 1000.0)) / 1000.0 / 1000.0;
+                printf("throughput: %f mbps\n", tp);
+                last_total_sent = 0;
+                tp0 = curr;
+            }
         }
         i++;
     }
 
+    if (migrate_rdma_enabled() && rdma_write_flush(&rdma_mdata) < 0) {
+        qemu_file_set_error(f, -EIO);
+        return 0;
+    }
+
     qemu_mutex_unlock_ramlist();
 
     if (ret < 0) {
diff --git a/savevm.c b/savevm.c
index 304d1ef..4d0bef3 100644
--- a/savevm.c
+++ b/savevm.c
@@ -24,6 +24,7 @@
 
 #include "config-host.h"
 #include "qemu-common.h"
+#include "qemu/rdma.h"
 #include "hw/hw.h"
 #include "hw/qdev.h"
 #include "net/net.h"
@@ -417,7 +418,7 @@ int qemu_file_get_error(QEMUFile *f)
     return f->last_error;
 }
 
-static void qemu_file_set_error(QEMUFile *f, int ret)
+void qemu_file_set_error(QEMUFile *f, int ret)
 {
     if (f->last_error == 0) {
         f->last_error = ret;
@@ -1613,6 +1614,7 @@ int qemu_savevm_state_iterate(QEMUFile *f)
 {
     SaveStateEntry *se;
     int ret = 1;
+    static int first_time = 1;
 
     QTAILQ_FOREACH(se, &savevm_handlers, entry) {
         if (!se->ops || !se->ops->save_live_iterate) {
@@ -1643,6 +1645,30 @@ int qemu_savevm_state_iterate(QEMUFile *f)
             }
         }
         if (ret != 0) {
+#ifdef RDMA_EXTRA_SYNC
+            /*
+             * Two "sync" infiniband messages are sent during migration:
+             * one at the beginning and one at the end, just to be thorough.
+             * This is the first one.
+             */
+            if (first_time && migrate_rdma_enabled()) {
+                int r;
+                first_time = 0;
+                if (rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC)) {
+                    fprintf(stderr,
+                            "rdma migration: error posting extra send sync!\n");
+                    return -EIO;
+                }
+
+                r = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC);
+                if (r < 0) {
+                    fprintf(stderr,
+                            "rdma migration: qemu_savevm_state_iterate"
+                            " sync polling error!\n");
+                    return -EIO;
+                }
+            }
+#endif
             return ret;
         }
         ret = qemu_file_get_error(f);
@@ -1703,8 +1729,30 @@ int qemu_savevm_state_complete(QEMUFile *f)
         trace_savevm_section_end(se->section_id);
     }
 
+    /*
+     * Two "sync" infiniband messages are sent during migration:
+     * one at the beginning and one at the end, just to be thorough.
+     * This is the second one.
+     */
+    if (migrate_rdma_enabled()) {
+        if (rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_SYNC)) {
+            fprintf(stderr, "rdma migration: error posting send sync!\n");
+            return -EIO;
+        }
+    }
+
     qemu_put_byte(f, QEMU_VM_EOF);
 
+    /* wait for the RDMA sync message to complete */
+    if (migrate_rdma_enabled()) {
+        int ret = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_SYNC);
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: qemu_savevm_state_complete"
+                            " sync polling error!\n");
+            return -EIO;
+        }
+    }
+
     return qemu_file_get_error(f);
 }
 
@@ -2014,6 +2062,15 @@ int qemu_loadvm_state(QEMUFile *f)
 
     cpu_synchronize_all_post_init();
 
+    /* wait for the RDMA sync message */
+    if (migrate_rdma_enabled()) {
+        ret = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_RECV_SYNC);
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: qemu_loadvm_state"
+                            " sync polling error!\n");
+            goto out;
+        }
+    }
     ret = 0;
 
 out:
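
A note for reviewers: the hunks above call into "qemu/rdma.h" without showing
it; the header and its implementation come from the earlier patches in this
series. As a reading aid only, here is a rough sketch of the interface these
call sites assume. The function and constant names are taken verbatim from
the calls above, but the signatures, the RDMAData type, and the enum values
are guesses -- see the earlier patches for the real declarations.

    /* qemu/rdma.h -- hypothetical sketch; see patches 1-5 for the real header */
    #include <stdbool.h>
    #include <stdint.h>

    typedef struct RDMAData RDMAData;  /* opaque per-connection RDMA state */
    extern RDMAData rdma_mdata;        /* the single global the hunks use */

    /* work-request ids used to tag and wait on infiniband operations */
    enum {
        RDMA_WRID_NONE = 0,            /* rdma_poll(): completion queue empty */
        RDMA_WRID_SEND_SYNC,           /* final sync, posted around QEMU_VM_EOF */
        RDMA_WRID_RECV_SYNC,           /* destination-side sync */
        RDMA_WRID_SEND_EXTRA_SYNC,     /* early sync, only with RDMA_EXTRA_SYNC */
    };

    bool migrate_rdma_enabled(void);   /* true when migrating over RDMA */
    /* queue a write of guest memory (the real code passes a ram_addr_t) */
    int rdma_write(RDMAData *d, uint64_t guest_addr, uint64_t len);
    int rdma_write_flush(RDMAData *d); /* push any queued writes onto the wire */
    int rdma_poll(RDMAData *d);        /* reap one completion, RDMA_WRID_NONE if none */
    int rdma_post_send_sync(RDMAData *d, int wrid);  /* post a "sync" message */
    int rdma_wait_for_wrid(RDMAData *d, int wrid);   /* block until wrid completes */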
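
On the throughput question in the ram_save_iterate() hunk: the arithmetic
there is just bits sent since the last report, divided by the sampling
interval in seconds, scaled to megabits. A standalone rendering of the same
formula, with an invented helper name and example figures purely for
illustration:

    #include <stdint.h>
    #include <stdio.h>

    /* bytes moved over an interval of `ms` milliseconds -> megabits/second,
     * identical to the (i % tprate) == 0 computation above */
    static double migration_mbps(uint64_t bytes, uint64_t ms)
    {
        return ((double) bytes * 8.0 / ((double) ms / 1000.0)) / 1000.0 / 1000.0;
    }

    int main(void)
    {
        /* e.g. 512 MiB of guest RAM pushed in 4000 ms -> ~1073.7 mbps */
        printf("throughput: %f mbps\n",
               migration_mbps(512ULL << 20, 4000));
        return 0;
    }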
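
For clarity, the intended pairing of the sync messages across the two sides,
as the savevm.c hunks order them (the destination column is inferred from the
qemu_loadvm_state() hunk):

    source (qemu_savevm_state_complete)      destination (qemu_loadvm_state)
    -----------------------------------      -------------------------------
    rdma_write_flush()  [end of iterate]
    rdma_post_send_sync(SEND_SYNC)
    qemu_put_byte(f, QEMU_VM_EOF)     --->   stream parsed up to QEMU_VM_EOF
    rdma_wait_for_wrid(SEND_SYNC)            cpu_synchronize_all_post_init()
                                             rdma_wait_for_wrid(RECV_SYNC)

The optional RDMA_EXTRA_SYNC message in qemu_savevm_state_iterate() happens
once, earlier, before any of the above.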